We should add inode to the orphan list in the same transaction
as block allocation. This ensures that if we crash after a failed
block allocation and before we do a vmtruncate we don't leak block
(ie block marked as used in bitmap but not claimed by the inode).
Signed-off-by: Aneesh Kumar K.V <[email protected]>
CC: Jan Kara <[email protected]>
---
fs/ext4/inode.c | 15 +++++++++++++--
1 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e90965..28e95ee 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1426,7 +1426,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+ int ret, needed_blocks;
handle_t *handle;
int retries = 0;
struct page *page;
@@ -1437,6 +1437,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
"dev %s ino %lu pos %llu len %u flags %u",
inode->i_sb->s_id, inode->i_ino,
(unsigned long long) pos, len, flags);
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1470,14 +1475,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (ret) {
unlock_page(page);
- ext4_journal_stop(handle);
page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_mutex.
+ *
+ * Add inode to orphan list in case we crash before
+ * truncate finishes
*/
if (pos + len > inode->i_size)
+ ext4_orphan_add(handle, inode);
+
+ ext4_journal_stop(handle);
+ if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
}
--
1.6.3.rc4
In generic_perform_write if we fail to copy the user data we don't
update the inode->i_size. We should truncate the file in the above case
so that we don't have blocks allocated outside inode->i_size. Add
the inode to orphan list in the same transaction as block allocation
This ensures that if we crash in between the recovery would do the truncate.
Signed-off-by: Aneesh Kumar K.V <[email protected]>
CC: Jan Kara <[email protected]>
---
fs/ext4/inode.c | 103 +++++++++++++++++++++++++++++++++++++++++--------------
1 files changed, 77 insertions(+), 26 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28e95ee..43884e3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1507,6 +1507,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
return ext4_handle_dirty_metadata(handle, NULL, bh);
}
+static int ext4_generic_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int i_size_changed = 0;
+ struct inode *inode = mapping->host;
+ handle_t *handle = ext4_journal_current_handle();
+
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos + copied > inode->i_size) {
+ i_size_write(inode, pos + copied);
+ i_size_changed = 1;
+ }
+
+ if (pos + copied > EXT4_I(inode)->i_disksize) {
+ /* We need to mark inode dirty even if
+ * new_i_size is less that inode->i_size
+ * bu greater than i_disksize.(hint delalloc)
+ */
+ ext4_update_i_disksize(inode, (pos + copied));
+ i_size_changed = 1;
+ }
+ unlock_page(page);
+ page_cache_release(page);
+
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ ext4_mark_inode_dirty(handle, inode);
+
+ return copied;
+}
+
/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
@@ -1530,21 +1576,15 @@ static int ext4_ordered_write_end(struct file *file,
ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
- loff_t new_i_size;
-
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ext4_mark_inode_dirty(handle, inode);
- }
-
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+ ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
+ if (pos + len > inode->i_size)
+ /* if we have allocated more blocks and copied
+ * less. We will have blocks allocated outside
+ * inode->i_size. So truncate them
+ */
+ ext4_orphan_add(handle, inode);
if (ret2 < 0)
ret = ret2;
}
@@ -1552,6 +1592,9 @@ static int ext4_ordered_write_end(struct file *file,
if (!ret)
ret = ret2;
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+
return ret ? ret : copied;
}
@@ -1563,25 +1606,21 @@ static int ext4_writeback_write_end(struct file *file,
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
int ret = 0, ret2;
- loff_t new_i_size;
trace_mark(ext4_writeback_write_end,
"dev %s ino %lu pos %llu len %u copied %u",
inode->i_sb->s_id, inode->i_ino,
(unsigned long long) pos, len, copied);
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ext4_mark_inode_dirty(handle, inode);
- }
-
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+ ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
+ if (pos + len > inode->i_size)
+ /* if we have allocated more blocks and copied
+ * less. We will have blocks allocated outside
+ * inode->i_size. So truncate them
+ */
+ ext4_orphan_add(handle, inode);
+
if (ret2 < 0)
ret = ret2;
@@ -1589,6 +1628,9 @@ static int ext4_writeback_write_end(struct file *file,
if (!ret)
ret = ret2;
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+
return ret ? ret : copied;
}
@@ -1633,10 +1675,19 @@ static int ext4_journalled_write_end(struct file *file,
}
unlock_page(page);
+ page_cache_release(page);
+ if (pos + len > inode->i_size)
+ /* if we have allocated more blocks and copied
+ * less. We will have blocks allocated outside
+ * inode->i_size. So truncate them
+ */
+ ext4_orphan_add(handle, inode);
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- page_cache_release(page);
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
return ret ? ret : copied;
}
--
1.6.3.rc4
On Mon 04-05-09 14:43:01, Aneesh Kumar K.V wrote:
> In generic_perform_write if we fail to copy the user data we don't
> update the inode->i_size. We should truncate the file in the above case
> so that we don't have blocks allocated outside inode->i_size. Add
> the inode to orphan list in the same transaction as block allocation
> This ensures that if we crash in between the recovery would do the truncate.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> CC: Jan Kara <[email protected]>
This patch looks fine as well.
Acked-by: Jan Kara <[email protected]>
Honza
> ---
> fs/ext4/inode.c | 103 +++++++++++++++++++++++++++++++++++++++++--------------
> 1 files changed, 77 insertions(+), 26 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 28e95ee..43884e3 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1507,6 +1507,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
> return ext4_handle_dirty_metadata(handle, NULL, bh);
> }
>
> +static int ext4_generic_write_end(struct file *file,
> + struct address_space *mapping,
> + loff_t pos, unsigned len, unsigned copied,
> + struct page *page, void *fsdata)
> +{
> + int i_size_changed = 0;
> + struct inode *inode = mapping->host;
> + handle_t *handle = ext4_journal_current_handle();
> +
> + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
> +
> + /*
> + * No need to use i_size_read() here, the i_size
> + * cannot change under us because we hold i_mutex.
> + *
> + * But it's important to update i_size while still holding page lock:
> + * page writeout could otherwise come in and zero beyond i_size.
> + */
> + if (pos + copied > inode->i_size) {
> + i_size_write(inode, pos + copied);
> + i_size_changed = 1;
> + }
> +
> + if (pos + copied > EXT4_I(inode)->i_disksize) {
> + /* We need to mark inode dirty even if
> + * new_i_size is less that inode->i_size
> + * bu greater than i_disksize.(hint delalloc)
> + */
> + ext4_update_i_disksize(inode, (pos + copied));
> + i_size_changed = 1;
> + }
> + unlock_page(page);
> + page_cache_release(page);
> +
> + /*
> + * Don't mark the inode dirty under page lock. First, it unnecessarily
> + * makes the holding time of page lock longer. Second, it forces lock
> + * ordering of page lock and transaction start for journaling
> + * filesystems.
> + */
> + if (i_size_changed)
> + ext4_mark_inode_dirty(handle, inode);
> +
> + return copied;
> +}
> +
> /*
> * We need to pick up the new inode size which generic_commit_write gave us
> * `file' can be NULL - eg, when called from page_symlink().
> @@ -1530,21 +1576,15 @@ static int ext4_ordered_write_end(struct file *file,
> ret = ext4_jbd2_file_inode(handle, inode);
>
> if (ret == 0) {
> - loff_t new_i_size;
> -
> - new_i_size = pos + copied;
> - if (new_i_size > EXT4_I(inode)->i_disksize) {
> - ext4_update_i_disksize(inode, new_i_size);
> - /* We need to mark inode dirty even if
> - * new_i_size is less that inode->i_size
> - * bu greater than i_disksize.(hint delalloc)
> - */
> - ext4_mark_inode_dirty(handle, inode);
> - }
> -
> - ret2 = generic_write_end(file, mapping, pos, len, copied,
> + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
> page, fsdata);
> copied = ret2;
> + if (pos + len > inode->i_size)
> + /* if we have allocated more blocks and copied
> + * less. We will have blocks allocated outside
> + * inode->i_size. So truncate them
> + */
> + ext4_orphan_add(handle, inode);
> if (ret2 < 0)
> ret = ret2;
> }
> @@ -1552,6 +1592,9 @@ static int ext4_ordered_write_end(struct file *file,
> if (!ret)
> ret = ret2;
>
> + if (pos + len > inode->i_size)
> + vmtruncate(inode, inode->i_size);
> +
> return ret ? ret : copied;
> }
>
> @@ -1563,25 +1606,21 @@ static int ext4_writeback_write_end(struct file *file,
> handle_t *handle = ext4_journal_current_handle();
> struct inode *inode = mapping->host;
> int ret = 0, ret2;
> - loff_t new_i_size;
>
> trace_mark(ext4_writeback_write_end,
> "dev %s ino %lu pos %llu len %u copied %u",
> inode->i_sb->s_id, inode->i_ino,
> (unsigned long long) pos, len, copied);
> - new_i_size = pos + copied;
> - if (new_i_size > EXT4_I(inode)->i_disksize) {
> - ext4_update_i_disksize(inode, new_i_size);
> - /* We need to mark inode dirty even if
> - * new_i_size is less that inode->i_size
> - * bu greater than i_disksize.(hint delalloc)
> - */
> - ext4_mark_inode_dirty(handle, inode);
> - }
> -
> - ret2 = generic_write_end(file, mapping, pos, len, copied,
> + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
> page, fsdata);
> copied = ret2;
> + if (pos + len > inode->i_size)
> + /* if we have allocated more blocks and copied
> + * less. We will have blocks allocated outside
> + * inode->i_size. So truncate them
> + */
> + ext4_orphan_add(handle, inode);
> +
> if (ret2 < 0)
> ret = ret2;
>
> @@ -1589,6 +1628,9 @@ static int ext4_writeback_write_end(struct file *file,
> if (!ret)
> ret = ret2;
>
> + if (pos + len > inode->i_size)
> + vmtruncate(inode, inode->i_size);
> +
> return ret ? ret : copied;
> }
>
> @@ -1633,10 +1675,19 @@ static int ext4_journalled_write_end(struct file *file,
> }
>
> unlock_page(page);
> + page_cache_release(page);
> + if (pos + len > inode->i_size)
> + /* if we have allocated more blocks and copied
> + * less. We will have blocks allocated outside
> + * inode->i_size. So truncate them
> + */
> + ext4_orphan_add(handle, inode);
> +
> ret2 = ext4_journal_stop(handle);
> if (!ret)
> ret = ret2;
> - page_cache_release(page);
> + if (pos + len > inode->i_size)
> + vmtruncate(inode, inode->i_size);
>
> return ret ? ret : copied;
> }
> --
> 1.6.3.rc4
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
Did you see the comments I made to the original version of your patch
(e-mail dated April 6, 2008).
The comments can be easily found in the ext4 patchwork website:
http://patchwork.ozlabs.org/patch/25380/
http://patchwork.ozlabs.org/patch/25381/
My concern is about stray inodes on the orphan list causing a panic on
umount, and it looks like these patches increases the exposure of this
happening.
- Ted