2011-06-03 19:20:34

by Lukas Czerner

[permalink] [raw]
Subject: [PATCH 1/2] ext4: Fix max file size and logical block counting of extent format file

Kazuya Mio reported that he was able to hit BUG_ON(next == lblock)
in ext4_ext_put_gap_in_cache() while creating a sparse file in extent
format and filling the tail of the file up to its end. We will hit the
BUG_ON when we write the last block (2^32-1) into the sparse file.

The root cause of the problem lies in the fact that we specifically set
s_maxbytes so that the block at s_maxbytes fits into the on-disk extent
format, which is 32 bits long. However, we are not storing the start and
end block numbers, but rather the start block number and the length in
blocks. It means that in order to cover an extent from 0 to
EXT_MAX_BLOCK we need EXT_MAX_BLOCK+1 to fit into len (because we are
counting block 0 as well) - and it does not.

The only way to fix it without changing the meaning of the struct
ext4_extent members is, as Kazuya Mio suggested, to lower s_maxbytes
by one fs block so we can cover the whole extent we can get by the
on-disk extent format.

Also, in many places EXT_MAX_BLOCK is used as a length instead of the
maximum logical block number that the name suggests; it is all a bit
messy. So this commit renames it to EXT_MAX_BLOCKS and changes its usage
in some places to actually be the maximum number of blocks in the extent.

The bug which this commit fixes can be reproduced as follows:

dd if=/dev/zero of=/mnt/mp1/file bs=<blocksize> count=1 seek=$((2**32-2))
sync
dd if=/dev/zero of=/mnt/mp1/file bs=<blocksize> count=1 seek=$((2**32-1))

Reported-by: Kazuya Mio <[email protected]>
Signed-off-by: Lukas Czerner <[email protected]>
---
fs/ext4/ext4_extents.h | 7 +++++--
fs/ext4/extents.c | 34 +++++++++++++++++-----------------
fs/ext4/move_extent.c | 10 +++++-----
fs/ext4/super.c | 15 ++++++++++++---
4 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 2e29abb..4764146 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -133,8 +133,11 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
#define EXT_BREAK 1
#define EXT_REPEAT 2

-/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
-#define EXT_MAX_BLOCK 0xffffffff
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS 0xffffffff

/*
* EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5199bac..4157570 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1408,7 +1408,7 @@ got_index:

/*
* ext4_ext_next_allocated_block:
- * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
+ * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
* NOTE: it considers block number from index entry as
* allocated block. Thus, index entries have to be consistent
* with leaves.
@@ -1422,7 +1422,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
depth = path->p_depth;

if (depth == 0 && path->p_ext == NULL)
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;

while (depth >= 0) {
if (depth == path->p_depth) {
@@ -1439,12 +1439,12 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
depth--;
}

- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
}

/*
* ext4_ext_next_leaf_block:
- * returns first allocated block from next leaf or EXT_MAX_BLOCK
+ * returns first allocated block from next leaf or EXT_MAX_BLOCKS
*/
static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
struct ext4_ext_path *path)
@@ -1456,7 +1456,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,

/* zero-tree has no leaf blocks at all */
if (depth == 0)
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;

/* go to index block */
depth--;
@@ -1469,7 +1469,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
depth--;
}

- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
}

/*
@@ -1677,13 +1677,13 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
*/
if (b2 < b1) {
b2 = ext4_ext_next_allocated_block(path);
- if (b2 == EXT_MAX_BLOCK)
+ if (b2 == EXT_MAX_BLOCKS)
goto out;
}

/* check for wrap through zero on extent logical start block*/
if (b1 + len1 < b1) {
- len1 = EXT_MAX_BLOCK - b1;
+ len1 = EXT_MAX_BLOCKS - b1;
newext->ee_len = cpu_to_le16(len1);
ret = 1;
}
@@ -1767,7 +1767,7 @@ repeat:
fex = EXT_LAST_EXTENT(eh);
next = ext4_ext_next_leaf_block(inode, path);
if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
- && next != EXT_MAX_BLOCK) {
+ && next != EXT_MAX_BLOCKS) {
ext_debug("next leaf block - %d\n", next);
BUG_ON(npath != NULL);
npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1887,7 +1887,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
BUG_ON(func == NULL);
BUG_ON(inode == NULL);

- while (block < last && block != EXT_MAX_BLOCK) {
+ while (block < last && block != EXT_MAX_BLOCKS) {
num = last - block;
/* find extent for this block */
down_read(&EXT4_I(inode)->i_data_sem);
@@ -2020,7 +2020,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
if (ex == NULL) {
/* there is no extent yet, so gap is [0;-] */
lblock = 0;
- len = EXT_MAX_BLOCK;
+ len = EXT_MAX_BLOCKS;
ext_debug("cache gap(whole file):");
} else if (block < le32_to_cpu(ex->ee_block)) {
lblock = block;
@@ -2350,7 +2350,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
* never happen because at least one of the end points
* needs to be on the edge of the extent.
*/
- if (end == EXT_MAX_BLOCK) {
+ if (end == EXT_MAX_BLOCKS - 1) {
ext_debug(" bad truncate %u:%u\n",
start, end);
block = 0;
@@ -2398,7 +2398,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
* If this is a truncate, this condition
* should never happen
*/
- if (end == EXT_MAX_BLOCK) {
+ if (end == EXT_MAX_BLOCKS - 1) {
ext_debug(" bad truncate %u:%u\n",
start, end);
err = -EIO;
@@ -2478,7 +2478,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
* we need to remove it from the leaf
*/
if (num == 0) {
- if (end != EXT_MAX_BLOCK) {
+ if (end != EXT_MAX_BLOCKS - 1) {
/*
* For hole punching, we need to scoot all the
* extents up when an extent is removed so that
@@ -3699,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)

last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
- err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);

/* In a multi-transaction truncate, we only make the final
* transaction synchronous.
@@ -4347,8 +4347,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,

start_blk = start >> inode->i_sb->s_blocksize_bits;
last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
- if (last_blk >= EXT_MAX_BLOCK)
- last_blk = EXT_MAX_BLOCK-1;
+ if (last_blk >= EXT_MAX_BLOCKS)
+ last_blk = EXT_MAX_BLOCKS-1;
len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;

/*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2b8304b..f57455a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1002,12 +1002,12 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}

- if ((orig_start > EXT_MAX_BLOCK) ||
- (donor_start > EXT_MAX_BLOCK) ||
- (*len > EXT_MAX_BLOCK) ||
- (orig_start + *len > EXT_MAX_BLOCK)) {
+ if ((orig_start >= EXT_MAX_BLOCKS) ||
+ (donor_start >= EXT_MAX_BLOCKS) ||
+ (*len > EXT_MAX_BLOCKS) ||
+ (orig_start + *len >= EXT_MAX_BLOCKS)) {
ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
- "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cc5c157..9ea71aa 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2243,6 +2243,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
* in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
* so that won't be a limiting factor.
*
+ * However there is other limiting factor. We do store extents in the form
+ * of starting block and length, hence the resulting length of the extent
+ * covering maximum file size must fit into on-disk format containers as
+ * well. Given that length is always by 1 unit bigger than max unit (because
+ * we count 0 as well) we have to lower the s_maxbytes by one fs block.
+ *
* Note, this does *not* consider any metadata overhead for vfs i_blocks.
*/
static loff_t ext4_max_size(int blkbits, int has_huge_files)
@@ -2264,10 +2270,13 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
upper_limit <<= blkbits;
}

- /* 32-bit extent-start container, ee_block */
- res = 1LL << 32;
+ /*
+ * 32-bit extent-start container, ee_block. We lower the maxbytes
+ * by one fs block, so ee_len can cover the extent of maximum file
+ * size
+ */
+ res = (1LL << 32) - 1;
res <<= blkbits;
- res -= 1;

/* Sanity check against vm- & vfs- imposed limits */
if (res > upper_limit)
--
1.7.4.4



2011-06-03 19:20:35

by Lukas Czerner

[permalink] [raw]
Subject: [PATCH 2/2] ext4: in fiemap use FIEMAP_EXTENT_LAST flag for last extent

Currently we are not marking the extent as the last one
(FIEMAP_EXTENT_LAST) if there is a hole at the end of the file. This is
because we just do not check for it right now and continue searching for
the next extent. But at the point we hit the hole at the end of the
file, it is too late.

This commit adds a check for the allocated block in the subsequent
extent and, if there are no more extents (block = EXT_MAX_BLOCKS), just
flags the current one as the last one.

This behaviour was spotted unintentionally by xfstest 252, where the
test hangs because of a wrong loop condition. However, on other
filesystems (like xfs) it will exit anyway, because we notice the last
extent flag and exit.

With this patch xfstest 252 does not hang anymore. The ext4 fiemap
implementation still reports a bad extent type in some cases; however,
this seems to be a different issue.

Signed-off-by: Lukas Czerner <[email protected]>
---
fs/ext4/ext4_extents.h | 2 +-
fs/ext4/extents.c | 8 +++-----
2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 4764146..095c36f 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -125,7 +125,7 @@ struct ext4_ext_path {
* positive retcode - signal for ext4_ext_walk_space(), see below
* callback must return valid extent (passed or newly created)
*/
-typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
+typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
struct ext4_ext_cache *,
struct ext4_extent *, void *);

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4157570..f815cc8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1958,7 +1958,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
err = -EIO;
break;
}
- err = func(inode, path, &cbex, ex, cbdata);
+ err = func(inode, next, &cbex, ex, cbdata);
ext4_ext_drop_refs(path);

if (err < 0)
@@ -3914,14 +3914,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
/*
* Callback function called for each extent to gather FIEMAP information.
*/
-static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
struct ext4_ext_cache *newex, struct ext4_extent *ex,
void *data)
{
__u64 logical;
__u64 physical;
__u64 length;
- loff_t size;
__u32 flags = 0;
int ret = 0;
struct fiemap_extent_info *fieinfo = data;
@@ -4103,8 +4102,7 @@ found_delayed_extent:
if (ex && ext4_ext_is_uninitialized(ex))
flags |= FIEMAP_EXTENT_UNWRITTEN;

- size = i_size_read(inode);
- if (logical + length >= size)
+ if (next == EXT_MAX_BLOCKS)
flags |= FIEMAP_EXTENT_LAST;

ret = fiemap_fill_next_extent(fieinfo, logical, physical,
--
1.7.4.4


2011-06-04 00:13:38

by Allison Henderson

[permalink] [raw]
Subject: Re: [PATCH 2/2] ext4: in fiemap use FIEMAP_EXTENT_LAST flag for last extent

On 6/3/2011 12:20 PM, Lukas Czerner wrote:
> Currently we are not marking the extent as the last one
> (FIEMAP_EXTENT_LAST) if there is a hole at the end of the file. This is
> because we just do not check for it right now and continue searching for
> next extent. But at the point we hit the hole at the end of the file, it
> is too late.
>
> This commit adds check for the allocated block in subsequent extent and
> if there is no more extents (block = EXT_MAX_BLOCKS) just flag the
> current one as the last one.
>
> This behaviour has been spotted unintentionally by 252 xfstest, when the
> test hangs out, because of wrong loop condition. However on other
> filesystems (like xfs) it will exit anyway, because we notice the last
> extent flag and exit.
>
> With this patch xfstest 252 does not hang anymore, ext4 fiemap
> implementation still reports bad extent type in some cases, however
> this seems to be different issue.
>
> Signed-off-by: Lukas Czerner<[email protected]>
> ---
> fs/ext4/ext4_extents.h | 2 +-
> fs/ext4/extents.c | 8 +++-----
> 2 files changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
> index 4764146..095c36f 100644
> --- a/fs/ext4/ext4_extents.h
> +++ b/fs/ext4/ext4_extents.h
> @@ -125,7 +125,7 @@ struct ext4_ext_path {
> * positive retcode - signal for ext4_ext_walk_space(), see below
> * callback must return valid extent (passed or newly created)
> */
> -typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
> +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
> struct ext4_ext_cache *,
> struct ext4_extent *, void *);
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 4157570..f815cc8 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -1958,7 +1958,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
> err = -EIO;
> break;
> }
> - err = func(inode, path,&cbex, ex, cbdata);
> + err = func(inode, next,&cbex, ex, cbdata);
> ext4_ext_drop_refs(path);
>
> if (err< 0)
> @@ -3914,14 +3914,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> /*
> * Callback function called for each extent to gather FIEMAP information.
> */
> -static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
> +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
> struct ext4_ext_cache *newex, struct ext4_extent *ex,
> void *data)
> {
> __u64 logical;
> __u64 physical;
> __u64 length;
> - loff_t size;
> __u32 flags = 0;
> int ret = 0;
> struct fiemap_extent_info *fieinfo = data;
> @@ -4103,8 +4102,7 @@ found_delayed_extent:
> if (ex&& ext4_ext_is_uninitialized(ex))
> flags |= FIEMAP_EXTENT_UNWRITTEN;
>
> - size = i_size_read(inode);
> - if (logical + length>= size)
> + if (next == EXT_MAX_BLOCKS)
> flags |= FIEMAP_EXTENT_LAST;
>
> ret = fiemap_fill_next_extent(fieinfo, logical, physical,

Hi Lukas,

I tried this patch and it fixed the 252 hang for me too. This is a fix
I needed to continue the tests that I am working on. Thank you! :)

Allison Henderson

2011-06-06 04:09:50

by Theodore Ts'o

[permalink] [raw]