LinuxLists.cc - [PATCH 1/4] ext4: fix potential use after free during resize V2

2014-12-02 11:01:07

Subject: [PATCH 1/4] ext4: fix potential use after free during resize V2

We need some sort of synchronization while updating ->s_group_desc
because there are a lot of users which can access old ->s_group_desc
array after it was released.

changes from V1:
- use RCU instead seqcount

Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/balloc.c | 12 ++++++++----
fs/ext4/resize.c | 6 ++++--
2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 83a6f49..2d0a0de 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -282,6 +282,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
unsigned int offset;
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
+ struct buffer_head *gd_bh;
struct ext4_sb_info *sbi = EXT4_SB(sb);

if (block_group >= ngroups) {
@@ -293,7 +294,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,

group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- if (!sbi->s_group_desc[group_desc]) {
+ rcu_read_lock();
+ gd_bh = *rcu_dereference(sbi->s_group_desc) + group_desc;
+ rcu_read_unlock();
+ if (!gd_bh) {
ext4_error(sb, "Group descriptor not loaded - "
"block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
@@ -301,10 +305,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
}

desc = (struct ext4_group_desc *)(
- (__u8 *)sbi->s_group_desc[group_desc]->b_data +
- offset * EXT4_DESC_SIZE(sb));
+ (__u8 *)gd_bh->b_data + offset * EXT4_DESC_SIZE(sb));
if (bh)
- *bh = sbi->s_group_desc[group_desc];
+ *bh = gd_bh;
+
return desc;
}

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bf76f40..08c2256 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -854,8 +854,9 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
n_group_desc[gdb_num] = gdb_bh;
- EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
+ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
+ synchronize_rcu();
kvfree(o_group_desc);

le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
@@ -907,8 +908,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
n_group_desc[gdb_num] = gdb_bh;
- EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
+ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
+ synchronize_rcu();
kvfree(o_group_desc);
BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
--
1.7.1

2014-12-02 11:01:12

by Dmitry Monakhov

[permalink] [raw]

Subject: [PATCH 2/4] ext4: prevent fsreentrance deadlock for inline_data

ext4_da_convert_inline_data_to_extent() invokes grab_cache_page_write_begin().
grab_cache_page_write_begin performs memory allocation, so fs-reentrance
should be prohibited because we are inside journal transaction.

Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/inline.c | 11 ++++++-----
1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3ea6269..efdcede 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -870,6 +870,12 @@ retry_journal:
goto out_journal;
}

+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags |= AOP_FLAG_NOFS;
+
if (ret == -ENOSPC) {
ret = ext4_da_convert_inline_data_to_extent(mapping,
inode,
@@ -882,11 +888,6 @@ retry_journal:
goto out;
}

- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- flags |= AOP_FLAG_NOFS;

page = grab_cache_page_write_begin(mapping, 0, flags);
if (!page) {
--
1.7.1

2014-12-02 11:01:15

by Dmitry Monakhov

[permalink] [raw]

Subject: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

It is rediculus practice to scan inode block by block, this technique
applicable only for old indirect files. This takes signifficant amount
of time for really large files. Let's reuse ext4_fiemap which already
traverse inode-tree in most optimal meaner.

TESTCASE:
ftruncate64(fd, 0);
ftruncate64(fd, 1ULL << 40);
/* lseek will spin very long time */
lseek64(fd, 0, SEEK_DATA);
lseek64(fd, 0, SEEK_HOLE);

Original report: https://lkml.org/lkml/2014/10/16/620

##################################
BTW: Why do we need i_mutex here?

---
fs/ext4/extents.c | 4 +-
fs/ext4/file.c | 220 +++++++++++++++++++++++++---------------------------
2 files changed, 108 insertions(+), 116 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bed4308..e5d3ead 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5166,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,

/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext4_get_block);
+ return __generic_block_fiemap(inode, fieinfo, start, len,
+ ext4_get_block);

if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
return -EBADR;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8..513c12c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
* we determine this extent as a data or a hole according to whether the
* page cache has data or not.
*/
-static int ext4_find_unwritten_pgoff(struct inode *inode,
- int whence,
- struct ext4_map_blocks *map,
- loff_t *offset)
+static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
+ loff_t endoff, loff_t *offset)
{
struct pagevec pvec;
- unsigned int blkbits;
pgoff_t index;
pgoff_t end;
- loff_t endoff;
loff_t startoff;
loff_t lastoff;
int found = 0;

- blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+

index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -408,147 +403,144 @@ out:
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
- struct extent_status es;
- ext4_lblk_t start, last, end;
- loff_t dataoff, isize;
- int blkbits;
- int ret = 0;
+ struct fiemap_extent_info fie;
+ struct fiemap_extent ext[2];
+ loff_t next;
+ int i, ret = 0;

mutex_lock(&inode->i_mutex);
-
- isize = i_size_read(inode);
- if (offset >= isize) {
+ if (offset >= inode->i_size) {
mutex_unlock(&inode->i_mutex);
return -ENXIO;
}
-
- blkbits = inode->i_sb->s_blocksize_bits;
- start = offset >> blkbits;
- last = start;
- end = isize >> blkbits;
- dataoff = offset;
-
- do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
+ fie.fi_flags = 0;
+ fie.fi_extents_max = 2;
+ fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
+ while (1) {
+ mm_segment_t old_fs = get_fs();
+
+ fie.fi_extents_mapped = 0;
+ memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
+
+ set_fs(get_ds());
+ ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
+ set_fs(old_fs);
+ if (ret)
break;
- }

- /*
- * If there is a delay extent at this offset,
- * it will be as a data.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
+ /* No extents found, EOF */
+ if (!fie.fi_extents_mapped) {
+ ret = -ENXIO;
break;
}
+ for (i = 0; i < fie.fi_extents_mapped; i++) {
+ next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);

- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- &map, &dataoff);
- if (unwritten)
- break;
- }
+ if (offset < (loff_t)ext[i].fe_logical)
+ offset = (loff_t)ext[i].fe_logical;
+ /*
+ * If extent is not unwritten, then it contains valid
+ * data, mapped or delayed.
+ */
+ if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
+ goto out;

- last++;
- dataoff = (loff_t)last << blkbits;
- } while (last <= end);
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ next, &offset))
+ goto out;

+ if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
+ ret = -ENXIO;
+ goto out;
+ }
+ offset = next;
+ }
+ }
+ if (offset > inode->i_size)
+ offset = inode->i_size;
+out:
mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;

- if (dataoff > isize)
- return -ENXIO;
-
- return vfs_setpos(file, dataoff, maxsize);
+ return vfs_setpos(file, offset, maxsize);
}

/*
- * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE
*/
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
- struct extent_status es;
- ext4_lblk_t start, last, end;
- loff_t holeoff, isize;
- int blkbits;
- int ret = 0;
+ struct fiemap_extent_info fie;
+ struct fiemap_extent ext[2];
+ loff_t next;
+ int i, ret = 0;

mutex_lock(&inode->i_mutex);
-
- isize = i_size_read(inode);
- if (offset >= isize) {
+ if (offset >= inode->i_size) {
mutex_unlock(&inode->i_mutex);
return -ENXIO;
}

- blkbits = inode->i_sb->s_blocksize_bits;
- start = offset >> blkbits;
- last = start;
- end = isize >> blkbits;
- holeoff = offset;
+ fie.fi_flags = 0;
+ fie.fi_extents_max = 2;
+ fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
+ while (1) {
+ mm_segment_t old_fs = get_fs();

- do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
- continue;
- }
+ fie.fi_extents_mapped = 0;
+ memset(ext, 0, sizeof(*ext));

- /*
- * If there is a delay extent at this offset,
- * we will skip this extent.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- last = es.es_lblk + es.es_len;
- holeoff = (loff_t)last << blkbits;
- continue;
- }
+ set_fs(get_ds());
+ ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
+ set_fs(old_fs);
+ if (ret)
+ break;

- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- &map, &holeoff);
- if (!unwritten) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
+ /* No extents found */
+ if (!fie.fi_extents_mapped)
+ break;
+
+ for (i = 0; i < fie.fi_extents_mapped; i++) {
+ next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
+ /*
+ * If extent is not unwritten, then it contains valid
+ * data, mapped or delayed.
+ */
+ if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
+ if (offset < (loff_t)ext[i].fe_logical)
+ goto out;
+ offset = next;
continue;
}
- }
-
- /* find a hole */
- break;
- } while (last <= end);
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ next, &offset))
+ goto out;

+ offset = next;
+ if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
+ goto out;
+ }
+ }
+ if (offset > inode->i_size)
+ offset = inode->i_size;
+out:
mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;

- if (holeoff > isize)
- holeoff = isize;

2014-12-02 11:01:13

by Dmitry Monakhov

[permalink] [raw]

Subject: [PATCH 3/4] ext4: ext4_inline_data_fiemap should respect callers argument

Currently ext4_inline_data_fiemap ignores requested arguments (start and len)
which may lead endless loop if start != 0. Also fix incorrect extent length
determination.

Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/ext4.h | 2 +-
fs/ext4/extents.c | 3 ++-
fs/ext4/inline.c | 18 ++++++++++++------
3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e2eba82..29c43e7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2638,7 +2638,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
int *retval);
extern int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
- int *has_inline);
+ int *has_inline, __u64 start, __u64 len);
extern int ext4_try_to_evict_inline_data(handle_t *handle,
struct inode *inode,
int needed);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c3a1fa1..bed4308 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5151,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ext4_has_inline_data(inode)) {
int has_inline = 1;

- error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
+ error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
+ start, len);

if (has_inline)
return error;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index efdcede..d3d8192 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1808,11 +1808,12 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)

int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
- int *has_inline)
+ int *has_inline, __u64 start, __u64 len)
{
__u64 physical = 0;
- __u64 length;
- __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
+ __u64 inline_len;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+ FIEMAP_EXTENT_LAST;
int error = 0;
struct ext4_iloc iloc;

@@ -1821,6 +1822,12 @@ int ext4_inline_data_fiemap(struct inode *inode,
*has_inline = 0;
goto out;
}
+ inline_len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
+ if (start >= inline_len)
+ goto out;
+ if (start + len < inline_len)
+ inline_len = start + len;
+ inline_len -= start;

error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -1829,11 +1836,10 @@ int ext4_inline_data_fiemap(struct inode *inode,
physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
- length = i_size_read(inode);

if (physical)
- error = fiemap_fill_next_extent(fieinfo, 0, physical,
- length, flags);
+ error = fiemap_fill_next_extent(fieinfo, start, physical,
+ inline_len, flags);
brelse(iloc.bh);
out:
up_read(&EXT4_I(inode)->xattr_sem);
--
1.7.1

2014-12-02 11:13:18

by Dmitry Monakhov

[permalink] [raw]

Subject: Re: [PATCH 1/4] ext4: fix potential use after free during resize V2

Dmitry Monakhov <[email protected]> writes:

> We need some sort of synchronization while updating ->s_group_desc
> because there are a lot of users which can access old ->s_group_desc
> array after it was released.
This patch supersedes V1 (commit from tytso.git/dev : 6e77765ea74a18a8bbd)
Patch-set was tested via xfstests-bld -c inline_data -g auto, but
w/o metadata_csum feature because it triggers another csum related bug which
should be fixed separately.
>
> changes from V1:
> - use RCU instead seqcount
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/balloc.c | 12 ++++++++----
> fs/ext4/resize.c | 6 ++++--
> 2 files changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 83a6f49..2d0a0de 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -282,6 +282,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
> unsigned int offset;
> ext4_group_t ngroups = ext4_get_groups_count(sb);
> struct ext4_group_desc *desc;
> + struct buffer_head *gd_bh;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
>
> if (block_group >= ngroups) {
> @@ -293,7 +294,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
>
> group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
> offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
> - if (!sbi->s_group_desc[group_desc]) {
> + rcu_read_lock();
> + gd_bh = *rcu_dereference(sbi->s_group_desc) + group_desc;
> + rcu_read_unlock();
> + if (!gd_bh) {
> ext4_error(sb, "Group descriptor not loaded - "
> "block_group = %u, group_desc = %u, desc = %u",
> block_group, group_desc, offset);
> @@ -301,10 +305,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
> }
>
> desc = (struct ext4_group_desc *)(
> - (__u8 *)sbi->s_group_desc[group_desc]->b_data +
> - offset * EXT4_DESC_SIZE(sb));
> + (__u8 *)gd_bh->b_data + offset * EXT4_DESC_SIZE(sb));
> if (bh)
> - *bh = sbi->s_group_desc[group_desc];
> + *bh = gd_bh;
> +
> return desc;
> }
>
> diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
> index bf76f40..08c2256 100644
> --- a/fs/ext4/resize.c
> +++ b/fs/ext4/resize.c
> @@ -854,8 +854,9 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
> memcpy(n_group_desc, o_group_desc,
> EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
> n_group_desc[gdb_num] = gdb_bh;
> - EXT4_SB(sb)->s_group_desc = n_group_desc;
> EXT4_SB(sb)->s_gdb_count++;
> + rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
> + synchronize_rcu();
> kvfree(o_group_desc);
>
> le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
> @@ -907,8 +908,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
> memcpy(n_group_desc, o_group_desc,
> EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
> n_group_desc[gdb_num] = gdb_bh;
> - EXT4_SB(sb)->s_group_desc = n_group_desc;
> EXT4_SB(sb)->s_gdb_count++;
> + rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
> + synchronize_rcu();
> kvfree(o_group_desc);
> BUFFER_TRACE(gdb_bh, "get_write_access");
> err = ext4_journal_get_write_access(handle, gdb_bh);
> --
> 1.7.1

Attachments:

signature.asc (472.00 B)

2014-12-02 23:06:59

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 1/4] ext4: fix potential use after free during resize V2

On Tue, Dec 02, 2014 at 03:00:51PM +0400, Dmitry Monakhov wrote:
> We need some sort of synchronization while updating ->s_group_desc
> because there are a lot of users which can access old ->s_group_desc
> array after it was released.
>
> changes from V1:
> - use RCU instead seqcount
>
> Signed-off-by: Dmitry Monakhov <[email protected]>

Thanks, applied

- Ted

2014-12-02 23:07:08

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 2/4] ext4: prevent fsreentrance deadlock for inline_data

On Tue, Dec 02, 2014 at 03:00:52PM +0400, Dmitry Monakhov wrote:
> ext4_da_convert_inline_data_to_extent() invokes grab_cache_page_write_begin().
> grab_cache_page_write_begin performs memory allocation, so fs-reentrance
> should be prohibited because we are inside journal transaction.
>
> Signed-off-by: Dmitry Monakhov <[email protected]>

Thanks, applied.

- Ted

2014-12-02 23:07:18

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 3/4] ext4: ext4_inline_data_fiemap should respect callers argument

On Tue, Dec 02, 2014 at 03:00:53PM +0400, Dmitry Monakhov wrote:
> Currently ext4_inline_data_fiemap ignores requested arguments (start and len)
> which may lead endless loop if start != 0. Also fix incorrect extent length
> determination.
>
> Signed-off-by: Dmitry Monakhov <[email protected]>

Thanks, applied.

- Ted

2014-12-02 23:07:31

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

On Tue, Dec 02, 2014 at 03:00:54PM +0400, Dmitry Monakhov wrote:
> It is rediculus practice to scan inode block by block, this technique
> applicable only for old indirect files. This takes signifficant amount
> of time for really large files. Let's reuse ext4_fiemap which already
> traverse inode-tree in most optimal meaner.
>
> TESTCASE:
> ftruncate64(fd, 0);
> ftruncate64(fd, 1ULL << 40);
> /* lseek will spin very long time */
> lseek64(fd, 0, SEEK_DATA);
> lseek64(fd, 0, SEEK_HOLE);

Thanks, applied.

- Ted

2014-12-11 20:05:45

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

Hi Dmitry,

I only noticed this after I sent the pull request to Linus, but it
looks like this patch is triggering regression using the ext3 config:

./kvm-xfstests -c ext3 generic/285:

The failure reported by seek_santy_test is:

10. Test a huge file for offset overflow
10.01 SEEK_HOLE expected 65536 or 0, got 0. FAIL
10.02 SEEK_HOLE expected 65536 or 0, got 1. FAIL
10.03 SEEK_DATA expected 0 or 0, got -1. FAIL
10.04 SEEK_DATA expected 1 or 1, got -1. FAIL
10.05 SEEK_HOLE expected 0 or 0, got -65536. FAIL
10.06 SEEK_DATA expected -65536 or -65536, got -1. FAIL
10.07 SEEK_DATA expected -65535 or -65535, got -1. FAIL
10.08 SEEK_DATA expected -65536 or -65536, got -1. FAIL

What's strange is that if I run the commands by hand, I get a very
different failure:

root@kvm-xfstests:~# ./xfstests/src/seek_sanity_test /vdd/seek_sanity_testfile
File system magic#: 0xef53
Allocation size: 4096
Kernel does not support llseek(2) extensions SEEK_HOLE and/or SEEK_DATA. Aborting.

Using strace, the problem is that the SEEK_DATA fallocate is failing:

pwrite64(3, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"..., 8192, 0) = 8192
ftruncate64(3, 16384) = 0
_llseek(3, 0, 0xbff7fe70, SEEK_DATA) = -1 ENXIO (No such device or address)

This fails with commit 14516bb: "ext4: fix suboptimal seek_{data,hole}
extents traversial" and succeeds with its immediate predesssor commit.

I've tried looking at this, but hte fact that I'm seeing different
results when I run it by hand (sometimes I can trigger the failure
with runtests.sh, usually I can't), means that it appears to be timing
dependent.

Could you take a look?

Many thanks!!

- Ted

2014-12-12 08:53:03

by Dmitry Monakhov

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

Theodore Ts'o <[email protected]> writes:

> Hi Dmitry,
>
> I only noticed this after I sent the pull request to Linus, but it
> looks like this patch is triggering regression using the ext3 config:
>
> ./kvm-xfstests -c ext3 generic/285:
>
> The failure reported by seek_santy_test is:
>
> 10. Test a huge file for offset overflow
> 10.01 SEEK_HOLE expected 65536 or 0, got 0. FAIL
> 10.02 SEEK_HOLE expected 65536 or 0, got 1. FAIL
> 10.03 SEEK_DATA expected 0 or 0, got -1. FAIL
> 10.04 SEEK_DATA expected 1 or 1, got -1. FAIL
> 10.05 SEEK_HOLE expected 0 or 0, got -65536. FAIL
> 10.06 SEEK_DATA expected -65536 or -65536, got -1. FAIL
> 10.07 SEEK_DATA expected -65535 or -65535, got -1. FAIL
> 10.08 SEEK_DATA expected -65536 or -65536, got -1. FAIL
>
> What's strange is that if I run the commands by hand, I get a very
> different failure:
>
> root@kvm-xfstests:~# ./xfstests/src/seek_sanity_test /vdd/seek_sanity_testfile
> File system magic#: 0xef53
> Allocation size: 4096
> Kernel does not support llseek(2) extensions SEEK_HOLE and/or SEEK_DATA. Aborting.
>
> Using strace, the problem is that the SEEK_DATA fallocate is failing:
>
> pwrite64(3, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"..., 8192, 0) = 8192
> ftruncate64(3, 16384) = 0
> _llseek(3, 0, 0xbff7fe70, SEEK_DATA) = -1 ENXIO (No such device or address)
Oh. Indeed That is my mistake.

ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
/* No extents found, EOF */
if (!fie.fi_extents_mapped) {
ret = -ENXIO;
break;
}
Delalloc case handled incorrectly. Will fix that ASAP.

>
> This fails with commit 14516bb: "ext4: fix suboptimal seek_{data,hole}
> extents traversial" and succeeds with its immediate predesssor commit.
>
> I've tried looking at this, but hte fact that I'm seeing different
> results when I run it by hand (sometimes I can trigger the failure
> with runtests.sh, usually I can't), means that it appears to be timing
> dependent.
>
> Could you take a look?
>
> Many thanks!!
>
> - Ted
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

Attachments:

signature.asc (472.00 B)

2014-12-17 03:57:16

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

On Fri, Dec 12, 2014 at 11:52:29AM +0300, Dmitry Monakhov wrote:
> Oh. Indeed That is my mistake.
>
> ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
> /* No extents found, EOF */
> if (!fie.fi_extents_mapped) {
> ret = -ENXIO;
> break;
> }
> Delalloc case handled incorrectly. Will fix that ASAP.

Hi Dmitry, how are things going with a fix? It would be great if I
can get something which I can test and push to Linus before he
releases -rc1.

Thanks!!

- Ted

2014-12-17 15:07:08

by Dmitry Monakhov

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

Theodore Ts'o <[email protected]> writes:

> On Fri, Dec 12, 2014 at 11:52:29AM +0300, Dmitry Monakhov wrote:
>> Oh. Indeed That is my mistake.
>>
>> ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
>> /* No extents found, EOF */
>> if (!fie.fi_extents_mapped) {
>> ret = -ENXIO;
>> break;
>> }
>> Delalloc case handled incorrectly. Will fix that ASAP.
>
> Hi Dmitry, how are things going with a fix? It would be great if I
> can get something which I can test and push to Linus before he
> releases -rc1.
I'm working on that. Sorry for delay. Delay is caused by confusion of
existing seek_xxx implementation (even w/o my changes)
For example: it is not obvious to believe that if bh is (bh_unwritten |
bh_uptodate) then it contains valid data. IMHO it is just a fallocated
area. I already have simple fix for actual regression. But I try to
review,optimize and retest all related logic.

FYI: Currently test failed only on EXT4 w/o extents but w/ -odelalloc enabled.
This configuration is absent in xfstest-bld where conf/ext3 is EXT4 w/o
extents and w/ nodelalloc. That is why we oversee this from very beginning.
>
> Thanks!!
>
> - Ted

Attachments:

signature.asc (472.00 B)

2014-12-18 02:39:22

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

On Wed, Dec 17, 2014 at 06:06:27PM +0300, Dmitry Monakhov wrote:
> FYI: Currently test failed only on EXT4 w/o extents but w/ -odelalloc enabled.
> This configuration is absent in xfstest-bld where conf/ext3 is EXT4 w/o
> extents and w/ nodelalloc. That is why we oversee this from very beginning.

Hmm.. I definitely found this because it was failing with the
conf/ext3 configuration for me....

- Ted

2014-12-27 15:39:25

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

On Wed, Dec 17, 2014 at 06:06:27PM +0300, Dmitry Monakhov wrote:
> I'm working on that. Sorry for delay. Delay is caused by confusion of
> existing seek_xxx implementation (even w/o my changes)
> For example: it is not obvious to believe that if bh is (bh_unwritten |
> bh_uptodate) then it contains valid data. IMHO it is just a fallocated
> area. I already have simple fix for actual regression. But I try to
> review,optimize and retest all related logic.

Hi Dmitry,

Any update on how your testing is going? I was wondering if I could
take a look at your fix just so I can understand what is going on.

> FYI: Currently test failed only on EXT4 w/o extents but w/ -odelalloc enabled.
> This configuration is absent in xfstest-bld where conf/ext3 is EXT4 w/o
> extents and w/ nodelalloc. That is why we oversee this from very beginning.

One of the reasons why I'm having trouble understanding what is
happening is that I can *only* reproduce the test if I use

./kvm-xfstests -c ext3 generic/285

If I run it by hand, I get this instead:

# xfstests/src/seek_sanity_test /mnt/test
File system magic#: 0xef53
Allocation size: 4096
Kernel does not support llseek(2) extensions SEEK_HOLE and/or SEEK_DATA. Aborting.

... which is what I would expect, since this test should be refusing
to run in the first place if extents aren't available. Also, if
extents aren't available, then we shouldn't be seeing *any* fallocated
space, so if the fault is in handling (bh_unwritten | bh_uptodate),
then this case shouldn't be happening at all on a file system w/o
extents, right?

One of the reasons why I am worrying is now that it's post -rc1, more
people will be testing out the kernel, and I'm concerned that I don't
understand what the risk exposure might be, especially since there are
programs like /bin/cp that assume SEEK_DATA and SEEK_HOLE work
correctly, and early testers might suffer data loss if they aren't
working correctly.

So I'm trying to determine whether this is something where we
understand what is going on so we can fix the bug, or whether I'm
better off reverting the commit and waiting until we have fully
characterized the bug.

Thanks,

- Ted

2014-12-28 18:55:51

by Dmitry Monakhov

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

Theodore Ts'o <[email protected]> writes:

> On Wed, Dec 17, 2014 at 06:06:27PM +0300, Dmitry Monakhov wrote:
>> I'm working on that. Sorry for delay. Delay is caused by confusion of
>> existing seek_xxx implementation (even w/o my changes)
>> For example: it is not obvious to believe that if bh is (bh_unwritten |
>> bh_uptodate) then it contains valid data. IMHO it is just a fallocated
>> area. I already have simple fix for actual regression. But I try to
>> review,optimize and retest all related logic.
>
> Hi Dmitry,
>
> Any update on how your testing is going? I was wondering if I could
> take a look at your fix just so I can understand what is going on.
Yes. I've already sent you updated patches
https://patchwork.ozlabs.org/patch/422594/
https://patchwork.ozlabs.org/patch/422595/
>
>> FYI: Currently test failed only on EXT4 w/o extents but w/ -odelalloc enabled.
>> This configuration is absent in xfstest-bld where conf/ext3 is EXT4 w/o
>> extents and w/ nodelalloc. That is why we oversee this from very beginning.
>
> One of the reasons why I'm having trouble understanding what is
> happening is that I can *only* reproduce the test if I use
>
> ./kvm-xfstests -c ext3 generic/285
>
> If I run it by hand, I get this instead:
>
> # xfstests/src/seek_sanity_test /mnt/test
> File system magic#: 0xef53
> Allocation size: 4096
> Kernel does not support llseek(2) extensions SEEK_HOLE and/or SEEK_DATA. Aborting.
>
> ... which is what I would expect, since this test should be refusing
> to run in the first place if extents aren't available. Also, if
> extents aren't available, then we shouldn't be seeing *any* fallocated
> space, so if the fault is in handling (bh_unwritten | bh_uptodate),
> then this case shouldn't be happening at all on a file system w/o
> extents, right?
>
> One of the reasons why I am worrying is now that it's post -rc1, more
> people will be testing out the kernel, and I'm concerned that I don't
> understand what the risk exposure might be, especially since there are
> programs like /bin/cp that assume SEEK_DATA and SEEK_HOLE work
> correctly, and early testers might suffer data loss if they aren't
> working correctly.
Agree. This bug us serious because may result in bad data lose
for backup/copy software which use this API.
So if you see any failures on my updated patches please let me know.
>
> So I'm trying to determine whether this is something where we
> understand what is going on so we can fix the bug, or whether I'm
> better off reverting the commit and waiting until we have fully
> characterized the bug.
>
> Thanks,
>
> - Ted

2014-12-29 04:13:10

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

On Sun, Dec 28, 2014 at 10:55:46PM +0400, Dmitry Monakhov wrote:
> Yes. I've already sent you updated patches
> https://patchwork.ozlabs.org/patch/422594/
> https://patchwork.ozlabs.org/patch/422595/

Thanks, this somehow got lost from my inbox. I'm not sure how it
happened, but I do see it in patchwork.

- Ted

2015-01-02 20:03:08

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

On Sun, Dec 28, 2014 at 11:13:10PM -0500, Theodore Ts'o wrote:
> On Sun, Dec 28, 2014 at 10:55:46PM +0400, Dmitry Monakhov wrote:
> > Yes. I've already sent you updated patches
> > https://patchwork.ozlabs.org/patch/422594/
> > https://patchwork.ozlabs.org/patch/422595/
>
> Thanks, this somehow got lost from my inbox. I'm not sure how it
> happened, but I do see it in patchwork.

I tried applying both of these patches, but I'm still seeing a failure:

% git log --oneline -2
e195ba5 ext4: fix seek_data for indirect layout
dd634e3 ext4: fix seek_data cleanup

# uname -a
Linux kvm-xfstests 3.18.0-rc3-00003-ge195ba5 #2548 SMP Fri Jan 2 14:35:32 EST 2015 i686 GNU/Linux
# mke2fs -t ext4 -O ^extent,^flex_bg,^uninit_bg -Fq /dev/vdd
/dev/vdd contains a ext4 file system
last mounted on /vdd on Fri Jan 2 14:49:58 2015
# mount -t ext4 -o nodelalloc /dev/vdd /vdd
# xfstests/src/seek_sanity_test /vdd/seek_sanity_testfile
File system magic#: 0xef53
Allocation size: 4096

...
10. Test a huge file for offset overflow
10.01 SEEK_HOLE expected 65536 or 0, got 0. FAIL
10.02 SEEK_HOLE expected 65536 or 0, got 1. FAIL
10.03 SEEK_DATA expected 0 or 0, got -1. FAIL
10.04 SEEK_DATA expected 1 or 1, got -1. FAIL
10.05 SEEK_HOLE expected 0 or 0, got -65536. FAIL
10.06 SEEK_DATA expected -65536 or -65536, got -1. FAIL
10.07 SEEK_DATA expected -65535 or -65535, got -1. FAIL
10.08 SEEK_DATA expected -65536 or -65536, got -1. FAIL

I'm getting the exact same failure with a kernel compiled with

# CONFIG_EXT3_FS is not set
CONFIG_EXT4_USE_FOR_EXT23=y

and then using "mke2fs -t ext3 -Fq /dev/vdd" and "mount -t ext3
/dev/vdd /vdd" and I'm seeing the same failure. Which really worries
me, since that's something that many more users will be using.

At this point I'm beginning to think that we would be better off
reverting 14516bb7bb6 until we can figure out what is going on. I'm
really wondering why you're not seeing this failure, but I am.

- Ted

2015-01-03 19:16:17

by Dmitry Monakhov

[permalink] [raw]

Subject: Re: [PATCH 4/4] ext4: fix suboptimal seek_{data,hole} extents traversial

Theodore Ts'o <[email protected]> writes:

> On Sun, Dec 28, 2014 at 11:13:10PM -0500, Theodore Ts'o wrote:
>> On Sun, Dec 28, 2014 at 10:55:46PM +0400, Dmitry Monakhov wrote:
>> > Yes. I've already sent you updated patches
>> > https://patchwork.ozlabs.org/patch/422594/
>> > https://patchwork.ozlabs.org/patch/422595/
>>
>> Thanks, this somehow got lost from my inbox. I'm not sure how it
>> happened, but I do see it in patchwork.
>
> I tried applying both of these patches, but I'm still seeing a failure:
>
> % git log --oneline -2
> e195ba5 ext4: fix seek_data for indirect layout
> dd634e3 ext4: fix seek_data cleanup
>
> # uname -a
> Linux kvm-xfstests 3.18.0-rc3-00003-ge195ba5 #2548 SMP Fri Jan 2 14:35:32 EST 2015 i686 GNU/Linux
> # mke2fs -t ext4 -O ^extent,^flex_bg,^uninit_bg -Fq /dev/vdd
> /dev/vdd contains a ext4 file system
> last mounted on /vdd on Fri Jan 2 14:49:58 2015
> # mount -t ext4 -o nodelalloc /dev/vdd /vdd
> # xfstests/src/seek_sanity_test /vdd/seek_sanity_testfile
> File system magic#: 0xef53
> Allocation size: 4096
>
> ...
> 10. Test a huge file for offset overflow
> 10.01 SEEK_HOLE expected 65536 or 0, got 0. FAIL
> 10.02 SEEK_HOLE expected 65536 or 0, got 1. FAIL
> 10.03 SEEK_DATA expected 0 or 0, got -1. FAIL
> 10.04 SEEK_DATA expected 1 or 1, got -1. FAIL
> 10.05 SEEK_HOLE expected 0 or 0, got -65536. FAIL
> 10.06 SEEK_DATA expected -65536 or -65536, got -1. FAIL
> 10.07 SEEK_DATA expected -65535 or -65535, got -1. FAIL
> 10.08 SEEK_DATA expected -65536 or -65536, got -1. FAIL
>
> I'm getting the exact same failure with a kernel compiled with
>
> # CONFIG_EXT3_FS is not set
> CONFIG_EXT4_USE_FOR_EXT23=y
>
> and then using "mke2fs -t ext3 -Fq /dev/vdd" and "mount -t ext3
> /dev/vdd /vdd" and I'm seeing the same failure. Which really worries
> me, since that's something that many more users will be using.
>
> At this point I'm beginning to think that we would be better off
> reverting 14516bb7bb6 until we can figure out what is going on. I'm
> really wondering why you're not seeing this failure, but I am.
Crap. I do not understand why I cant not reproduce this.
I'm out of my normal dev environment for couple of days,
so patch reverting looks reasonable. But please add code which
break the loop on signal because otherwise this result in DOS for huge file
#touch file
#truncate -s 1T file
#./seek_hole file

>
> - Ted