From: Zhang Yi <[email protected]>
v1->v2:
- Fix a long standing race issue between determine hole and inserting
new delalloc extent analyzed by Jan Kara.
- Change method of adjusting hole length, instead of skip holes in
ext4_map_blocks(), now we find delalloc and correct length and type
in ext4_ext_determine_hole().
v1: https://lore.kernel.org/linux-ext4/[email protected]/
Hello, all!
I'm working on switching ext4 buffer IO from buffer_head to iomap
and enable large folio on regular file recently [1] (I've been fixing a
lot of issues and should be able to send out v2 soon), this patch set
is one of a preparation of this work. It first fix a long standing race
issue between bmap querying and adding new delalloc extents, then
correct the hole length returned by ext4_map_blocks() when user querying
map type and blocks range, after that, make this function and
ext4_set_iomap() are able to distinguish delayed allocated only mapping
from hole, finally BTW cleanup the ext4_iomap_begin_report().
This preparation patch set changes the ext4 map -> iomap converting logic
in ext4_set_iomap(), so that the later buffer IO conversion can use this
helper to connect iomap frame. This patch set is already passed
'kvm-xfstests -g auto' tests.
Thanks,
Yi.
[1] https://lore.kernel.org/linux-ext4/[email protected]/
Zhang Yi (6):
ext4: refactor ext4_da_map_blocks()
ext4: convert to exclusive lock while inserting delalloc extents
ext4: correct the hole length returned by ext4_map_blocks()
ext4: add a hole extent entry in cache after punch
ext4: make ext4_map_blocks() distinguish delalloc only extent
ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC map type
fs/ext4/ext4.h | 4 +-
fs/ext4/extents.c | 110 +++++++++++++++++++++++++++-------------------
fs/ext4/inode.c | 84 ++++++++++++-----------------------
3 files changed, 96 insertions(+), 102 deletions(-)
--
2.39.2
From: Zhang Yi <[email protected]>
ext4_da_map_blocks() only hold i_data_sem in shared mode and i_rwsem
when inserting delalloc extents, it could be raced by another querying
path of ext4_map_blocks() without i_rwsem, .e.g buffered read path.
Suppose we buffered read a file containing just a hole, and without any
cached extents tree, then it is raced by another delayed buffered write
to the same area or the near area belongs to the same hole, and the new
delalloc extent could be overwritten to a hole extent.
pread() pwrite()
filemap_read_folio()
ext4_mpage_readpages()
ext4_map_blocks()
down_read(i_data_sem)
ext4_ext_determine_hole()
//find hole
ext4_ext_put_gap_in_cache()
ext4_es_find_extent_range()
//no delalloc extent
ext4_da_map_blocks()
down_read(i_data_sem)
ext4_insert_delayed_block()
//insert delalloc extent
ext4_es_insert_extent()
//overwrite delalloc extent to hole
This race could lead to inconsistent delalloc extents tree and
incorrect reserved space counter. Fix this by converting to hold
i_data_sem in exclusive mode when adding a new delalloc extent in
ext4_da_map_blocks().
Cc: [email protected]
Signed-off-by: Zhang Yi <[email protected]>
Suggested-by: Jan Kara <[email protected]>
---
fs/ext4/inode.c | 25 +++++++++++--------------
1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5b0d3075be12..142c67f5c7fc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1703,10 +1703,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
- if (ext4_es_is_hole(&es)) {
- down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_es_is_hole(&es))
goto add_delayed;
- }
/*
* Delayed extent could be allocated by fallocate.
@@ -1748,8 +1746,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
- if (retval < 0)
- goto out_unlock;
+ if (retval < 0) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ return retval;
+ }
if (retval > 0) {
unsigned int status;
@@ -1765,24 +1765,21 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
- goto out_unlock;
+ up_read(&EXT4_I(inode)->i_data_sem);
+ return retval;
}
+ up_read(&EXT4_I(inode)->i_data_sem);
add_delayed:
- /*
- * XXX: __block_prepare_write() unmaps passed block,
- * is it OK?
- */
+ down_write(&EXT4_I(inode)->i_data_sem);
retval = ext4_insert_delayed_block(inode, map->m_lblk);
+ up_write(&EXT4_I(inode)->i_data_sem);
if (retval)
- goto out_unlock;
+ return retval;
map_bh(bh, inode->i_sb, invalid_block);
set_buffer_new(bh);
set_buffer_delay(bh);
-
-out_unlock:
- up_read((&EXT4_I(inode)->i_data_sem));
return retval;
}
--
2.39.2
From: Zhang Yi <[email protected]>
Refactor and cleanup ext4_da_map_blocks(), reduce some unnecessary
parameters and branches, no logic changes.
Signed-off-by: Zhang Yi <[email protected]>
---
fs/ext4/inode.c | 39 +++++++++++++++++----------------------
1 file changed, 17 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 61277f7f8722..5b0d3075be12 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1704,7 +1704,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
if (ext4_es_is_hole(&es)) {
- retval = 0;
down_read(&EXT4_I(inode)->i_data_sem);
goto add_delayed;
}
@@ -1749,26 +1748,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
-
-add_delayed:
- if (retval == 0) {
- int ret;
-
- /*
- * XXX: __block_prepare_write() unmaps passed block,
- * is it OK?
- */
-
- ret = ext4_insert_delayed_block(inode, map->m_lblk);
- if (ret != 0) {
- retval = ret;
- goto out_unlock;
- }
-
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
- } else if (retval > 0) {
+ if (retval < 0)
+ goto out_unlock;
+ if (retval > 0) {
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -1783,11 +1765,24 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
+ goto out_unlock;
}
+add_delayed:
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ retval = ext4_insert_delayed_block(inode, map->m_lblk);
+ if (retval)
+ goto out_unlock;
+
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+
out_unlock:
up_read((&EXT4_I(inode)->i_data_sem));
-
return retval;
}
--
2.39.2
From: Zhang Yi <[email protected]>
Add a new map flag EXT4_MAP_DELAYED to indicate the mapping range is a
delayed allocated only (not unwritten) one, and making
ext4_map_blocks() can distinguish it, no longer mixing it with holes.
Signed-off-by: Zhang Yi <[email protected]>
---
fs/ext4/ext4.h | 4 +++-
fs/ext4/extents.c | 5 +++--
fs/ext4/inode.c | 2 ++
3 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a5d784872303..55195909d32f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -252,8 +252,10 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
+#define EXT4_MAP_DELAYED BIT(BH_Delay)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
- EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_DELAYED)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0892d0568013..fc69f13cf510 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4073,9 +4073,10 @@ static void ext4_ext_determine_hole(struct inode *inode,
} else if (in_range(map->m_lblk, es.es_lblk, es.es_len)) {
/*
* Straddle the beginning of the queried range, it's no
- * longer a hole, adjust the length to the delayed extent's
- * after map->m_lblk.
+ * longer a hole, mark it is a delalloc and adjust the
+ * length to the delayed extent's after map->m_lblk.
*/
+ map->m_flags |= EXT4_MAP_DELAYED;
len = es.es_lblk + es.es_len - map->m_lblk;
goto out;
} else {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1b5e6409f958..c141bf6d8db2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -515,6 +515,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
map->m_pblk = 0;
+ map->m_flags |= ext4_es_is_delayed(&es) ?
+ EXT4_MAP_DELAYED : 0;
retval = es.es_len - (map->m_lblk - es.es_lblk);
if (retval > map->m_len)
retval = map->m_len;
--
2.39.2
From: Zhang Yi <[email protected]>
Since ext4_map_blocks() can recognize a delayed allocated only extent,
make ext4_set_iomap() can also recognize it, and remove the useless
separate check in ext4_iomap_begin_report().
Signed-off-by: Zhang Yi <[email protected]>
---
fs/ext4/inode.c | 32 +++-----------------------------
1 file changed, 3 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c141bf6d8db2..0458d7f0c059 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3261,6 +3261,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
iomap->addr = (u64) map->m_pblk << blkbits;
if (flags & IOMAP_DAX)
iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
+ } else if (map->m_flags & EXT4_MAP_DELAYED) {
+ iomap->type = IOMAP_DELALLOC;
+ iomap->addr = IOMAP_NULL_ADDR;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
@@ -3423,35 +3426,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_end = ext4_iomap_end,
};
-static bool ext4_iomap_is_delalloc(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct extent_status es;
- ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map->m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end)
- return false;
-
- if (es.es_lblk > map->m_lblk) {
- map->m_len = es.es_lblk - map->m_lblk;
- return false;
- }
-
- offset = map->m_lblk - es.es_lblk;
- map->m_len = es.es_len - offset;
-
- return true;
-}
-
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
{
int ret;
- bool delalloc = false;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3492,13 +3471,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
- if (ret == 0)
- delalloc = ext4_iomap_is_delalloc(inode, &map);
-
set_iomap:
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
- if (delalloc && iomap->type == IOMAP_HOLE)
- iomap->type = IOMAP_DELALLOC;
return 0;
}
--
2.39.2