Currently in ext4_can_extents_be_merged(), if one file has unwritten
extents under io, we will not merge any other unwritten extents, even
they are not in range of those unwritten extents under io. This limit
is coarse, indeed we can merge these unwritten extents that are not
under io.
Here add a new ES_IO_B flag to track unwritten extents under io in
extents status tree. When we try to merge unwritten extents, search
given extents in extents status tree, if not found, then we can merge
these unwritten extents.
Note currently we only track unwritten extents under io.
Signed-off-by: Xiaoguang Wang <[email protected]>
---
fs/ext4/extents.c | 24 +++++++++++++++++++++--
fs/ext4/extents_status.c | 41 ++++++++++++++++++++++++++++++++++++++++
fs/ext4/extents_status.h | 12 +++++++++++-
fs/ext4/inode.c | 28 ++++++++++++++++++++++++++-
4 files changed, 101 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 240b6dea5441..444c739470a5 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1713,6 +1713,25 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
return err;
}
+static int ext4_unwritten_extent_under_io(struct inode *inode,
+ ext4_lblk_t start, unsigned int len)
+{
+ /*
+ * The check for IO to unwritten extent is somewhat racy as we
+ * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
+ * dropping i_data_sem. But reserved blocks should save us in that
+ * case.
+ */
+ if (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)
+ return 0;
+
+ if (ext4_es_scan_range(inode, &ext4_es_is_under_io, start,
+ start + len - 1))
+ return 1;
+
+ return 0;
+}
+
int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
@@ -1744,8 +1763,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
*/
if (ext4_ext_is_unwritten(ex1) &&
(ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
- atomic_read(&EXT4_I(inode)->i_unwritten) ||
- (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+ (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) ||
+ ext4_unwritten_extent_under_io(inode, le32_to_cpu(ex1->ee_block),
+ ext1_ee_len + ext2_ee_len)))
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 2b439afafe13..1262184dac29 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -860,6 +860,38 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
return err;
}
+/*
+ * When writing unwritten extents, we mark EXTENT_STATUS_IO in es,
+ * but if some errors happen and stop submitting IO, also need to
+ * clear EXTENT_STATUS_IO flag.
+ */
+int ext4_es_clear_io_status(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end, ext4_fsblk_t block)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct extent_status *es;
+ unsigned int status;
+ int err = 0;
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ es = __es_tree_search(&tree->root, lblk);
+ if (!es || es->es_lblk > end) {
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+ return err;
+ }
+ status = ext4_es_type(es);
+ status &= ~EXTENT_STATUS_IO;
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ /*
+ * Note ext4_es_insert_extent will remove es firstly and insert new es
+ * with new status without EXTENT_STATUS_IO.
+ */
+ err = ext4_es_insert_extent(inode, lblk, end - lblk + 1, block, status);
+ return err;
+}
+
+
/*
* ext4_es_cache_extent() inserts information into the extent status
* tree if and only if there isn't information about the range in
@@ -1332,6 +1364,15 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
*/
if (ext4_es_is_delayed(es))
goto next;
+
+ /*
+ * We don't reclaim unwritten extent under io because we use
+ * it to check whether we can merge other unwritten extents
+ * who are not under io, and when io completes, then we can
+ * reclaim this extent.
+ */
+ if (ext4_es_is_under_io(es))
+ goto next;
if (ext4_es_is_referenced(es)) {
ext4_es_clear_referenced(es);
goto next;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 131a8b7df265..0452a98af90d 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -36,6 +36,7 @@ enum {
ES_DELAYED_B,
ES_HOLE_B,
ES_REFERENCED_B,
+ ES_IO_B,
ES_FLAGS
};
@@ -47,11 +48,13 @@ enum {
#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B)
#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B)
+#define EXTENT_STATUS_IO (1 << ES_IO_B)
#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
- EXTENT_STATUS_HOLE) << ES_SHIFT)
+ EXTENT_STATUS_HOLE | \
+ EXTENT_STATUS_IO) << ES_SHIFT)
struct ext4_sb_info;
struct ext4_extent;
@@ -147,6 +150,8 @@ extern bool ext4_es_scan_range(struct inode *inode,
extern bool ext4_es_scan_clu(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk);
+extern int ext4_es_clear_io_status(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end, ext4_fsblk_t block);
static inline unsigned int ext4_es_status(struct extent_status *es)
{
@@ -173,6 +178,11 @@ static inline int ext4_es_is_delayed(struct extent_status *es)
return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
}
+static inline int ext4_es_is_under_io(struct extent_status *es)
+{
+ return (ext4_es_type(es) & EXTENT_STATUS_IO) != 0;
+}
+
static inline int ext4_es_is_hole(struct extent_status *es)
{
return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 22a9d8159720..ba557a731081 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -704,6 +704,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
+ /*
+ * Track unwritten extent under io. When io completes, we'll
+ * convert unwritten extent to written, ext4_es_insert_extent()
+ * will be called again to insert this written extent, then
+ * EXTENT_STATUS_IO will be cleared automatically, see remove
+ * logic in ext4_es_insert_extent().
+ */
+ if ((status & EXTENT_STATUS_UNWRITTEN) && (flags &
+ EXT4_GET_BLOCKS_IO_SUBMIT))
+ status |= EXTENT_STATUS_IO;
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
if (ret < 0) {
@@ -2526,6 +2536,8 @@ static int mpage_map_and_submit_extent(handle_t *handle,
int err;
loff_t disksize;
int progress = 0;
+ ext4_lblk_t start = 0, end = 0, submitted = 0;
+ ext4_fsblk_t phy_start = 0;
mpd->io_submit.io_end->offset =
((loff_t)map->m_lblk) << inode->i_blkbits;
@@ -2565,13 +2577,27 @@ static int mpage_map_and_submit_extent(handle_t *handle,
return err;
}
progress = 1;
+
+ if (mpd->io_submit.io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ start = mpd->map.m_lblk;
+ end = start + mpd->map.m_len - 1;
+ phy_start = mpd->map.m_pblk;
+ }
/*
* Update buffer state, submit mapped pages, and get us new
* extent to map
*/
err = mpage_map_and_submit_buffers(mpd);
- if (err < 0)
+ if (err < 0) {
+ submitted = mpd->io_submit.io_end->size >>
+ inode->i_blkbits;
+ if (mpd->io_submit.io_end->flag & EXT4_IO_END_UNWRITTEN
+ && submitted < (end - start + 1))
+ ext4_es_clear_io_status(inode,
+ start + submitted, end,
+ phy_start + submitted);
goto update_disksize;
+ }
} while (map->m_len);
update_disksize:
--
2.17.2
With "nodelalloc", blocks are allocated at the time of writing, and with
"dioread_nolock", these allocated blocks are marked as unwritten as well,
so bh(s) attached to the blocks have BH_Unwritten and BH_Mapped.
Everything looks normal except with "dioread_nolock", all allocated
extents are with EXT4_GET_BLOCKS_PRE_IO, which doesn't allow merging
adjacent extents.
And when it comes to writepages, given the fact that bh marked as
BH_Unwritten, it has to hold a journal handle to process these extents,
but when writepages() prepared a bunch of pages in a mpd, it could only
find one block to map to and submit one page at a time, and loop to the
next page over and over again.
ext4_writepages
...
# starting from the 1st dirty page
ext4_journal_start_with_reserve
mpage_prepare_extent_to_map
# batch up to 2048 dirty pages
mpage_map_and_submit_extent
mpage_map_one_extent
ext4_map_blocks #with EXT4_GET_BLOCKS_IO_CREATE_EXT
ext4_ext_map_blocks
ext4_find_extent
# find an extent with only one block at the offset
ext4_ext_handle_unwritten_extents
# try to split due to EXT4_GET_BLOCKS_PRE_IO,
# but no need to in this case as there is
# only one block in this extent
mpage_map_and_submit_buffers
#submit io for only 1st page
#start from the 2nd dirty page
...
---
Given this is for buffered writes, the nice thing we want from
"dioread_nolock" is that extents are converted from unwritten at endio, so
thus we really don't have to take PRE_IO which is desigend for direct IO
path originally.
With this, we do extent merging in case of "nodelalloc" and writeback
doesn't need to do those extra batching and looping, the performance
number is shown as follows:
mount -o dioread_nolock,nodelalloc /dev/loop0 /mnt/
xfs_io -f -c "pwrite -W 0 1G" $M/foobar
- w/o:
wrote 1073741824/1073741824 bytes at offset 0
1 GiB, 262144 ops; 0:02:27.00 (6.951 MiB/sec and 1779.3791 ops/sec)
- w/
wrote 1073741824/1073741824 bytes at offset 0
1 GiB, 262144 ops; 0:00:06.00 (161.915 MiB/sec and 41450.3184 ops/sec)
Signed-off-by: Liu Bo <[email protected]>
Signed-off-by: Xiaoguang Wang <[email protected]>
---
fs/ext4/extents.c | 6 +++++-
fs/ext4/inode.c | 2 +-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 444c739470a5..de73b0152892 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4038,9 +4038,13 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
/*
* repeat fallocate creation request
* we already have an unwritten extent
+ *
+ * With nodelalloc + dioread_nolock, write can also come here,
+ * so make sure map is set with new to avoid exposing stale
+ * data to reads.
*/
if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
- map->m_flags |= EXT4_MAP_UNWRITTEN;
+ map->m_flags |= EXT4_MAP_UNWRITTEN | EXT4_MAP_NEW;
goto map_out;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ba557a731081..4dbb43ab9d6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -822,7 +822,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
inode->i_ino, create);
return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
}
/* Maximum number of blocks we map for direct IO at once. */
--
2.17.2
On Sat, Dec 15, 2018 at 01:48:40PM +0800, Xiaoguang Wang wrote:
> With "nodelalloc", blocks are allocated at the time of writing, and with
> "dioread_nolock", these allocated blocks are marked as unwritten as well,
> so bh(s) attached to the blocks have BH_Unwritten and BH_Mapped.
I've been looking at your patches, and it seems that a simpler way,
perhaps more maintainable approach in the long term is to change how
we write to newly allocated blocks. Today, we have two ways of doing
this:
1) In the dioread_nolock case, we allocate blocks, insert an entry in
the extent tree with the blocks marked uninitialized, write the
blocks, and then mark the blocks initialized.
2) In the !dioread_nolock case, we allocate blocks, insert an entry to
the extent tree, write the blocks --- and if we start a commit, we
write out all dirty pages associated with that inode (in the default
data=writeback case) to avoid stale writes.
So what if we change the dioread_nolock case to do write the blocks
first, and *then* insert the entry into the extent tree? This avoids
stale data getting exposed, either by a direct I/O read, or after a
crash (which means we avoid needing to do the force write-out).
So what we would need to do is to pass a flag to ext4_map_blocks()
which causes it to *not* make any on-disk changes. Instead, it would
track the fact that blocks have be reserved in the buddy bitmap (this
is how we prevent blocks from being preallocated after they are
deleted, but before the transaction has been committed), and the
location of the assigned blocks in the extent_status tree. Since no
on-disk changes are being made, we wouldn't need to hold the
transaction open.
Then in the callback after the blocks are written, using the starting
logical block number stored in the io_end structure, we either convert
the unwritten extents or actually insert the newly allocated blocks in
the extent tree and update the on-disk bitmap allocation bitmaps.
Once we get this working, it should be easy to make dioread_nolock for
1k block sizes; it keeps the time that the handle open very short; and
it completely obviates the need for data=writeback.
What do folks think?
- Ted
On Wed 23-01-19 01:27:38, Theodore Y. Ts'o wrote:
> On Sat, Dec 15, 2018 at 01:48:40PM +0800, Xiaoguang Wang wrote:
> > With "nodelalloc", blocks are allocated at the time of writing, and with
> > "dioread_nolock", these allocated blocks are marked as unwritten as well,
> > so bh(s) attached to the blocks have BH_Unwritten and BH_Mapped.
>
> I've been looking at your patches, and it seems that a simpler way,
> perhaps more maintainable approach in the long term is to change how
> we write to newly allocated blocks. Today, we have two ways of doing
> this:
>
> 1) In the dioread_nolock case, we allocate blocks, insert an entry in
> the extent tree with the blocks marked uninitialized, write the
> blocks, and then mark the blocks initialized.
>
> 2) In the !dioread_nolock case, we allocate blocks, insert an entry to
> the extent tree, write the blocks --- and if we start a commit, we
> write out all dirty pages associated with that inode (in the default
> data=writeback case) to avoid stale writes.
>
> So what if we change the dioread_nolock case to do write the blocks
> first, and *then* insert the entry into the extent tree? This avoids
> stale data getting exposed, either by a direct I/O read, or after a
> crash (which means we avoid needing to do the force write-out).
>
> So what we would need to do is to pass a flag to ext4_map_blocks()
> which causes it to *not* make any on-disk changes. Instead, it would
> track the fact that blocks have be reserved in the buddy bitmap (this
> is how we prevent blocks from being preallocated after they are
> deleted, but before the transaction has been committed), and the
> location of the assigned blocks in the extent_status tree. Since no
> on-disk changes are being made, we wouldn't need to hold the
> transaction open.
>
> Then in the callback after the blocks are written, using the starting
> logical block number stored in the io_end structure, we either convert
> the unwritten extents or actually insert the newly allocated blocks in
> the extent tree and update the on-disk bitmap allocation bitmaps.
>
> Once we get this working, it should be easy to make dioread_nolock for
> 1k block sizes; it keeps the time that the handle open very short; and
> it completely obviates the need for data=writeback.
>
> What do folks think?
Hum, so there is the problem that adding extent to the extent tree may need
some block allocations for metadata. So we'd have to carry over delalloc
reservations upto the io-end time. But that should be doable, just needs
some work. Also in the dioread_nolock case we don't have problems with
page lock & page writeback vs transaction start deadlocks as I've described
in my another email regarding ext4_writepages(). So I don't see any hole in
this and the performance should be good. I like this!
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, Jan 22, 2019 at 10:30 PM Theodore Y. Ts'o <[email protected]> wrote:
>
> On Sat, Dec 15, 2018 at 01:48:40PM +0800, Xiaoguang Wang wrote:
> > With "nodelalloc", blocks are allocated at the time of writing, and with
> > "dioread_nolock", these allocated blocks are marked as unwritten as well,
> > so bh(s) attached to the blocks have BH_Unwritten and BH_Mapped.
>
> I've been looking at your patches, and it seems that a simpler way,
> perhaps more maintainable approach in the long term is to change how
> we write to newly allocated blocks. Today, we have two ways of doing
> this:
>
> 1) In the dioread_nolock case, we allocate blocks, insert an entry in
> the extent tree with the blocks marked uninitialized, write the
> blocks, and then mark the blocks initialized.
>
> 2) In the !dioread_nolock case, we allocate blocks, insert an entry to
> the extent tree, write the blocks --- and if we start a commit, we
> write out all dirty pages associated with that inode (in the default
> data=writeback case) to avoid stale writes.
>
> So what if we change the dioread_nolock case to do write the blocks
> first, and *then* insert the entry into the extent tree? This avoids
> stale data getting exposed, either by a direct I/O read, or after a
> crash (which means we avoid needing to do the force write-out).
>
> So what we would need to do is to pass a flag to ext4_map_blocks()
> which causes it to *not* make any on-disk changes. Instead, it would
> track the fact that blocks have be reserved in the buddy bitmap (this
> is how we prevent blocks from being preallocated after they are
> deleted, but before the transaction has been committed), and the
> location of the assigned blocks in the extent_status tree. Since no
> on-disk changes are being made, we wouldn't need to hold the
> transaction open.
>
> Then in the callback after the blocks are written, using the starting
> logical block number stored in the io_end structure, we either convert
> the unwritten extents or actually insert the newly allocated blocks in
> the extent tree and update the on-disk bitmap allocation bitmaps.
>
> Once we get this working, it should be easy to make dioread_nolock for
> 1k block sizes; it keeps the time that the handle open very short; and
> it completely obviates the need for data=writeback.
>
> What do folks think?
>
So that (reserve, write, insert extent records) is basically what
btrfs is doing and I feel like it will work better than the current
way.
My only concern is performance since metadata reservation for delalloc
now becomes more and needs to be carried until endio, a perf. spike
would appear if the foreground writer needs to wait for flushing dirty
pages to reclaim metadata credits.
thanks,
liubo
hi,
> On Wed 23-01-19 01:27:38, Theodore Y. Ts'o wrote:
>> On Sat, Dec 15, 2018 at 01:48:40PM +0800, Xiaoguang Wang wrote:
>>> With "nodelalloc", blocks are allocated at the time of writing, and with
>>> "dioread_nolock", these allocated blocks are marked as unwritten as well,
>>> so bh(s) attached to the blocks have BH_Unwritten and BH_Mapped.
>>
>> I've been looking at your patches, and it seems that a simpler way,
>> perhaps more maintainable approach in the long term is to change how
>> we write to newly allocated blocks. Today, we have two ways of doing
>> this:
>>
>> 1) In the dioread_nolock case, we allocate blocks, insert an entry in
>> the extent tree with the blocks marked uninitialized, write the
>> blocks, and then mark the blocks initialized.
>>
>> 2) In the !dioread_nolock case, we allocate blocks, insert an entry to
>> the extent tree, write the blocks --- and if we start a commit, we
>> write out all dirty pages associated with that inode (in the default
>> data=writeback case) to avoid stale writes.
>>
>> So what if we change the dioread_nolock case to do write the blocks
>> first, and *then* insert the entry into the extent tree? This avoids
>> stale data getting exposed, either by a direct I/O read, or after a
>> crash (which means we avoid needing to do the force write-out).
>>
>> So what we would need to do is to pass a flag to ext4_map_blocks()
>> which causes it to *not* make any on-disk changes. Instead, it would
>> track the fact that blocks have be reserved in the buddy bitmap (this
>> is how we prevent blocks from being preallocated after they are
>> deleted, but before the transaction has been committed), and the
>> location of the assigned blocks in the extent_status tree. Since no
>> on-disk changes are being made, we wouldn't need to hold the
>> transaction open.
>>
>> Then in the callback after the blocks are written, using the starting
>> logical block number stored in the io_end structure, we either convert
>> the unwritten extents or actually insert the newly allocated blocks in
>> the extent tree and update the on-disk bitmap allocation bitmaps.
>>
>> Once we get this working, it should be easy to make dioread_nolock for
>> 1k block sizes; it keeps the time that the handle open very short; and
>> it completely obviates the need for data=writeback.
>>
>> What do folks think?
>
> Hum, so there is the problem that adding extent to the extent tree may need
> some block allocations for metadata. So we'd have to carry over delalloc
> reservations upto the io-end time. But that should be doable, just needs
> some work. Also in the dioread_nolock case we don't have problems with
> page lock & page writeback vs transaction start deadlocks as I've described
> in my another email regarding ext4_writepages(). So I don't see any hole in
> this and the performance should be good. I like this!
First sorry for late response, I was busy with some internal work.
I really like your ideas, thanks both of you. Writing dirty pages while holding
open jbd2 handle will result in some issues, for example, system load will be high,
many tasks in the D state, some of them is stuck in wait_transaction_locked().
Currently I have tried some methods to fix this issue, for example, change
MAX_WRITEPAGES_EXTENT_LEN to 256, then we can make sure that one bio will
be generated at most, and this bio will be submitted after journal stop:
if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
ext4_journal_stop(handle);
handle = NULL;
mpd.do_map = 0;
}
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
After this change, the number of tasks in D state will decrease much,
Of course, my method is not good, I just say writing diry pages while holding
open jbd2 handle is not good :) and I believe that your ideas can fix this kind
of problems, thanks again.
Regards,
Xiaoguang Wang
Regards,
Xiaoguang Wang
>
> Honza
>