2024-04-10 13:45:14

by Zhang Yi

[permalink] [raw]
Subject: [RFC PATCH v4 24/34] ext4: implement buffered write iomap path

From: Zhang Yi <[email protected]>

Implement buffered write iomap path, use ext4_da_map_blocks() to map
delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
delalloc is disabled or free space is about to run out.

Note that we always allocate unwritten extents for new blocks in the
iomap write path, which means that the allocation type is no longer
controlled by the dioread_nolock mount option. With that in place, we can
postpone the i_disksize update to the writeback path and drop the journal
handle in the buffered delalloc write path completely.

Signed-off-by: Zhang Yi <[email protected]>
---
fs/ext4/ext4.h | 3 +
fs/ext4/file.c | 19 +++++-
fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 183 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 05949a8136ae..2bd543c43341 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2970,6 +2970,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+int ext4_nonda_switch(struct super_block *sb);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2

@@ -3827,6 +3828,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
+extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;

static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 54d6ff22585c..52f37c49572a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -282,6 +282,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return count;
}

+static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ const struct iomap_ops *iomap_ops;
+
+ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+ iomap_ops = &ext4_iomap_buffered_da_write_ops;
+ else
+ iomap_ops = &ext4_iomap_buffered_write_ops;
+
+ return iomap_file_buffered_write(iocb, from, iomap_ops);
+}
+
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
@@ -296,7 +310,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;

- ret = generic_perform_write(iocb, from);
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ ret = ext4_iomap_buffered_write(iocb, from);
+ else
+ ret = generic_perform_write(iocb, from);

out:
inode_unlock(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 20eb772f4f62..e825ed16fd60 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2857,7 +2857,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
return ret;
}

-static int ext4_nonda_switch(struct super_block *sb)
+int ext4_nonda_switch(struct super_block *sb)
{
s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3254,6 +3254,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
return inode->i_state & I_DIRTY_DATASYNC;
}

+static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
+{
+ return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
+}
+
+static const struct iomap_folio_ops ext4_iomap_folio_ops = {
+ .iomap_valid = ext4_iomap_valid,
+};
+
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
loff_t length, unsigned int flags)
@@ -3284,6 +3293,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
iomap->flags |= IOMAP_F_MERGED;

+ iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
+ iomap->folio_ops = &ext4_iomap_folio_ops;
+
/*
* Flags passed to ext4_map_blocks() for direct I/O writes can result
* in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3523,11 +3535,42 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};

-static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
+static int ext4_iomap_get_blocks(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ handle_t *handle;
+ int ret, needed_blocks;
+
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason.
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ /*
+ * Have to stop the journal here since there is a potential deadlock
+ * caused by a later balance_dirty_pages(): it might wait on the
+ * dirty pages to be written back, which might start another
+ * handle and wait for this handle to stop.
+ */
+ ext4_journal_stop(handle);
+
+ return ret;
+}
+
+#define IOMAP_F_EXT4_DELALLOC IOMAP_F_PRIVATE
+
+static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
loff_t length, unsigned int iomap_flags,
- struct iomap *iomap, struct iomap *srcmap)
+ struct iomap *iomap, struct iomap *srcmap,
+ bool delalloc)
{
- int ret;
+ int ret, retries = 0;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;

@@ -3537,20 +3580,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
return -EINVAL;
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
return -ERANGE;
-
+retry:
/* Calculate the first and last logical blocks respectively. */
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ if (iomap_flags & IOMAP_WRITE) {
+ if (delalloc)
+ ret = ext4_da_map_blocks(inode, &map);
+ else
+ ret = ext4_iomap_get_blocks(inode, &map);

- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ } else {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
if (ret < 0)
return ret;

ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
+ if (delalloc)
+ iomap->flags |= IOMAP_F_EXT4_DELALLOC;
+
+ return 0;
+}
+
+static inline int ext4_iomap_buffered_io_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
+ iomap, srcmap, false);
+}
+
+static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
+ iomap, srcmap, true);
+}
+
+/*
+ * Drop the stale delayed allocation range left by a write failure,
+ * including both start and end blocks. Otherwise we could leave a range
+ * of delayed extents covered by a clean folio, which could lead to
+ * inaccurate space reservation.
+ */
+static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+ ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
+ DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
return 0;
}

+static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
+ loff_t length, ssize_t written,
+ unsigned int flags,
+ struct iomap *iomap)
+{
+ handle_t *handle;
+ loff_t end;
+ int ret = 0, ret2;
+
+ /* delalloc */
+ if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
+ ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
+ offset, length, written, ext4_iomap_punch_delalloc);
+ if (ret)
+ ext4_warning(inode->i_sb,
+ "Failed to clean up delalloc for inode %lu, %d",
+ inode->i_ino, ret);
+ return ret;
+ }
+
+ /* nodelalloc */
+ end = offset + length;
+ if (!(iomap->flags & IOMAP_F_SIZE_CHANGED) && end <= inode->i_size)
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (iomap->flags & IOMAP_F_SIZE_CHANGED) {
+ ext4_update_i_disksize(inode, inode->i_size);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ }
+
+ /*
+ * If we have allocated more blocks than we copied data into,
+ * we will have blocks allocated outside inode->i_size,
+ * so truncate them.
+ */
+ if (end > inode->i_size)
+ ext4_orphan_add(handle, inode);
+
+ ret2 = ext4_journal_stop(handle);
+ ret = ret ? : ret2;
+
+ if (end > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ return ret;
+}
+
+
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_io_begin,
+ .iomap_end = ext4_iomap_buffered_write_end,
+};
+
+const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_da_write_begin,
+ .iomap_end = ext4_iomap_buffered_write_end,
+};
+
const struct iomap_ops ext4_iomap_buffered_read_ops = {
.iomap_begin = ext4_iomap_buffered_io_begin,
};
--
2.39.2



2024-05-06 11:22:12

by Zhang Yi

[permalink] [raw]
Subject: Re: [RFC PATCH v4 24/34] ext4: implement buffered write iomap path

On 2024/5/1 16:11, Dave Chinner wrote:
> On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <[email protected]>
>>
>> Implement buffered write iomap path, use ext4_da_map_blocks() to map
>> delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
>> delalloc is disabled or free space is about to run out.
>>
>> Note that we always allocate unwritten extents for new blocks in the
>> iomap write path, this means that the allocation type is no longer
>> controlled by the dioread_nolock mount option. After that, we could
>> postpone the i_disksize updating to the writeback path, and drop journal
>> handle in the buffered dealloc write path completely.
>>
>> Signed-off-by: Zhang Yi <[email protected]>
>> ---
>> fs/ext4/ext4.h | 3 +
>> fs/ext4/file.c | 19 +++++-
>> fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
>> 3 files changed, 183 insertions(+), 7 deletions(-)
>>
[...]
>> +#define IOMAP_F_EXT4_DELALLOC IOMAP_F_PRIVATE
>> +
>> +static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>> loff_t length, unsigned int iomap_flags,
>> - struct iomap *iomap, struct iomap *srcmap)
>> + struct iomap *iomap, struct iomap *srcmap,
>> + bool delalloc)
>> {
>> - int ret;
>> + int ret, retries = 0;
>> struct ext4_map_blocks map;
>> u8 blkbits = inode->i_blkbits;
>>
>> @@ -3537,20 +3580,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>> return -EINVAL;
>> if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>> return -ERANGE;
>> -
>> +retry:
>> /* Calculate the first and last logical blocks respectively. */
>> map.m_lblk = offset >> blkbits;
>> map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>> EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
>> + if (iomap_flags & IOMAP_WRITE) {
>> + if (delalloc)
>> + ret = ext4_da_map_blocks(inode, &map);
>> + else
>> + ret = ext4_iomap_get_blocks(inode, &map);
>>
>> - ret = ext4_map_blocks(NULL, inode, &map, 0);
>> + if (ret == -ENOSPC &&
>> + ext4_should_retry_alloc(inode->i_sb, &retries))
>> + goto retry;
>> + } else {
>> + ret = ext4_map_blocks(NULL, inode, &map, 0);
>> + }
>> if (ret < 0)
>> return ret;
>>
>> ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
>> + if (delalloc)
>> + iomap->flags |= IOMAP_F_EXT4_DELALLOC;
>> +
>> + return 0;
>> +}
>
> Why are you implementing both read and write mapping paths in
> the one function? The whole point of having separate ops vectors for
> read and write is that it allows a clean separation of the read and
> write mapping operations. i.e. there is no need to use "if (write)
> else {do read}" code constructs at all.
>
> You can even have a different delalloc mapping function so you don't
> need "if (delalloc) else {do nonda}" branches everywhere...
>

It's because the current ->iomap_begin() for the ext4 buffered IO path
(i.e. __ext4_iomap_buffered_io_begin()) is simple: almost the only
difference between the read, da write and non-da write paths is the
map blocks handler, while the function parameter checks and inode
status checks are the same. I also noticed that the ->iomap_begin()
for the direct IO path (i.e. ext4_iomap_begin()) is implemented in one
function as well. So I'd like to save some code for now; implementing
them in one function doesn't seem to make it too complicated, and I
guess we could split them if things change in the future.

But thinking about it again, splitting them now could make things
clearer, so that's also fine with me.

Thanks,
Yi.


2024-05-06 11:45:06

by Zhang Yi

[permalink] [raw]
Subject: Re: [RFC PATCH v4 24/34] ext4: implement buffered write iomap path

On 2024/5/1 16:33, Dave Chinner wrote:
> On Wed, May 01, 2024 at 06:11:13PM +1000, Dave Chinner wrote:
>> On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
>>> From: Zhang Yi <[email protected]>
>>>
>>> Implement buffered write iomap path, use ext4_da_map_blocks() to map
>>> delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
>>> delalloc is disabled or free space is about to run out.
>>>
>>> Note that we always allocate unwritten extents for new blocks in the
>>> iomap write path, this means that the allocation type is no longer
>>> controlled by the dioread_nolock mount option. After that, we could
>>> postpone the i_disksize updating to the writeback path, and drop journal
>>> handle in the buffered dealloc write path completely.
> .....
>>> +/*
>>> + * Drop the staled delayed allocation range from the write failure,
>>> + * including both start and end blocks. If not, we could leave a range
>>> + * of delayed extents covered by a clean folio, it could lead to
>>> + * inaccurate space reservation.
>>> + */
>>> +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
>>> + loff_t length)
>>> +{
>>> + ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
>>> + DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
>>> return 0;
>>> }
>>>
>>> +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
>>> + loff_t length, ssize_t written,
>>> + unsigned int flags,
>>> + struct iomap *iomap)
>>> +{
>>> + handle_t *handle;
>>> + loff_t end;
>>> + int ret = 0, ret2;
>>> +
>>> + /* delalloc */
>>> + if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
>>> + ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
>>> + offset, length, written, ext4_iomap_punch_delalloc);
>>> + if (ret)
>>> + ext4_warning(inode->i_sb,
>>> + "Failed to clean up delalloc for inode %lu, %d",
>>> + inode->i_ino, ret);
>>> + return ret;
>>> + }
>>
>> Why are you creating a delalloc extent for the write operation and
>> then immediately deleting it from the extent tree once the write
>> operation is done?
>
> Ignore this, I mixed up the ext4_iomap_punch_delalloc() code
> directly above with iomap_file_buffered_write_punch_delalloc().
>
> In hindsight, iomap_file_buffered_write_punch_delalloc() is poorly
> named, as it is handling a short write situation which requires
> newly allocated delalloc blocks to be punched.
> iomap_file_buffered_write_finish() would probably be a better name
> for it....
>
>> Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap
>> set up with iomap->type = IOMAP_DELALLOC? Why can't that be used?
>
> But this still stands - the first thing
> iomap_file_buffered_write_punch_delalloc() does is:
>
> if (iomap->type != IOMAP_DELALLOC)
> return 0;
>

Thanks for the suggestion. The delalloc and non-delalloc write paths
share the same ->iomap_end() now (i.e. ext4_iomap_buffered_write_end()),
so I use IOMAP_F_EXT4_DELALLOC to identify the write path. For the
non-delalloc path, if we have allocated more blocks and copied less, we
should truncate the extra blocks that were newly allocated by
->iomap_begin(). If we use IOMAP_DELALLOC, we can't tell whether the
blocks are pre-existing or newly allocated, and we can't truncate the
pre-existing blocks, so I had to introduce IOMAP_F_EXT4_DELALLOC. But if
we split the delalloc and non-delalloc handlers, we could drop
IOMAP_F_EXT4_DELALLOC.

I also checked xfs. IIUC, xfs doesn't free the extra blocks beyond EOF
in xfs_buffered_write_iomap_end() for the non-delalloc case, since they
will be freed by xfs_free_eofblocks() in some other inactive paths, like
xfs_release()/xfs_inactive()/..., is that right?

Thanks,
Yi.