From: Zhang Yi <[email protected]>
In the dio write path, we only take shared inode lock for the case of
aligned overwriting initialized blocks inside EOF. But for overwriting
preallocated blocks, it may only need to split unwritten extents, this
procedure has been protected under i_data_sem lock, it's safe to
release the exclusive inode lock and take shared inode lock.
This could give a significant speed up for multi-threaded writes. Test
on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
direct=1
ioengine=libaio
iodepth=10
numjobs=10
runtime=60
rw=randwrite
size=100G
And the test result are:
Before:
bs=4k IOPS=11.1k, BW=43.2MiB/s
bs=16k IOPS=11.1k, BW=173MiB/s
bs=64k IOPS=11.2k, BW=697MiB/s
After:
bs=4k IOPS=41.4k, BW=162MiB/s
bs=16k IOPS=41.3k, BW=646MiB/s
bs=64k IOPS=13.5k, BW=843MiB/s
Signed-off-by: Zhang Yi <[email protected]>
---
v2->v1:
- Negate the 'inited' related arguments to 'unwritten'.
fs/ext4/file.c | 34 ++++++++++++++++++++++------------
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a7a597c727e6..21abe95a0ee7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
return false;
}
-/* Is IO overwriting allocated and initialized blocks? */
-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+/* Is IO overwriting allocated or initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode,
+ loff_t pos, loff_t len, bool *unwritten)
{
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
@@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
blklen = map.m_len;
err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err != blklen)
+ return false;
/*
* 'err==len' means that all of the blocks have been preallocated,
- * regardless of whether they have been initialized or not. To exclude
- * unwritten extents, we need to check m_flags.
+ * regardless of whether they have been initialized or not. We need to
+ * check m_flags to distinguish the unwritten extents.
*/
- return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
+ return true;
}
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
@@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
* - For extending writes case we don't take the shared lock, since it requires
* updating inode i_disksize and/or orphan handling with exclusive lock.
*
- * - shared locking will only be true mostly with overwrites. Otherwise we will
- * switch to exclusive i_rwsem lock.
+ * - shared locking will only be true mostly with overwrites, including
+ * initialized blocks and unwritten blocks. For overwrite unwritten blocks
+ * we protect splitting extents by i_data_sem in ext4_inode_info, so we can
+ * also release exclusive i_rwsem lock.
+ *
+ * - Otherwise we will switch to exclusive i_rwsem lock.
*/
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
- bool *ilock_shared, bool *extend)
+ bool *ilock_shared, bool *extend,
+ bool *unwritten)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
* in file_modified().
*/
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
- !ext4_overwrite_io(inode, offset, count))) {
+ !ext4_overwrite_io(inode, offset, count, unwritten))) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
@@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
- bool extend = false, unaligned_io = false;
+ bool extend = false, unaligned_io = false, unwritten = false;
bool ilock_shared = true;
/*
@@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_buffered_write_iter(iocb, from);
}
- ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+ ret = ext4_dio_write_checks(iocb, from,
+ &ilock_shared, &extend, &unwritten);
if (ret <= 0)
return ret;
@@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ext4_journal_stop(handle);
}
- if (ilock_shared)
+ if (ilock_shared && !unwritten)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
--
2.31.1
On Mon 26-12-22 14:20:15, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.
>
> This could give a significant speed up for multi-threaded writes. Test
> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>
> direct=1
> ioengine=libaio
> iodepth=10
> numjobs=10
> runtime=60
> rw=randwrite
> size=100G
>
> And the test result are:
> Before:
> bs=4k IOPS=11.1k, BW=43.2MiB/s
> bs=16k IOPS=11.1k, BW=173MiB/s
> bs=64k IOPS=11.2k, BW=697MiB/s
>
> After:
> bs=4k IOPS=41.4k, BW=162MiB/s
> bs=16k IOPS=41.3k, BW=646MiB/s
> bs=64k IOPS=13.5k, BW=843MiB/s
>
> Signed-off-by: Zhang Yi <[email protected]>
Looks good to me. Feel free to add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> v2->v1:
> - Negate the 'inited' related arguments to 'unwritten'.
>
> fs/ext4/file.c | 34 ++++++++++++++++++++++------------
> 1 file changed, 22 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index a7a597c727e6..21abe95a0ee7 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
> return false;
> }
>
> -/* Is IO overwriting allocated and initialized blocks? */
> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> +/* Is IO overwriting allocated or initialized blocks? */
> +static bool ext4_overwrite_io(struct inode *inode,
> + loff_t pos, loff_t len, bool *unwritten)
> {
> struct ext4_map_blocks map;
> unsigned int blkbits = inode->i_blkbits;
> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> blklen = map.m_len;
>
> err = ext4_map_blocks(NULL, inode, &map, 0);
> + if (err != blklen)
> + return false;
> /*
> * 'err==len' means that all of the blocks have been preallocated,
> - * regardless of whether they have been initialized or not. To exclude
> - * unwritten extents, we need to check m_flags.
> + * regardless of whether they have been initialized or not. We need to
> + * check m_flags to distinguish the unwritten extents.
> */
> - return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
> + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
> + return true;
> }
>
> static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
> * - For extending writes case we don't take the shared lock, since it requires
> * updating inode i_disksize and/or orphan handling with exclusive lock.
> *
> - * - shared locking will only be true mostly with overwrites. Otherwise we will
> - * switch to exclusive i_rwsem lock.
> + * - shared locking will only be true mostly with overwrites, including
> + * initialized blocks and unwritten blocks. For overwrite unwritten blocks
> + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can
> + * also release exclusive i_rwsem lock.
> + *
> + * - Otherwise we will switch to exclusive i_rwsem lock.
> */
> static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> - bool *ilock_shared, bool *extend)
> + bool *ilock_shared, bool *extend,
> + bool *unwritten)
> {
> struct file *file = iocb->ki_filp;
> struct inode *inode = file_inode(file);
> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> * in file_modified().
> */
> if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> - !ext4_overwrite_io(inode, offset, count))) {
> + !ext4_overwrite_io(inode, offset, count, unwritten))) {
> if (iocb->ki_flags & IOCB_NOWAIT) {
> ret = -EAGAIN;
> goto out;
> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> loff_t offset = iocb->ki_pos;
> size_t count = iov_iter_count(from);
> const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
> - bool extend = false, unaligned_io = false;
> + bool extend = false, unaligned_io = false, unwritten = false;
> bool ilock_shared = true;
>
> /*
> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> return ext4_buffered_write_iter(iocb, from);
> }
>
> - ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
> + ret = ext4_dio_write_checks(iocb, from,
> + &ilock_shared, &extend, &unwritten);
> if (ret <= 0)
> return ret;
>
> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> ext4_journal_stop(handle);
> }
>
> - if (ilock_shared)
> + if (ilock_shared && !unwritten)
> iomap_ops = &ext4_iomap_overwrite_ops;
> ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
> (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
> --
> 2.31.1
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
Zhang Yi <[email protected]> writes:
> From: Zhang Yi <[email protected]>
>
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.
Ok. One question though. Should we be passing IOMAP_DIO_FORCE_WAIT
(in this case as well) which will wait for the completion of dio
request even if the submitted IO is not synchronous. Like how it's being
done for unaligned overwrites case [1].
What I am mostly curious to know about is, how do we take care of
unwritten
to written conversion without racing which can happen in a
seperate workqueue context and/or are there any zeroing of extents
involved in this scenario which can race with one another?
So, I think in case of a non-aligned write it make sense [1] because it
might involve zeroing of the partial blocks. But in this case as you
said this already happens within i_data_sem lock context, so it won't be
necessary. I still thought it will be worth while to confirm it's indeed
the case or not.
[1]:
https://lore.kernel.org/linux-ext4/[email protected]/
Oh, one of the patch might run into some patch conflict depending upon
which one gets picked first...
-ritesh
>
> This could give a significant speed up for multi-threaded writes. Test
> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>
> direct=1
> ioengine=libaio
> iodepth=10
> numjobs=10
> runtime=60
> rw=randwrite
> size=100G
>
> And the test result are:
> Before:
> bs=4k IOPS=11.1k, BW=43.2MiB/s
> bs=16k IOPS=11.1k, BW=173MiB/s
> bs=64k IOPS=11.2k, BW=697MiB/s
>
> After:
> bs=4k IOPS=41.4k, BW=162MiB/s
> bs=16k IOPS=41.3k, BW=646MiB/s
> bs=64k IOPS=13.5k, BW=843MiB/s
>
> Signed-off-by: Zhang Yi <[email protected]>
> ---
> v2->v1:
> - Negate the 'inited' related arguments to 'unwritten'.
>
> fs/ext4/file.c | 34 ++++++++++++++++++++++------------
> 1 file changed, 22 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index a7a597c727e6..21abe95a0ee7 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
> return false;
> }
>
> -/* Is IO overwriting allocated and initialized blocks? */
> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> +/* Is IO overwriting allocated or initialized blocks? */
> +static bool ext4_overwrite_io(struct inode *inode,
> + loff_t pos, loff_t len, bool *unwritten)
> {
> struct ext4_map_blocks map;
> unsigned int blkbits = inode->i_blkbits;
> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> blklen = map.m_len;
>
> err = ext4_map_blocks(NULL, inode, &map, 0);
> + if (err != blklen)
> + return false;
> /*
> * 'err==len' means that all of the blocks have been preallocated,
> - * regardless of whether they have been initialized or not. To exclude
> - * unwritten extents, we need to check m_flags.
> + * regardless of whether they have been initialized or not. We need to
> + * check m_flags to distinguish the unwritten extents.
> */
> - return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
> + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
> + return true;
> }
>
> static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
> * - For extending writes case we don't take the shared lock, since it requires
> * updating inode i_disksize and/or orphan handling with exclusive lock.
> *
> - * - shared locking will only be true mostly with overwrites. Otherwise we will
> - * switch to exclusive i_rwsem lock.
> + * - shared locking will only be true mostly with overwrites, including
> + * initialized blocks and unwritten blocks. For overwrite unwritten blocks
> + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can
> + * also release exclusive i_rwsem lock.
> + *
> + * - Otherwise we will switch to exclusive i_rwsem lock.
> */
> static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> - bool *ilock_shared, bool *extend)
> + bool *ilock_shared, bool *extend,
> + bool *unwritten)
> {
> struct file *file = iocb->ki_filp;
> struct inode *inode = file_inode(file);
> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> * in file_modified().
> */
> if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> - !ext4_overwrite_io(inode, offset, count))) {
> + !ext4_overwrite_io(inode, offset, count, unwritten))) {
> if (iocb->ki_flags & IOCB_NOWAIT) {
> ret = -EAGAIN;
> goto out;
> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> loff_t offset = iocb->ki_pos;
> size_t count = iov_iter_count(from);
> const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
> - bool extend = false, unaligned_io = false;
> + bool extend = false, unaligned_io = false, unwritten = false;
> bool ilock_shared = true;
>
> /*
> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> return ext4_buffered_write_iter(iocb, from);
> }
>
> - ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
> + ret = ext4_dio_write_checks(iocb, from,
> + &ilock_shared, &extend, &unwritten);
> if (ret <= 0)
> return ret;
>
> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> ext4_journal_stop(handle);
> }
>
> - if (ilock_shared)
> + if (ilock_shared && !unwritten)
> iomap_ops = &ext4_iomap_overwrite_ops;
> ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
> (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
> --
> 2.31.1
Hello.
On 2023/2/17 1:41, Ritesh Harjani (IBM) wrote:
> Zhang Yi <[email protected]> writes:
>
>> From: Zhang Yi <[email protected]>
>>
>> In the dio write path, we only take shared inode lock for the case of
>> aligned overwriting initialized blocks inside EOF. But for overwriting
>> preallocated blocks, it may only need to split unwritten extents, this
>> procedure has been protected under i_data_sem lock, it's safe to
>> release the exclusive inode lock and take shared inode lock.
>
> Ok. One question though. Should we be passing IOMAP_DIO_FORCE_WAIT
> (in this case as well) which will wait for the completion of dio
> request even if the submitted IO is not synchronous. Like how it's being
> done for unaligned overwrites case [1].
> What I am mostly curious to know about is, how do we take care of
> unwritten
> to written conversion without racing which can happen in a
> seperate workqueue context and/or are there any zeroing of extents
> involved in this scenario which can race with one another?
>
> So, I think in case of a non-aligned write it make sense [1] because it
> might involve zeroing of the partial blocks. But in this case as you
> said this already happens within i_data_sem lock context, so it won't be
> necessary. I still thought it will be worth while to confirm it's indeed
> the case or not.
>
I'm not quite get your question, do you mean passing IOMAP_DIO_FORCE_WAIT
for the case of unaligned writing to pre-allocated(unwritten) blocks?
IIUC, That's how it's done now if you only merge my patch. And it should be
cautious to slove the conflict if you also want to merge [1] together.
After looking at [1], I think it should be:
| pure overwrite | write to pre-allocated |
-------------------------------------------------------------|
aligned | share lock, nowait (1)| share lock, nowait (3) |
unaligned | share lock, nowait (2)| excl lock, wait (4) |
In case(3), each AIO-DIO's unwritten->written conversion do not disturb each
other because of the i_data_sem lock, and the potential zeroing extents(e.g.
ext4_zero_range()) also call inode_dio_wait() to wait DIO complete. So I don't
find any race problem.
Am I missing something? or which case you want to confirm?
Thanks,
Yi.
> [1]:
> https://lore.kernel.org/linux-ext4/[email protected]/
>
> Oh, one of the patch might run into some patch conflict depending upon
> which one gets picked first...
>
> -ritesh
>
>
>>
>> This could give a significant speed up for multi-threaded writes. Test
>> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>>
>> direct=1
>> ioengine=libaio
>> iodepth=10
>> numjobs=10
>> runtime=60
>> rw=randwrite
>> size=100G
>>
>> And the test result are:
>> Before:
>> bs=4k IOPS=11.1k, BW=43.2MiB/s
>> bs=16k IOPS=11.1k, BW=173MiB/s
>> bs=64k IOPS=11.2k, BW=697MiB/s
>>
>> After:
>> bs=4k IOPS=41.4k, BW=162MiB/s
>> bs=16k IOPS=41.3k, BW=646MiB/s
>> bs=64k IOPS=13.5k, BW=843MiB/s
>>
>> Signed-off-by: Zhang Yi <[email protected]>
>> ---
>> v2->v1:
>> - Negate the 'inited' related arguments to 'unwritten'.
>>
>> fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>> 1 file changed, 22 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index a7a597c727e6..21abe95a0ee7 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>> return false;
>> }
>>
>> -/* Is IO overwriting allocated and initialized blocks? */
>> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>> +/* Is IO overwriting allocated or initialized blocks? */
>> +static bool ext4_overwrite_io(struct inode *inode,
>> + loff_t pos, loff_t len, bool *unwritten)
>> {
>> struct ext4_map_blocks map;
>> unsigned int blkbits = inode->i_blkbits;
>> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>> blklen = map.m_len;
>>
>> err = ext4_map_blocks(NULL, inode, &map, 0);
>> + if (err != blklen)
>> + return false;
>> /*
>> * 'err==len' means that all of the blocks have been preallocated,
>> - * regardless of whether they have been initialized or not. To exclude
>> - * unwritten extents, we need to check m_flags.
>> + * regardless of whether they have been initialized or not. We need to
>> + * check m_flags to distinguish the unwritten extents.
>> */
>> - return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
>> + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
>> + return true;
>> }
>>
>> static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
>> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>> * - For extending writes case we don't take the shared lock, since it requires
>> * updating inode i_disksize and/or orphan handling with exclusive lock.
>> *
>> - * - shared locking will only be true mostly with overwrites. Otherwise we will
>> - * switch to exclusive i_rwsem lock.
>> + * - shared locking will only be true mostly with overwrites, including
>> + * initialized blocks and unwritten blocks. For overwrite unwritten blocks
>> + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can
>> + * also release exclusive i_rwsem lock.
>> + *
>> + * - Otherwise we will switch to exclusive i_rwsem lock.
>> */
>> static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>> - bool *ilock_shared, bool *extend)
>> + bool *ilock_shared, bool *extend,
>> + bool *unwritten)
>> {
>> struct file *file = iocb->ki_filp;
>> struct inode *inode = file_inode(file);
>> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>> * in file_modified().
>> */
>> if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
>> - !ext4_overwrite_io(inode, offset, count))) {
>> + !ext4_overwrite_io(inode, offset, count, unwritten))) {
>> if (iocb->ki_flags & IOCB_NOWAIT) {
>> ret = -EAGAIN;
>> goto out;
>> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>> loff_t offset = iocb->ki_pos;
>> size_t count = iov_iter_count(from);
>> const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
>> - bool extend = false, unaligned_io = false;
>> + bool extend = false, unaligned_io = false, unwritten = false;
>> bool ilock_shared = true;
>>
>> /*
>> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>> return ext4_buffered_write_iter(iocb, from);
>> }
>>
>> - ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
>> + ret = ext4_dio_write_checks(iocb, from,
>> + &ilock_shared, &extend, &unwritten);
>> if (ret <= 0)
>> return ret;
>>
>> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>> ext4_journal_stop(handle);
>> }
>>
>> - if (ilock_shared)
>> + if (ilock_shared && !unwritten)
>> iomap_ops = &ext4_iomap_overwrite_ops;
>> ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>> (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
>> --
>> 2.31.1
Zhang Yi <[email protected]> writes:
> Hello.
>
> On 2023/2/17 1:41, Ritesh Harjani (IBM) wrote:
>> Zhang Yi <[email protected]> writes:
>>
>>> From: Zhang Yi <[email protected]>
>>>
>>> In the dio write path, we only take shared inode lock for the case of
>>> aligned overwriting initialized blocks inside EOF. But for overwriting
>>> preallocated blocks, it may only need to split unwritten extents, this
>>> procedure has been protected under i_data_sem lock, it's safe to
>>> release the exclusive inode lock and take shared inode lock.
>>
>> Ok. One question though. Should we be passing IOMAP_DIO_FORCE_WAIT
>> (in this case as well) which will wait for the completion of dio
>> request even if the submitted IO is not synchronous. Like how it's being
>> done for unaligned overwrites case [1].
>> What I am mostly curious to know about is, how do we take care of
>> unwritten
>> to written conversion without racing which can happen in a
>> seperate workqueue context and/or are there any zeroing of extents
>> involved in this scenario which can race with one another?
>>
>> So, I think in case of a non-aligned write it make sense [1] because it
>> might involve zeroing of the partial blocks. But in this case as you
>> said this already happens within i_data_sem lock context, so it won't be
>> necessary. I still thought it will be worth while to confirm it's indeed
>> the case or not.
>>
>
> I'm not quite get your question, do you mean passing IOMAP_DIO_FORCE_WAIT
> for the case of unaligned writing to pre-allocated(unwritten) blocks?
> IIUC, That's how it's done now if you only merge my patch. And it should be
> cautious to slove the conflict if you also want to merge [1] together.
>
> After looking at [1], I think it should be:
>
> | pure overwrite | write to pre-allocated |
> -------------------------------------------------------------|
> aligned | share lock, nowait (1)| share lock, nowait (3) |
> unaligned | share lock, nowait (2)| excl lock, wait (4) |
>
> In case(3), each AIO-DIO's unwritten->written conversion do not disturb each
> other because of the i_data_sem lock, and the potential zeroing extents(e.g.
> ext4_zero_range()) also call inode_dio_wait() to wait DIO complete. So I don't
> find any race problem.
aah yes. Looks like we don't need IOMAP_DIO_FORCE_WAIT then for aligned
unwritten overwrite.
Because as you said conversion will be synchronized with i_data_sem lock
and other's (e.g. ext4_zero_range()) will synchronize using inode_dio_wait().
Make sense. Thanks for confirming it!!
Looks good to me. Feel free to add -
Reviewed-by: Ritesh Harjani (IBM) <[email protected]>
-ritesh
>
> Am I missing something? or which case you want to confirm?
>
> Thanks,
> Yi.
>
>> [1]:
>> https://lore.kernel.org/linux-ext4/[email protected]/
>>
>> Oh, one of the patch might run into some patch conflict depending upon
>> which one gets picked first...
>>
>> -ritesh
>>
>>
>>>
>>> This could give a significant speed up for multi-threaded writes. Test
>>> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>>>
>>> direct=1
>>> ioengine=libaio
>>> iodepth=10
>>> numjobs=10
>>> runtime=60
>>> rw=randwrite
>>> size=100G
>>>
>>> And the test result are:
>>> Before:
>>> bs=4k IOPS=11.1k, BW=43.2MiB/s
>>> bs=16k IOPS=11.1k, BW=173MiB/s
>>> bs=64k IOPS=11.2k, BW=697MiB/s
>>>
>>> After:
>>> bs=4k IOPS=41.4k, BW=162MiB/s
>>> bs=16k IOPS=41.3k, BW=646MiB/s
>>> bs=64k IOPS=13.5k, BW=843MiB/s
>>>
>>> Signed-off-by: Zhang Yi <[email protected]>
>>> ---
>>> v2->v1:
>>> - Negate the 'inited' related arguments to 'unwritten'.
>>>
>>> fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>>> 1 file changed, 22 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>>> index a7a597c727e6..21abe95a0ee7 100644
>>> --- a/fs/ext4/file.c
>>> +++ b/fs/ext4/file.c
>>> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>>> return false;
>>> }
>>>
>>> -/* Is IO overwriting allocated and initialized blocks? */
>>> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>>> +/* Is IO overwriting allocated or initialized blocks? */
>>> +static bool ext4_overwrite_io(struct inode *inode,
>>> + loff_t pos, loff_t len, bool *unwritten)
>>> {
>>> struct ext4_map_blocks map;
>>> unsigned int blkbits = inode->i_blkbits;
>>> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>>> blklen = map.m_len;
>>>
>>> err = ext4_map_blocks(NULL, inode, &map, 0);
>>> + if (err != blklen)
>>> + return false;
>>> /*
>>> * 'err==len' means that all of the blocks have been preallocated,
>>> - * regardless of whether they have been initialized or not. To exclude
>>> - * unwritten extents, we need to check m_flags.
>>> + * regardless of whether they have been initialized or not. We need to
>>> + * check m_flags to distinguish the unwritten extents.
>>> */
>>> - return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
>>> + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
>>> + return true;
>>> }
>>>
>>> static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
>>> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>>> * - For extending writes case we don't take the shared lock, since it requires
>>> * updating inode i_disksize and/or orphan handling with exclusive lock.
>>> *
>>> - * - shared locking will only be true mostly with overwrites. Otherwise we will
>>> - * switch to exclusive i_rwsem lock.
>>> + * - shared locking will only be true mostly with overwrites, including
>>> + * initialized blocks and unwritten blocks. For overwrite unwritten blocks
>>> + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can
>>> + * also release exclusive i_rwsem lock.
>>> + *
>>> + * - Otherwise we will switch to exclusive i_rwsem lock.
>>> */
>>> static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>>> - bool *ilock_shared, bool *extend)
>>> + bool *ilock_shared, bool *extend,
>>> + bool *unwritten)
>>> {
>>> struct file *file = iocb->ki_filp;
>>> struct inode *inode = file_inode(file);
>>> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>>> * in file_modified().
>>> */
>>> if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
>>> - !ext4_overwrite_io(inode, offset, count))) {
>>> + !ext4_overwrite_io(inode, offset, count, unwritten))) {
>>> if (iocb->ki_flags & IOCB_NOWAIT) {
>>> ret = -EAGAIN;
>>> goto out;
>>> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>> loff_t offset = iocb->ki_pos;
>>> size_t count = iov_iter_count(from);
>>> const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
>>> - bool extend = false, unaligned_io = false;
>>> + bool extend = false, unaligned_io = false, unwritten = false;
>>> bool ilock_shared = true;
>>>
>>> /*
>>> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>> return ext4_buffered_write_iter(iocb, from);
>>> }
>>>
>>> - ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
>>> + ret = ext4_dio_write_checks(iocb, from,
>>> + &ilock_shared, &extend, &unwritten);
>>> if (ret <= 0)
>>> return ret;
>>>
>>> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>> ext4_journal_stop(handle);
>>> }
>>>
>>> - if (ilock_shared)
>>> + if (ilock_shared && !unwritten)
>>> iomap_ops = &ext4_iomap_overwrite_ops;
>>> ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>>> (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
>>> --
>>> 2.31.1
On Mon, 26 Dec 2022 14:20:15 +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.
>
> [...]
Applied, thanks!
[1/1] ext4: dio take shared inode lock when overwriting preallocated blocks
commit: 240930fb7e6b52229bdee5b1423bfeab0002fed2
Best regards,
--
Theodore Ts'o <[email protected]>