2024-05-10 02:39:21

by Chao Yu

[permalink] [raw]
Subject: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

If lfs mode is on, buffered read may race w/ OPU dio write as below,
it may cause buffered read hits unwritten data unexpectly, and for
dio read, the race condition exists as well.

Thread A Thread B
- f2fs_file_write_iter
- f2fs_dio_write_iter
- __iomap_dio_rw
- f2fs_iomap_begin
- f2fs_map_blocks
- __allocate_data_block
- allocated blkaddr #x
- iomap_dio_submit_bio
- f2fs_file_read_iter
- filemap_read
- f2fs_read_data_folio
- f2fs_mpage_readpages
- f2fs_map_blocks
: get blkaddr #x
- f2fs_submit_read_bio
IRQ
- f2fs_read_end_io
: read IO on blkaddr #x complete
IRQ
- iomap_dio_bio_end_io
: direct write IO on blkaddr #x complete

This patch introduces a new per-inode i_opu_rwsem lock to avoid
such race condition.

Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
Signed-off-by: Chao Yu <[email protected]>
---
v2:
- fix to cover dio read path w/ i_opu_rwsem as well.
fs/f2fs/f2fs.h | 1 +
fs/f2fs/file.c | 28 ++++++++++++++++++++++++++--
fs/f2fs/super.c | 1 +
3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 30058e16a5d0..91cf4b3d6bc6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -847,6 +847,7 @@ struct f2fs_inode_info {
/* avoid racing between foreground op and gc */
struct f2fs_rwsem i_gc_rwsem[2];
struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
+ struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */

int i_extra_isize; /* size of extra space located in i_addr */
kprojid_t i_projid; /* id for project quota */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 72ce1a522fb2..4ec260af321f 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
const loff_t pos = iocb->ki_pos;
const size_t count = iov_iter_count(to);
struct iomap_dio *dio;
+ bool do_opu = f2fs_lfs_mode(sbi);
ssize_t ret;

if (count == 0)
@@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
ret = -EAGAIN;
goto out;
}
+ if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
+ ret = -EAGAIN;
+ goto out;
+ }
} else {
f2fs_down_read(&fi->i_gc_rwsem[READ]);
+ f2fs_down_read(&fi->i_opu_rwsem);
}

/*
@@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
ret = iomap_dio_complete(dio);
}

+ f2fs_up_read(&fi->i_opu_rwsem);
f2fs_up_read(&fi->i_gc_rwsem[READ]);

file_accessed(file);
@@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (f2fs_should_use_dio(inode, iocb, to)) {
ret = f2fs_dio_read_iter(iocb, to);
} else {
+ bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
+
+ if (do_opu)
+ f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
ret = filemap_read(iocb, to, 0);
+ if (do_opu)
+ f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
if (ret > 0)
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_BUFFERED_READ_IO, ret);
@@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
ret = -EAGAIN;
goto out;
}
+ if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
+ ret = -EAGAIN;
+ goto out;
+ }
} else {
ret = f2fs_convert_inline_inode(inode);
if (ret)
goto out;

f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
- if (do_opu)
+ if (do_opu) {
f2fs_down_read(&fi->i_gc_rwsem[READ]);
+ f2fs_down_write(&fi->i_opu_rwsem);
+ }
}

/*
@@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
ret = iomap_dio_complete(dio);
}

- if (do_opu)
+ if (do_opu) {
+ f2fs_up_write(&fi->i_opu_rwsem);
f2fs_up_read(&fi->i_gc_rwsem[READ]);
+ }
f2fs_up_read(&fi->i_gc_rwsem[WRITE]);

if (ret < 0)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index daf2c4dbe150..b4ed3b094366 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
init_f2fs_rwsem(&fi->i_xattr_sem);
+ init_f2fs_rwsem(&fi->i_opu_rwsem);

/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
--
2.40.1



2024-05-14 16:18:23

by Jaegeuk Kim

[permalink] [raw]
Subject: Re: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

On 05/10, Chao Yu wrote:
> If lfs mode is on, buffered read may race w/ OPU dio write as below,
> it may cause buffered read hits unwritten data unexpectly, and for
> dio read, the race condition exists as well.
>
> Thread A Thread B
> - f2fs_file_write_iter
> - f2fs_dio_write_iter
> - __iomap_dio_rw
> - f2fs_iomap_begin
> - f2fs_map_blocks
> - __allocate_data_block
> - allocated blkaddr #x
> - iomap_dio_submit_bio
> - f2fs_file_read_iter
> - filemap_read
> - f2fs_read_data_folio
> - f2fs_mpage_readpages
> - f2fs_map_blocks
> : get blkaddr #x
> - f2fs_submit_read_bio
> IRQ
> - f2fs_read_end_io
> : read IO on blkaddr #x complete
> IRQ
> - iomap_dio_bio_end_io
> : direct write IO on blkaddr #x complete
>
> This patch introduces a new per-inode i_opu_rwsem lock to avoid
> such race condition.

Wasn't this supposed to be managed by user-land?

>
> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
> Signed-off-by: Chao Yu <[email protected]>
> ---
> v2:
> - fix to cover dio read path w/ i_opu_rwsem as well.
> fs/f2fs/f2fs.h | 1 +
> fs/f2fs/file.c | 28 ++++++++++++++++++++++++++--
> fs/f2fs/super.c | 1 +
> 3 files changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 30058e16a5d0..91cf4b3d6bc6 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -847,6 +847,7 @@ struct f2fs_inode_info {
> /* avoid racing between foreground op and gc */
> struct f2fs_rwsem i_gc_rwsem[2];
> struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
> + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */
>
> int i_extra_isize; /* size of extra space located in i_addr */
> kprojid_t i_projid; /* id for project quota */
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 72ce1a522fb2..4ec260af321f 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> const loff_t pos = iocb->ki_pos;
> const size_t count = iov_iter_count(to);
> struct iomap_dio *dio;
> + bool do_opu = f2fs_lfs_mode(sbi);
> ssize_t ret;
>
> if (count == 0)
> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> ret = -EAGAIN;
> goto out;
> }
> + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
> + f2fs_up_read(&fi->i_gc_rwsem[READ]);
> + ret = -EAGAIN;
> + goto out;
> + }
> } else {
> f2fs_down_read(&fi->i_gc_rwsem[READ]);
> + f2fs_down_read(&fi->i_opu_rwsem);
> }
>
> /*
> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> ret = iomap_dio_complete(dio);
> }
>
> + f2fs_up_read(&fi->i_opu_rwsem);
> f2fs_up_read(&fi->i_gc_rwsem[READ]);
>
> file_accessed(file);
> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
> if (f2fs_should_use_dio(inode, iocb, to)) {
> ret = f2fs_dio_read_iter(iocb, to);
> } else {
> + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
> +
> + if (do_opu)
> + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
> ret = filemap_read(iocb, to, 0);
> + if (do_opu)
> + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
> if (ret > 0)
> f2fs_update_iostat(F2FS_I_SB(inode), inode,
> APP_BUFFERED_READ_IO, ret);
> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
> ret = -EAGAIN;
> goto out;
> }
> + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
> + f2fs_up_read(&fi->i_gc_rwsem[READ]);
> + f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
> + ret = -EAGAIN;
> + goto out;
> + }
> } else {
> ret = f2fs_convert_inline_inode(inode);
> if (ret)
> goto out;
>
> f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
> - if (do_opu)
> + if (do_opu) {
> f2fs_down_read(&fi->i_gc_rwsem[READ]);
> + f2fs_down_write(&fi->i_opu_rwsem);
> + }
> }
>
> /*
> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
> ret = iomap_dio_complete(dio);
> }
>
> - if (do_opu)
> + if (do_opu) {
> + f2fs_up_write(&fi->i_opu_rwsem);
> f2fs_up_read(&fi->i_gc_rwsem[READ]);
> + }
> f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>
> if (ret < 0)
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index daf2c4dbe150..b4ed3b094366 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
> init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
> init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
> init_f2fs_rwsem(&fi->i_xattr_sem);
> + init_f2fs_rwsem(&fi->i_opu_rwsem);
>
> /* Will be used by directory only */
> fi->i_dir_level = F2FS_SB(sb)->dir_level;
> --
> 2.40.1

2024-05-15 01:42:32

by Chao Yu

[permalink] [raw]
Subject: Re: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

On 2024/5/15 0:09, Jaegeuk Kim wrote:
> On 05/10, Chao Yu wrote:
>> If lfs mode is on, buffered read may race w/ OPU dio write as below,
>> it may cause buffered read hits unwritten data unexpectly, and for
>> dio read, the race condition exists as well.
>>
>> Thread A Thread B
>> - f2fs_file_write_iter
>> - f2fs_dio_write_iter
>> - __iomap_dio_rw
>> - f2fs_iomap_begin
>> - f2fs_map_blocks
>> - __allocate_data_block
>> - allocated blkaddr #x
>> - iomap_dio_submit_bio
>> - f2fs_file_read_iter
>> - filemap_read
>> - f2fs_read_data_folio
>> - f2fs_mpage_readpages
>> - f2fs_map_blocks
>> : get blkaddr #x
>> - f2fs_submit_read_bio
>> IRQ
>> - f2fs_read_end_io
>> : read IO on blkaddr #x complete
>> IRQ
>> - iomap_dio_bio_end_io
>> : direct write IO on blkaddr #x complete
>>
>> This patch introduces a new per-inode i_opu_rwsem lock to avoid
>> such race condition.
>
> Wasn't this supposed to be managed by user-land?

Actually, the test case is:

1. mount w/ lfs mode
2. touch file;
3. initialize file w/ 4k zeroed data; fsync;
4. continue triggering dio write 4k zeroed data to file;
5. and meanwhile, continue triggering buf/dio 4k read in file,
use md5sum to verify the 4k data;

It expects data is all zero, however it turned out it's not.

Thanks,

>
>>
>> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
>> Signed-off-by: Chao Yu <[email protected]>
>> ---
>> v2:
>> - fix to cover dio read path w/ i_opu_rwsem as well.
>> fs/f2fs/f2fs.h | 1 +
>> fs/f2fs/file.c | 28 ++++++++++++++++++++++++++--
>> fs/f2fs/super.c | 1 +
>> 3 files changed, 28 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 30058e16a5d0..91cf4b3d6bc6 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -847,6 +847,7 @@ struct f2fs_inode_info {
>> /* avoid racing between foreground op and gc */
>> struct f2fs_rwsem i_gc_rwsem[2];
>> struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
>> + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */
>>
>> int i_extra_isize; /* size of extra space located in i_addr */
>> kprojid_t i_projid; /* id for project quota */
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 72ce1a522fb2..4ec260af321f 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> const loff_t pos = iocb->ki_pos;
>> const size_t count = iov_iter_count(to);
>> struct iomap_dio *dio;
>> + bool do_opu = f2fs_lfs_mode(sbi);
>> ssize_t ret;
>>
>> if (count == 0)
>> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> ret = -EAGAIN;
>> goto out;
>> }
>> + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
>> + f2fs_up_read(&fi->i_gc_rwsem[READ]);
>> + ret = -EAGAIN;
>> + goto out;
>> + }
>> } else {
>> f2fs_down_read(&fi->i_gc_rwsem[READ]);
>> + f2fs_down_read(&fi->i_opu_rwsem);
>> }
>>
>> /*
>> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> ret = iomap_dio_complete(dio);
>> }
>>
>> + f2fs_up_read(&fi->i_opu_rwsem);
>> f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>
>> file_accessed(file);
>> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> if (f2fs_should_use_dio(inode, iocb, to)) {
>> ret = f2fs_dio_read_iter(iocb, to);
>> } else {
>> + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
>> +
>> + if (do_opu)
>> + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
>> ret = filemap_read(iocb, to, 0);
>> + if (do_opu)
>> + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
>> if (ret > 0)
>> f2fs_update_iostat(F2FS_I_SB(inode), inode,
>> APP_BUFFERED_READ_IO, ret);
>> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>> ret = -EAGAIN;
>> goto out;
>> }
>> + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
>> + f2fs_up_read(&fi->i_gc_rwsem[READ]);
>> + f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>> + ret = -EAGAIN;
>> + goto out;
>> + }
>> } else {
>> ret = f2fs_convert_inline_inode(inode);
>> if (ret)
>> goto out;
>>
>> f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
>> - if (do_opu)
>> + if (do_opu) {
>> f2fs_down_read(&fi->i_gc_rwsem[READ]);
>> + f2fs_down_write(&fi->i_opu_rwsem);
>> + }
>> }
>>
>> /*
>> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>> ret = iomap_dio_complete(dio);
>> }
>>
>> - if (do_opu)
>> + if (do_opu) {
>> + f2fs_up_write(&fi->i_opu_rwsem);
>> f2fs_up_read(&fi->i_gc_rwsem[READ]);
>> + }
>> f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>
>> if (ret < 0)
>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>> index daf2c4dbe150..b4ed3b094366 100644
>> --- a/fs/f2fs/super.c
>> +++ b/fs/f2fs/super.c
>> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
>> init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
>> init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
>> init_f2fs_rwsem(&fi->i_xattr_sem);
>> + init_f2fs_rwsem(&fi->i_opu_rwsem);
>>
>> /* Will be used by directory only */
>> fi->i_dir_level = F2FS_SB(sb)->dir_level;
>> --
>> 2.40.1

2024-05-15 04:43:08

by Jaegeuk Kim

[permalink] [raw]
Subject: Re: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

On 05/15, Chao Yu wrote:
> On 2024/5/15 0:09, Jaegeuk Kim wrote:
> > On 05/10, Chao Yu wrote:
> > > If lfs mode is on, buffered read may race w/ OPU dio write as below,
> > > it may cause buffered read hits unwritten data unexpectly, and for
> > > dio read, the race condition exists as well.
> > >
> > > Thread A Thread B
> > > - f2fs_file_write_iter
> > > - f2fs_dio_write_iter
> > > - __iomap_dio_rw
> > > - f2fs_iomap_begin
> > > - f2fs_map_blocks
> > > - __allocate_data_block
> > > - allocated blkaddr #x
> > > - iomap_dio_submit_bio
> > > - f2fs_file_read_iter
> > > - filemap_read
> > > - f2fs_read_data_folio
> > > - f2fs_mpage_readpages
> > > - f2fs_map_blocks
> > > : get blkaddr #x
> > > - f2fs_submit_read_bio
> > > IRQ
> > > - f2fs_read_end_io
> > > : read IO on blkaddr #x complete
> > > IRQ
> > > - iomap_dio_bio_end_io
> > > : direct write IO on blkaddr #x complete
> > >
> > > This patch introduces a new per-inode i_opu_rwsem lock to avoid
> > > such race condition.
> >
> > Wasn't this supposed to be managed by user-land?
>
> Actually, the test case is:
>
> 1. mount w/ lfs mode
> 2. touch file;
> 3. initialize file w/ 4k zeroed data; fsync;
> 4. continue triggering dio write 4k zeroed data to file;
> 5. and meanwhile, continue triggering buf/dio 4k read in file,
> use md5sum to verify the 4k data;
>
> It expects data is all zero, however it turned out it's not.

Can we check outstanding write bios instead of abusing locks?

>
> Thanks,
>
> >
> > >
> > > Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
> > > Signed-off-by: Chao Yu <[email protected]>
> > > ---
> > > v2:
> > > - fix to cover dio read path w/ i_opu_rwsem as well.
> > > fs/f2fs/f2fs.h | 1 +
> > > fs/f2fs/file.c | 28 ++++++++++++++++++++++++++--
> > > fs/f2fs/super.c | 1 +
> > > 3 files changed, 28 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > > index 30058e16a5d0..91cf4b3d6bc6 100644
> > > --- a/fs/f2fs/f2fs.h
> > > +++ b/fs/f2fs/f2fs.h
> > > @@ -847,6 +847,7 @@ struct f2fs_inode_info {
> > > /* avoid racing between foreground op and gc */
> > > struct f2fs_rwsem i_gc_rwsem[2];
> > > struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
> > > + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */
> > >
> > > int i_extra_isize; /* size of extra space located in i_addr */
> > > kprojid_t i_projid; /* id for project quota */
> > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > index 72ce1a522fb2..4ec260af321f 100644
> > > --- a/fs/f2fs/file.c
> > > +++ b/fs/f2fs/file.c
> > > @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > const loff_t pos = iocb->ki_pos;
> > > const size_t count = iov_iter_count(to);
> > > struct iomap_dio *dio;
> > > + bool do_opu = f2fs_lfs_mode(sbi);
> > > ssize_t ret;
> > >
> > > if (count == 0)
> > > @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > ret = -EAGAIN;
> > > goto out;
> > > }
> > > + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
> > > + f2fs_up_read(&fi->i_gc_rwsem[READ]);
> > > + ret = -EAGAIN;
> > > + goto out;
> > > + }
> > > } else {
> > > f2fs_down_read(&fi->i_gc_rwsem[READ]);
> > > + f2fs_down_read(&fi->i_opu_rwsem);
> > > }
> > >
> > > /*
> > > @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > ret = iomap_dio_complete(dio);
> > > }
> > >
> > > + f2fs_up_read(&fi->i_opu_rwsem);
> > > f2fs_up_read(&fi->i_gc_rwsem[READ]);
> > >
> > > file_accessed(file);
> > > @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > if (f2fs_should_use_dio(inode, iocb, to)) {
> > > ret = f2fs_dio_read_iter(iocb, to);
> > > } else {
> > > + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
> > > +
> > > + if (do_opu)
> > > + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
> > > ret = filemap_read(iocb, to, 0);
> > > + if (do_opu)
> > > + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
> > > if (ret > 0)
> > > f2fs_update_iostat(F2FS_I_SB(inode), inode,
> > > APP_BUFFERED_READ_IO, ret);
> > > @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
> > > ret = -EAGAIN;
> > > goto out;
> > > }
> > > + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
> > > + f2fs_up_read(&fi->i_gc_rwsem[READ]);
> > > + f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
> > > + ret = -EAGAIN;
> > > + goto out;
> > > + }
> > > } else {
> > > ret = f2fs_convert_inline_inode(inode);
> > > if (ret)
> > > goto out;
> > >
> > > f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
> > > - if (do_opu)
> > > + if (do_opu) {
> > > f2fs_down_read(&fi->i_gc_rwsem[READ]);
> > > + f2fs_down_write(&fi->i_opu_rwsem);
> > > + }
> > > }
> > >
> > > /*
> > > @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
> > > ret = iomap_dio_complete(dio);
> > > }
> > >
> > > - if (do_opu)
> > > + if (do_opu) {
> > > + f2fs_up_write(&fi->i_opu_rwsem);
> > > f2fs_up_read(&fi->i_gc_rwsem[READ]);
> > > + }
> > > f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
> > >
> > > if (ret < 0)
> > > diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> > > index daf2c4dbe150..b4ed3b094366 100644
> > > --- a/fs/f2fs/super.c
> > > +++ b/fs/f2fs/super.c
> > > @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
> > > init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
> > > init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
> > > init_f2fs_rwsem(&fi->i_xattr_sem);
> > > + init_f2fs_rwsem(&fi->i_opu_rwsem);
> > >
> > > /* Will be used by directory only */
> > > fi->i_dir_level = F2FS_SB(sb)->dir_level;
> > > --
> > > 2.40.1

2024-05-15 06:38:50

by Chao Yu

[permalink] [raw]
Subject: Re: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

On 2024/5/15 12:42, Jaegeuk Kim wrote:
> On 05/15, Chao Yu wrote:
>> On 2024/5/15 0:09, Jaegeuk Kim wrote:
>>> On 05/10, Chao Yu wrote:
>>>> If lfs mode is on, buffered read may race w/ OPU dio write as below,
>>>> it may cause buffered read hits unwritten data unexpectly, and for
>>>> dio read, the race condition exists as well.
>>>>
>>>> Thread A Thread B
>>>> - f2fs_file_write_iter
>>>> - f2fs_dio_write_iter
>>>> - __iomap_dio_rw
>>>> - f2fs_iomap_begin
>>>> - f2fs_map_blocks
>>>> - __allocate_data_block
>>>> - allocated blkaddr #x
>>>> - iomap_dio_submit_bio
>>>> - f2fs_file_read_iter
>>>> - filemap_read
>>>> - f2fs_read_data_folio
>>>> - f2fs_mpage_readpages
>>>> - f2fs_map_blocks
>>>> : get blkaddr #x
>>>> - f2fs_submit_read_bio
>>>> IRQ
>>>> - f2fs_read_end_io
>>>> : read IO on blkaddr #x complete
>>>> IRQ
>>>> - iomap_dio_bio_end_io
>>>> : direct write IO on blkaddr #x complete
>>>>
>>>> This patch introduces a new per-inode i_opu_rwsem lock to avoid
>>>> such race condition.
>>>
>>> Wasn't this supposed to be managed by user-land?
>>
>> Actually, the test case is:
>>
>> 1. mount w/ lfs mode
>> 2. touch file;
>> 3. initialize file w/ 4k zeroed data; fsync;
>> 4. continue triggering dio write 4k zeroed data to file;
>> 5. and meanwhile, continue triggering buf/dio 4k read in file,
>> use md5sum to verify the 4k data;
>>
>> It expects data is all zero, however it turned out it's not.
>
> Can we check outstanding write bios instead of abusing locks?

I didn't figure out a way to solve this w/o lock, due to:
- write bios can be issued after outstanding write bios check condition,
result in the race.
- once read() detects that there are outstanding write bios, we need to
delay read flow rather than fail it, right? It looks using a lock is more
proper here?

Any suggestion?

Thanks,

>
>>
>> Thanks,
>>
>>>
>>>>
>>>> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
>>>> Signed-off-by: Chao Yu <[email protected]>
>>>> ---
>>>> v2:
>>>> - fix to cover dio read path w/ i_opu_rwsem as well.
>>>> fs/f2fs/f2fs.h | 1 +
>>>> fs/f2fs/file.c | 28 ++++++++++++++++++++++++++--
>>>> fs/f2fs/super.c | 1 +
>>>> 3 files changed, 28 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>>> index 30058e16a5d0..91cf4b3d6bc6 100644
>>>> --- a/fs/f2fs/f2fs.h
>>>> +++ b/fs/f2fs/f2fs.h
>>>> @@ -847,6 +847,7 @@ struct f2fs_inode_info {
>>>> /* avoid racing between foreground op and gc */
>>>> struct f2fs_rwsem i_gc_rwsem[2];
>>>> struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
>>>> + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */
>>>>
>>>> int i_extra_isize; /* size of extra space located in i_addr */
>>>> kprojid_t i_projid; /* id for project quota */
>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>> index 72ce1a522fb2..4ec260af321f 100644
>>>> --- a/fs/f2fs/file.c
>>>> +++ b/fs/f2fs/file.c
>>>> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>> const loff_t pos = iocb->ki_pos;
>>>> const size_t count = iov_iter_count(to);
>>>> struct iomap_dio *dio;
>>>> + bool do_opu = f2fs_lfs_mode(sbi);
>>>> ssize_t ret;
>>>>
>>>> if (count == 0)
>>>> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>> ret = -EAGAIN;
>>>> goto out;
>>>> }
>>>> + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
>>>> + f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>> + ret = -EAGAIN;
>>>> + goto out;
>>>> + }
>>>> } else {
>>>> f2fs_down_read(&fi->i_gc_rwsem[READ]);
>>>> + f2fs_down_read(&fi->i_opu_rwsem);
>>>> }
>>>>
>>>> /*
>>>> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>> ret = iomap_dio_complete(dio);
>>>> }
>>>>
>>>> + f2fs_up_read(&fi->i_opu_rwsem);
>>>> f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>
>>>> file_accessed(file);
>>>> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>> if (f2fs_should_use_dio(inode, iocb, to)) {
>>>> ret = f2fs_dio_read_iter(iocb, to);
>>>> } else {
>>>> + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
>>>> +
>>>> + if (do_opu)
>>>> + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
>>>> ret = filemap_read(iocb, to, 0);
>>>> + if (do_opu)
>>>> + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
>>>> if (ret > 0)
>>>> f2fs_update_iostat(F2FS_I_SB(inode), inode,
>>>> APP_BUFFERED_READ_IO, ret);
>>>> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>>> ret = -EAGAIN;
>>>> goto out;
>>>> }
>>>> + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
>>>> + f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>> + f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>>> + ret = -EAGAIN;
>>>> + goto out;
>>>> + }
>>>> } else {
>>>> ret = f2fs_convert_inline_inode(inode);
>>>> if (ret)
>>>> goto out;
>>>>
>>>> f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
>>>> - if (do_opu)
>>>> + if (do_opu) {
>>>> f2fs_down_read(&fi->i_gc_rwsem[READ]);
>>>> + f2fs_down_write(&fi->i_opu_rwsem);
>>>> + }
>>>> }
>>>>
>>>> /*
>>>> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>>> ret = iomap_dio_complete(dio);
>>>> }
>>>>
>>>> - if (do_opu)
>>>> + if (do_opu) {
>>>> + f2fs_up_write(&fi->i_opu_rwsem);
>>>> f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>> + }
>>>> f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>>>
>>>> if (ret < 0)
>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>>> index daf2c4dbe150..b4ed3b094366 100644
>>>> --- a/fs/f2fs/super.c
>>>> +++ b/fs/f2fs/super.c
>>>> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
>>>> init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
>>>> init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
>>>> init_f2fs_rwsem(&fi->i_xattr_sem);
>>>> + init_f2fs_rwsem(&fi->i_opu_rwsem);
>>>>
>>>> /* Will be used by directory only */
>>>> fi->i_dir_level = F2FS_SB(sb)->dir_level;
>>>> --
>>>> 2.40.1

2024-05-15 08:19:10

by Wu Bo

[permalink] [raw]
Subject: Re: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

On Fri, May 10, 2024 at 10:39:06AM +0800, Chao Yu wrote:
> If lfs mode is on, buffered read may race w/ OPU dio write as below,
> it may cause buffered read hits unwritten data unexpectly, and for
> dio read, the race condition exists as well.
>
> Thread A Thread B
> - f2fs_file_write_iter
> - f2fs_dio_write_iter
> - __iomap_dio_rw
> - f2fs_iomap_begin
> - f2fs_map_blocks
> - __allocate_data_block
> - allocated blkaddr #x
> - iomap_dio_submit_bio
> - f2fs_file_read_iter
> - filemap_read
> - f2fs_read_data_folio
> - f2fs_mpage_readpages
> - f2fs_map_blocks
> : get blkaddr #x
> - f2fs_submit_read_bio
> IRQ
> - f2fs_read_end_io
> : read IO on blkaddr #x complete
> IRQ
> - iomap_dio_bio_end_io
> : direct write IO on blkaddr #x complete
>
Looks like every COW filesystem would meet this situation. What's the solution
of other FS?
> This patch introduces a new per-inode i_opu_rwsem lock to avoid
> such race condition.
>

2024-05-15 08:41:29

by Markus Elfring

[permalink] [raw]
Subject: Re: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write


> This patch introduces a new …

Please choose a corresponding imperative wording for an improved change description.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.9#n94

Regards,
Markus

2024-05-17 08:15:34

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write



Hello,

kernel test robot noticed "WARNING:at_kernel/locking/rwsem.c:#down_read" on:

commit: abf7df61e5c60fed520a09102d576fd41ed4d5ee ("[PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write")
url: https://github.com/intel-lab-lkp/linux/commits/Chao-Yu/f2fs-fix-to-avoid-racing-in-between-read-and-OPU-dio-write/20240510-104020
base: https://git.kernel.org/cgit/linux/kernel/git/jaegeuk/f2fs.git dev
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

in testcase: xfstests
version: xfstests-x86_64-0e5c12df-1_20240511
with following parameters:

disk: 4HDD
fs: f2fs
test: generic-617



compiler: gcc-13
test machine: 4 threads Intel(R) Core(TM) i5-6500 CPU @ 3.20GHz (Skylake) with 32G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-lkp/[email protected]


[ 160.985543][ T2255] ------------[ cut here ]------------
[ 160.990864][ T2255] WARNING: CPU: 3 PID: 2255 at kernel/locking/rwsem.c:245 down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1))
[ 160.999715][ T2255] Modules linked in: f2fs crc32_generic intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal btrfs intel_powerclamp blake2b_generic coretemp xor zstd_compress kvm_intel raid6_pq libcrc32c i915 kvm sd_mod t10_pi crc64_rocksoft_generic crct10dif_pclmul crc32_pclmul crc64_rocksoft crc64 crc32c_intel ghash_clmulni_intel sha512_ssse3 sg drm_buddy rapl ipmi_devintf intel_cstate intel_gtt ipmi_msghandler mei_wdt wmi_bmof intel_uncore i2c_i801 drm_display_helper i2c_smbus ahci ttm drm_kms_helper libahci video libata mei_me mei acpi_pad intel_pch_thermal wmi binfmt_misc loop fuse drm dm_mod ip_tables
[ 161.053654][ T2255] CPU: 3 PID: 2255 Comm: fsx Tainted: G I 6.9.0-rc1-00036-gabf7df61e5c6 #1
[ 161.063438][ T2255] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.1.1 10/07/2015
[ 161.071504][ T2255] RIP: 0010:down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1))
[ 161.076376][ T2255] Code: b8 00 00 00 00 00 fc ff df 83 e3 02 48 c1 ea 03 4c 09 fb 48 83 cb 01 80 3c 02 00 0f 85 9a 00 00 00 48 89 5d 08 e9 50 ff ff ff <0f> 0b 4c 8d 7d 08 be 08 00 00 00 4c 89 ff e8 c1 59 e9 fd 4c 89 f8
All code
========
0: b8 00 00 00 00 mov $0x0,%eax
5: 00 fc add %bh,%ah
7: ff (bad)
8: df 83 e3 02 48 c1 filds -0x3eb7fd1d(%rbx)
e: ea (bad)
f: 03 4c 09 fb add -0x5(%rcx,%rcx,1),%ecx
13: 48 83 cb 01 or $0x1,%rbx
17: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1)
1b: 0f 85 9a 00 00 00 jne 0xbb
21: 48 89 5d 08 mov %rbx,0x8(%rbp)
25: e9 50 ff ff ff jmpq 0xffffffffffffff7a
2a:* 0f 0b ud2 <-- trapping instruction
2c: 4c 8d 7d 08 lea 0x8(%rbp),%r15
30: be 08 00 00 00 mov $0x8,%esi
35: 4c 89 ff mov %r15,%rdi
38: e8 c1 59 e9 fd callq 0xfffffffffde959fe
3d: 4c 89 f8 mov %r15,%rax

Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: 4c 8d 7d 08 lea 0x8(%rbp),%r15
6: be 08 00 00 00 mov $0x8,%esi
b: 4c 89 ff mov %r15,%rdi
e: e8 c1 59 e9 fd callq 0xfffffffffde959d4
13: 4c 89 f8 mov %r15,%rax
[ 161.095753][ T2255] RSP: 0018:ffffc90002c0f8b8 EFLAGS: 00010286
[ 161.101662][ T2255] RAX: fffffffffffffe00 RBX: fffffffffffffe00 RCX: ffffffff83bdf543
[ 161.109469][ T2255] RDX: ffffed10236632f0 RSI: 0000000000000008 RDI: ffff88811b319778
[ 161.117276][ T2255] RBP: ffff88811b319778 R08: 0000000000000001 R09: ffffed10236632ef
[ 161.125080][ T2255] R10: ffff88811b31977f R11: ffffffff85fe96a2 R12: 1ffff92000581f18
[ 161.132885][ T2255] R13: dffffc0000000000 R14: ffffc90002c0fa70 R15: 0000000000000000
[ 161.140691][ T2255] FS: 00007f623f19d740(0000) GS:ffff8887e9380000(0000) knlGS:0000000000000000
[ 161.149446][ T2255] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 161.155871][ T2255] CR2: 00007f623f05b000 CR3: 00000001b2026006 CR4: 00000000003706f0
[ 161.163705][ T2255] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 161.171512][ T2255] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 161.179316][ T2255] Call Trace:
[ 161.182456][ T2255] <TASK>
[ 161.185254][ T2255] ? __warn (kernel/panic.c:694)
[ 161.189181][ T2255] ? down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1))
[ 161.193456][ T2255] ? report_bug (lib/bug.c:180 lib/bug.c:219)
[ 161.197814][ T2255] ? handle_bug (arch/x86/kernel/traps.c:239 (discriminator 1))
[ 161.202003][ T2255] ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1))
[ 161.206532][ T2255] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621)
[ 161.211414][ T2255] ? down_read (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2723 (discriminator 5) include/linux/atomic/atomic-long.h:163 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3298 (discriminator 5) kernel/locking/rwsem.c:243 (discriminator 5) kernel/locking/rwsem.c:1249 (discriminator 5) kernel/locking/rwsem.c:1263 (discriminator 5) kernel/locking/rwsem.c:1528 (discriminator 5))
[ 161.215600][ T2255] ? down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1))
[ 161.219854][ T2255] ? down_read (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2723 (discriminator 5) include/linux/atomic/atomic-long.h:163 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3298 (discriminator 5) kernel/locking/rwsem.c:243 (discriminator 5) kernel/locking/rwsem.c:1249 (discriminator 5) kernel/locking/rwsem.c:1263 (discriminator 5) kernel/locking/rwsem.c:1528 (discriminator 5))
[ 161.224034][ T2255] ? __rmqueue_pcplist (include/linux/list.h:215 (discriminator 1) include/linux/list.h:229 (discriminator 1) mm/page_alloc.c:2836 (discriminator 1))
[ 161.228904][ T2255] ? __pfx_down_read (kernel/locking/rwsem.c:1524)
[ 161.233566][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152)
[ 161.238707][ T2255] f2fs_dio_read_iter (fs/f2fs/f2fs.h:2468 (discriminator 1) fs/f2fs/file.c:4477 (discriminator 1)) f2fs
[ 161.244288][ T2255] f2fs_file_read_iter (fs/f2fs/file.c:4533) f2fs
[ 161.249949][ T2255] copy_splice_read (include/linux/fs.h:2102 fs/splice.c:365)
[ 161.254636][ T2255] ? __pfx_copy_splice_read (fs/splice.c:324)
[ 161.259843][ T2255] splice_direct_to_actor (fs/splice.c:1136)
[ 161.265045][ T2255] ? __pfx_direct_splice_actor (fs/splice.c:1159)
[ 161.270507][ T2255] ? __pfx_splice_direct_to_actor (fs/splice.c:1032)
[ 161.276229][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152)
[ 161.281359][ T2255] ? from_kgid_munged (kernel/user_namespace.c:527)
[ 161.286131][ T2255] do_splice_direct (fs/splice.c:1208 fs/splice.c:1233)
[ 161.290829][ T2255] ? __pfx_do_splice_direct (fs/splice.c:1232)
[ 161.296047][ T2255] ? __pfx_direct_file_splice_eof (fs/splice.c:1178)
[ 161.301779][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152)
[ 161.306894][ T2255] ? rw_verify_area (fs/read_write.c:377)
[ 161.311507][ T2255] vfs_copy_file_range (fs/read_write.c:1558)
[ 161.316552][ T2255] ? f2fs_getattr (arch/x86/include/asm/bitops.h:206 (discriminator 1) arch/x86/include/asm/bitops.h:238 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 1) fs/f2fs/f2fs.h:3056 (discriminator 1) fs/f2fs/f2fs.h:3311 (discriminator 1) fs/f2fs/file.c:904 (discriminator 1)) f2fs
[ 161.321759][ T2255] ? __pfx_vfs_copy_file_range (fs/read_write.c:1486)
[ 161.327234][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152)
[ 161.332362][ T2255] __do_sys_copy_file_range (fs/read_write.c:1612)
[ 161.337755][ T2255] ? __pfx___do_sys_copy_file_range (fs/read_write.c:1578)
[ 161.343661][ T2255] ? f2fs_llseek (arch/x86/include/asm/bitops.h:206 (discriminator 1) arch/x86/include/asm/bitops.h:238 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 1) fs/f2fs/f2fs.h:3056 (discriminator 1) fs/f2fs/f2fs.h:3210 (discriminator 1) fs/f2fs/file.c:518 (discriminator 1)) f2fs
[ 161.348813][ T2255] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83)
[ 161.353151][ T2255] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
[ 161.358886][ T2255] RIP: 0033:0x7f623f2a1719
[ 161.363137][ T2255] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b7 06 0d 00 f7 d8 64 89 01 48
All code
========
0: 08 89 e8 5b 5d c3 or %cl,-0x3ca2a418(%rcx)
6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
d: 00 00 00
10: 90 nop
11: 48 89 f8 mov %rdi,%rax
14: 48 89 f7 mov %rsi,%rdi
17: 48 89 d6 mov %rdx,%rsi
1a: 48 89 ca mov %rcx,%rdx
1d: 4d 89 c2 mov %r8,%r10
20: 4d 89 c8 mov %r9,%r8
23: 4c 8b 4c 24 08 mov 0x8(%rsp),%r9
28: 0f 05 syscall
2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction
30: 73 01 jae 0x33
32: c3 retq
33: 48 8b 0d b7 06 0d 00 mov 0xd06b7(%rip),%rcx # 0xd06f1
3a: f7 d8 neg %eax
3c: 64 89 01 mov %eax,%fs:(%rcx)
3f: 48 rex.W

Code starting with the faulting instruction
===========================================
0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax
6: 73 01 jae 0x9
8: c3 retq
9: 48 8b 0d b7 06 0d 00 mov 0xd06b7(%rip),%rcx # 0xd06c7
10: f7 d8 neg %eax
12: 64 89 01 mov %eax,%fs:(%rcx)
15: 48 rex.W


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240517/[email protected]



--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


2024-06-06 10:26:08

by Chao Yu

[permalink] [raw]
Subject: Re: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

On 2024/5/15 16:32, Wu Bo wrote:
> On Fri, May 10, 2024 at 10:39:06AM +0800, Chao Yu wrote:
>> If lfs mode is on, buffered read may race w/ OPU dio write as below,
>> it may cause buffered read hits unwritten data unexpectly, and for
>> dio read, the race condition exists as well.
>>
>> Thread A Thread B
>> - f2fs_file_write_iter
>> - f2fs_dio_write_iter
>> - __iomap_dio_rw
>> - f2fs_iomap_begin
>> - f2fs_map_blocks
>> - __allocate_data_block
>> - allocated blkaddr #x
>> - iomap_dio_submit_bio
>> - f2fs_file_read_iter
>> - filemap_read
>> - f2fs_read_data_folio
>> - f2fs_mpage_readpages
>> - f2fs_map_blocks
>> : get blkaddr #x
>> - f2fs_submit_read_bio
>> IRQ
>> - f2fs_read_end_io
>> : read IO on blkaddr #x complete
>> IRQ
>> - iomap_dio_bio_end_io
>> : direct write IO on blkaddr #x complete
>>
> Looks like every COW filesystem would meet this situation. What's the solution
> of other FS?

I missed to reply this...

Other cow filesystem like btrfs, it will update metadata after data IO completion,
so it is safe.

Thanks,

>> This patch introduces a new per-inode i_opu_rwsem lock to avoid
>> such race condition.
>>

2024-06-06 10:31:39

by Chao Yu

[permalink] [raw]
Subject: Re: [f2fs-dev] [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

On 2024/5/15 14:38, Chao Yu wrote:
> On 2024/5/15 12:42, Jaegeuk Kim wrote:
>> On 05/15, Chao Yu wrote:
>>> On 2024/5/15 0:09, Jaegeuk Kim wrote:
>>>> On 05/10, Chao Yu wrote:
>>>>> If lfs mode is on, buffered read may race w/ OPU dio write as below,
>>>>> it may cause buffered read hits unwritten data unexpectly, and for
>>>>> dio read, the race condition exists as well.
>>>>>
>>>>> Thread A                      Thread B
>>>>> - f2fs_file_write_iter
>>>>>    - f2fs_dio_write_iter
>>>>>     - __iomap_dio_rw
>>>>>      - f2fs_iomap_begin
>>>>>       - f2fs_map_blocks
>>>>>        - __allocate_data_block
>>>>>         - allocated blkaddr #x
>>>>>          - iomap_dio_submit_bio
>>>>>                                 - f2fs_file_read_iter
>>>>>                                  - filemap_read
>>>>>                                   - f2fs_read_data_folio
>>>>>                                    - f2fs_mpage_readpages
>>>>>                                     - f2fs_map_blocks
>>>>>                                      : get blkaddr #x
>>>>>                                     - f2fs_submit_read_bio
>>>>>                                 IRQ
>>>>>                                 - f2fs_read_end_io
>>>>>                                  : read IO on blkaddr #x complete
>>>>> IRQ
>>>>> - iomap_dio_bio_end_io
>>>>>    : direct write IO on blkaddr #x complete
>>>>>
>>>>> This patch introduces a new per-inode i_opu_rwsem lock to avoid
>>>>> such race condition.
>>>>
>>>> Wasn't this supposed to be managed by user-land?
>>>
>>> Actually, the test case is:
>>>
>>> 1. mount w/ lfs mode
>>> 2. touch file;
>>> 3. initialize file w/ 4k zeroed data; fsync;
>>> 4. continue triggering dio write 4k zeroed data to file;
>>> 5. and meanwhile, continue triggering buf/dio 4k read in file,
>>> use md5sum to verify the 4k data;
>>>
>>> It expects data is all zero, however it turned out it's not.
>>
>> Can we check outstanding write bios instead of abusing locks?

Jaegeuk, seems it can solve partial race cases, not all of them.

Do you suggest to use this compromised solution?

Thanks,

>
> I didn't figure out a way to solve this w/o lock, due to:
> - write bios can be issued after outstanding write bios check condition,
> result in the race.
> - once read() detects that there are outstanding write bios, we need to
> delay read flow rather than fail it, right? It looks using a lock is more
> proper here?
>
> Any suggestion?
>
> Thanks,
>
>>
>>>
>>> Thanks,
>>>
>>>>
>>>>>
>>>>> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
>>>>> Signed-off-by: Chao Yu <[email protected]>
>>>>> ---
>>>>> v2:
>>>>> - fix to cover dio read path w/ i_opu_rwsem as well.
>>>>>    fs/f2fs/f2fs.h  |  1 +
>>>>>    fs/f2fs/file.c  | 28 ++++++++++++++++++++++++++--
>>>>>    fs/f2fs/super.c |  1 +
>>>>>    3 files changed, 28 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>>>> index 30058e16a5d0..91cf4b3d6bc6 100644
>>>>> --- a/fs/f2fs/f2fs.h
>>>>> +++ b/fs/f2fs/f2fs.h
>>>>> @@ -847,6 +847,7 @@ struct f2fs_inode_info {
>>>>>         /* avoid racing between foreground op and gc */
>>>>>         struct f2fs_rwsem i_gc_rwsem[2];
>>>>>         struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
>>>>> +     struct f2fs_rwsem i_opu_rwsem;  /* avoid racing between buf read and opu dio write */
>>>>>
>>>>>         int i_extra_isize;              /* size of extra space located in i_addr */
>>>>>         kprojid_t i_projid;             /* id for project quota */
>>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>>> index 72ce1a522fb2..4ec260af321f 100644
>>>>> --- a/fs/f2fs/file.c
>>>>> +++ b/fs/f2fs/file.c
>>>>> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>         const loff_t pos = iocb->ki_pos;
>>>>>         const size_t count = iov_iter_count(to);
>>>>>         struct iomap_dio *dio;
>>>>> +     bool do_opu = f2fs_lfs_mode(sbi);
>>>>>         ssize_t ret;
>>>>>
>>>>>         if (count == 0)
>>>>> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>                         ret = -EAGAIN;
>>>>>                         goto out;
>>>>>                 }
>>>>> +             if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
>>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>> +                     ret = -EAGAIN;
>>>>> +                     goto out;
>>>>> +             }
>>>>>         } else {
>>>>>                 f2fs_down_read(&fi->i_gc_rwsem[READ]);
>>>>> +             f2fs_down_read(&fi->i_opu_rwsem);
>>>>>         }
>>>>>
>>>>>         /*
>>>>> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>                 ret = iomap_dio_complete(dio);
>>>>>         }
>>>>>
>>>>> +     f2fs_up_read(&fi->i_opu_rwsem);
>>>>>         f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>>
>>>>>         file_accessed(file);
>>>>> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>         if (f2fs_should_use_dio(inode, iocb, to)) {
>>>>>                 ret = f2fs_dio_read_iter(iocb, to);
>>>>>         } else {
>>>>> +             bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
>>>>> +
>>>>> +             if (do_opu)
>>>>> +                     f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
>>>>>                 ret = filemap_read(iocb, to, 0);
>>>>> +             if (do_opu)
>>>>> +                     f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
>>>>>                 if (ret > 0)
>>>>>                         f2fs_update_iostat(F2FS_I_SB(inode), inode,
>>>>>                                                 APP_BUFFERED_READ_IO, ret);
>>>>> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>>>>                         ret = -EAGAIN;
>>>>>                         goto out;
>>>>>                 }
>>>>> +             if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
>>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>>>> +                     ret = -EAGAIN;
>>>>> +                     goto out;
>>>>> +             }
>>>>>         } else {
>>>>>                 ret = f2fs_convert_inline_inode(inode);
>>>>>                 if (ret)
>>>>>                         goto out;
>>>>>
>>>>>                 f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
>>>>> -             if (do_opu)
>>>>> +             if (do_opu) {
>>>>>                         f2fs_down_read(&fi->i_gc_rwsem[READ]);
>>>>> +                     f2fs_down_write(&fi->i_opu_rwsem);
>>>>> +             }
>>>>>         }
>>>>>
>>>>>         /*
>>>>> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>>>>                 ret = iomap_dio_complete(dio);
>>>>>         }
>>>>>
>>>>> -     if (do_opu)
>>>>> +     if (do_opu) {
>>>>> +             f2fs_up_write(&fi->i_opu_rwsem);
>>>>>                 f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>> +     }
>>>>>         f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>>>>
>>>>>         if (ret < 0)
>>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>>>> index daf2c4dbe150..b4ed3b094366 100644
>>>>> --- a/fs/f2fs/super.c
>>>>> +++ b/fs/f2fs/super.c
>>>>> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
>>>>>         init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
>>>>>         init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
>>>>>         init_f2fs_rwsem(&fi->i_xattr_sem);
>>>>> +     init_f2fs_rwsem(&fi->i_opu_rwsem);
>>>>>
>>>>>         /* Will be used by directory only */
>>>>>         fi->i_dir_level = F2FS_SB(sb)->dir_level;
>>>>> --
>>>>> 2.40.1
>
>
> _______________________________________________
> Linux-f2fs-devel mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel