2022-03-04 03:52:38

by Jaegeuk Kim

[permalink] [raw]
Subject: [PATCH 1/2] f2fs: avoid an infinite loop in f2fs_sync_dirty_inodes

If one read IO is always failing, we can fall into an infinite loop in
f2fs_sync_dirty_inodes. This happens during xfstests/generic/475.

[ 142.803335] Buffer I/O error on dev dm-1, logical block 8388592, async page read
...
[ 382.887210] submit_bio_noacct+0xdd/0x2a0
[ 382.887213] submit_bio+0x80/0x110
[ 382.887223] __submit_bio+0x4d/0x300 [f2fs]
[ 382.887282] f2fs_submit_page_bio+0x125/0x200 [f2fs]
[ 382.887299] __get_meta_page+0xc9/0x280 [f2fs]
[ 382.887315] f2fs_get_meta_page+0x13/0x20 [f2fs]
[ 382.887331] f2fs_get_node_info+0x317/0x3c0 [f2fs]
[ 382.887350] f2fs_do_write_data_page+0x327/0x6f0 [f2fs]
[ 382.887367] f2fs_write_single_data_page+0x5b7/0x960 [f2fs]
[ 382.887386] f2fs_write_cache_pages+0x302/0x890 [f2fs]
[ 382.887405] ? preempt_count_add+0x7a/0xc0
[ 382.887408] f2fs_write_data_pages+0xfd/0x320 [f2fs]
[ 382.887425] ? _raw_spin_unlock+0x1a/0x30
[ 382.887428] do_writepages+0xd3/0x1d0
[ 382.887432] filemap_fdatawrite_wbc+0x69/0x90
[ 382.887434] filemap_fdatawrite+0x50/0x70
[ 382.887437] f2fs_sync_dirty_inodes+0xa4/0x270 [f2fs]
[ 382.887453] f2fs_write_checkpoint+0x189/0x1640 [f2fs]
[ 382.887469] ? schedule_timeout+0x114/0x150
[ 382.887471] ? ttwu_do_activate+0x6d/0xb0
[ 382.887473] ? preempt_count_add+0x7a/0xc0
[ 382.887476] kill_f2fs_super+0xca/0x100 [f2fs]
[ 382.887491] deactivate_locked_super+0x35/0xa0
[ 382.887494] deactivate_super+0x40/0x50
[ 382.887497] cleanup_mnt+0x139/0x190
[ 382.887499] __cleanup_mnt+0x12/0x20
[ 382.887501] task_work_run+0x64/0xa0
[ 382.887505] exit_to_user_mode_prepare+0x1b7/0x1c0
[ 382.887508] syscall_exit_to_user_mode+0x27/0x50
[ 382.887510] do_syscall_64+0x48/0xc0
[ 382.887513] entry_SYSCALL_64_after_hwframe+0x44/0xae

Signed-off-by: Jaegeuk Kim <[email protected]>
---
fs/f2fs/checkpoint.c | 7 +++++++
fs/f2fs/f2fs.h | 5 +++++
2 files changed, 12 insertions(+)

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 203a1577942d..871eee35a32f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,6 +98,13 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
}

if (unlikely(!PageUptodate(page))) {
+ if (page->index == sbi->metapage_eio_ofs &&
+ sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) {
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ } else {
+ sbi->metapage_eio_ofs = page->index;
+ sbi->metapage_eio_cnt = 0;
+ }
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 47bf9e30913f..efc4f1fe2ffd 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -577,6 +577,9 @@ enum {
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8

+/* maximum retry of EIO'ed meta page */
+#define MAX_RETRY_META_PAGE_EIO 100
+
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */

#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
@@ -1614,6 +1617,8 @@ struct f2fs_sb_info {
/* keep migration IO order for LFS mode */
struct f2fs_rwsem io_order_lock;
mempool_t *write_io_dummy; /* Dummy pages */
+ pgoff_t metapage_eio_ofs; /* EIO page offset */
+ int metapage_eio_cnt; /* EIO count */

/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
--
2.35.1.616.g0bdcbb4464-goog


2022-03-04 11:07:28

by Jaegeuk Kim

[permalink] [raw]
Subject: [PATCH 2/2] f2fs: introduce F2FS_UNFAIR_RWSEM to support unfair rwsem

Unfair rwsem should be used when blk-cg is on. Otherwise, there is a regression.

FYI, we noticed a -26.7% regression of aim7.jobs-per-min due to commit:

commit: e4544b63a7ee49e7fbebf35ece0a6acd3b9617ae ("f2fs: move f2fs to use reader-unfair rwsems")
https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master

in testcase: aim7
on test machine: 88 threads 2 sockets Intel(R) Xeon(R) Gold 6238M CPU @ 2.10GHz with 128G memory
with following parameters:

disk: 4BRD_12G
md: RAID0
fs: f2fs
test: sync_disk_rw
load: 100
cpufreq_governor: performance
ucode: 0x500320a

test-description: AIM7 is a traditional UNIX system-level benchmark suite which is used to test and measure the performance of multiuser systems.
test-url: https://sourceforge.net/projects/aimbench/files/aim-suite7/

Reported-by: kernel test robot <[email protected]>
Signed-off-by: Jaegeuk Kim <[email protected]>
---
fs/f2fs/Kconfig | 7 +++++++
fs/f2fs/f2fs.h | 10 ++++++++++
2 files changed, 17 insertions(+)

diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index f46a7339d6cf..03ef087537c7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -143,3 +143,10 @@ config F2FS_IOSTAT
Support getting IO statistics through sysfs and printing out periodic
IO statistics tracepoint events. You have to turn on "iostat_enable"
sysfs node to enable this feature.
+
+config F2FS_UNFAIR_RWSEM
+ bool "F2FS unfair rw_semaphore"
+ depends on F2FS_FS && BLK_CGROUP
+ help
+ Use unfair rw_semaphore, if system configured IO priority by block
+ cgroup.
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index efc4f1fe2ffd..68d791ec8b27 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -132,7 +132,9 @@ typedef u32 nid_t;

struct f2fs_rwsem {
struct rw_semaphore internal_rwsem;
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
wait_queue_head_t read_waiters;
+#endif
};

struct f2fs_mount_info {
@@ -2131,7 +2133,9 @@ static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
const char *sem_name, struct lock_class_key *key)
{
__init_rwsem(&sem->internal_rwsem, sem_name, key);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
init_waitqueue_head(&sem->read_waiters);
+#endif
}

static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem)
@@ -2146,7 +2150,11 @@ static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem)

static inline void f2fs_down_read(struct f2fs_rwsem *sem)
{
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem));
+#else
+ down_read(&sem->internal_rwsem);
+#endif
}

static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem)
@@ -2181,7 +2189,9 @@ static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem)
static inline void f2fs_up_write(struct f2fs_rwsem *sem)
{
up_write(&sem->internal_rwsem);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
wake_up_all(&sem->read_waiters);
+#endif
}

static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
--
2.35.1.616.g0bdcbb4464-goog

2022-03-04 19:51:40

by Chao Yu

[permalink] [raw]
Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: introduce F2FS_UNFAIR_RWSEM to support unfair rwsem

On 2022/3/4 10:19, Jaegeuk Kim wrote:
> Unfair rwsem should be used when blk-cg is on. Otherwise, there is regression.
>
> FYI, we noticed a -26.7% regression of aim7.jobs-per-min due to commit:
>
> commit: e4544b63a7ee49e7fbebf35ece0a6acd3b9617ae ("f2fs: move f2fs to use reader-unfair rwsems")
> https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master
>
> in testcase: aim7
> on test machine: 88 threads 2 sockets Intel(R) Xeon(R) Gold 6238M CPU @ 2.10GHz with 128G memory
> with following parameters:
>
> disk: 4BRD_12G
> md: RAID0
> fs: f2fs
> test: sync_disk_rw
> load: 100
> cpufreq_governor: performance
> ucode: 0x500320a
>
> test-description: AIM7 is a traditional UNIX system level benchmark suite which is used to test and measure the performance of multiuser system.
> test-url: https://apc01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fsourceforge.net%2Fprojects%2Faimbench%2Ffiles%2Faim-suite7%2F&data=04%7C01%7Cchao.yu%40oppo.com%7C22ad61cc1e204e1d63e908d9fd858d65%7Cf1905eb1c35341c5951662b4a54b5ee6%7C0%7C0%7C637819572336831789%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=SMV5sGpVhSvjC8sbrKpKGT27J35Bm1%2B3KoM4yeraceA%3D&reserved=0
>
> Reported-by: kernel test robot <[email protected]>
> Signed-off-by: Jaegeuk Kim <[email protected]>

Reviewed-by: Chao Yu <[email protected]>

Thanks,

2022-03-04 20:31:05

by Chao Yu

[permalink] [raw]
Subject: Re: [f2fs-dev] [PATCH 1/2] f2fs: avoid an infinite loop in f2fs_sync_dirty_inodes

On 2022/3/4 10:19, Jaegeuk Kim wrote:
> If one read IO is always failing, we can fall into an infinite loop in
> f2fs_sync_dirty_inodes. This happens during xfstests/generic/475.
>
> [ 142.803335] Buffer I/O error on dev dm-1, logical block 8388592, async page read
> ...
> [ 382.887210] submit_bio_noacct+0xdd/0x2a0
> [ 382.887213] submit_bio+0x80/0x110
> [ 382.887223] __submit_bio+0x4d/0x300 [f2fs]
> [ 382.887282] f2fs_submit_page_bio+0x125/0x200 [f2fs]
> [ 382.887299] __get_meta_page+0xc9/0x280 [f2fs]
> [ 382.887315] f2fs_get_meta_page+0x13/0x20 [f2fs]
> [ 382.887331] f2fs_get_node_info+0x317/0x3c0 [f2fs]
> [ 382.887350] f2fs_do_write_data_page+0x327/0x6f0 [f2fs]
> [ 382.887367] f2fs_write_single_data_page+0x5b7/0x960 [f2fs]
> [ 382.887386] f2fs_write_cache_pages+0x302/0x890 [f2fs]
> [ 382.887405] ? preempt_count_add+0x7a/0xc0
> [ 382.887408] f2fs_write_data_pages+0xfd/0x320 [f2fs]
> [ 382.887425] ? _raw_spin_unlock+0x1a/0x30
> [ 382.887428] do_writepages+0xd3/0x1d0
> [ 382.887432] filemap_fdatawrite_wbc+0x69/0x90
> [ 382.887434] filemap_fdatawrite+0x50/0x70
> [ 382.887437] f2fs_sync_dirty_inodes+0xa4/0x270 [f2fs]
> [ 382.887453] f2fs_write_checkpoint+0x189/0x1640 [f2fs]
> [ 382.887469] ? schedule_timeout+0x114/0x150
> [ 382.887471] ? ttwu_do_activate+0x6d/0xb0
> [ 382.887473] ? preempt_count_add+0x7a/0xc0
> [ 382.887476] kill_f2fs_super+0xca/0x100 [f2fs]
> [ 382.887491] deactivate_locked_super+0x35/0xa0
> [ 382.887494] deactivate_super+0x40/0x50
> [ 382.887497] cleanup_mnt+0x139/0x190
> [ 382.887499] __cleanup_mnt+0x12/0x20
> [ 382.887501] task_work_run+0x64/0xa0
> [ 382.887505] exit_to_user_mode_prepare+0x1b7/0x1c0
> [ 382.887508] syscall_exit_to_user_mode+0x27/0x50
> [ 382.887510] do_syscall_64+0x48/0xc0
> [ 382.887513] entry_SYSCALL_64_after_hwframe+0x44/0xae
>
> Signed-off-by: Jaegeuk Kim <[email protected]>

Reviewed-by: Chao Yu <[email protected]>

Thanks,