This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.
f2fs_get_node_info()
- nat_tree_lock
-> sleep in journal_rwsem
checkpoint
- waiting for nat_tree_lock
Signed-off-by: Jaegeuk Kim <[email protected]>
---
fs/f2fs/node.c | 16 ++++++++--------
fs/f2fs/segment.c | 22 +++++++++++-----------
fs/f2fs/segment.h | 2 +-
3 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..821a40e02fb4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -567,13 +567,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
/* Check current segment summary */
- down_read(&curseg->journal_rwsem);
+ read_lock(&curseg->journal_rwlock);
i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- up_read(&curseg->journal_rwsem);
+ read_unlock(&curseg->journal_rwlock);
if (i >= 0) {
up_read(&nm_i->nat_tree_lock);
goto cache;
@@ -2338,7 +2338,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi)
struct f2fs_journal *journal = curseg->journal;
int i;
- down_read(&curseg->journal_rwsem);
+ read_lock(&curseg->journal_rwlock);
for (i = 0; i < nats_in_cursum(journal); i++) {
block_t addr;
nid_t nid;
@@ -2350,7 +2350,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi)
else
remove_free_nid(sbi, nid);
}
- up_read(&curseg->journal_rwsem);
+ read_unlock(&curseg->journal_rwlock);
}
static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
@@ -2799,7 +2799,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
struct f2fs_journal *journal = curseg->journal;
int i;
- down_write(&curseg->journal_rwsem);
+ write_lock(&curseg->journal_rwlock);
for (i = 0; i < nats_in_cursum(journal); i++) {
struct nat_entry *ne;
struct f2fs_nat_entry raw_ne;
@@ -2831,7 +2831,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
__set_nat_cache_dirty(nm_i, ne);
}
update_nats_in_cursum(journal, -i);
- up_write(&curseg->journal_rwsem);
+ write_unlock(&curseg->journal_rwlock);
}
static void __adjust_nat_entry_set(struct nat_entry_set *nes,
@@ -2906,7 +2906,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
to_journal = false;
if (to_journal) {
- down_write(&curseg->journal_rwsem);
+ write_lock(&curseg->journal_rwlock);
} else {
page = get_next_nat_page(sbi, start_nid);
if (IS_ERR(page))
@@ -2946,7 +2946,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
}
if (to_journal) {
- up_write(&curseg->journal_rwsem);
+ write_unlock(&curseg->journal_rwlock);
} else {
__update_nat_bits(sbi, start_nid, page);
f2fs_put_page(page, 1);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f9b7fb785e1d..c397c1c271ec 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2442,9 +2442,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
mutex_lock(&curseg->curseg_mutex);
- down_read(&curseg->journal_rwsem);
+ read_lock(&curseg->journal_rwlock);
memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
- up_read(&curseg->journal_rwsem);
+ read_unlock(&curseg->journal_rwlock);
memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
@@ -3876,9 +3876,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
mutex_lock(&curseg->curseg_mutex);
/* update journal info */
- down_write(&curseg->journal_rwsem);
+ write_lock(&curseg->journal_rwlock);
memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
- up_write(&curseg->journal_rwsem);
+ write_unlock(&curseg->journal_rwlock);
memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
@@ -4136,7 +4136,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
struct f2fs_journal *journal = curseg->journal;
int i;
- down_write(&curseg->journal_rwsem);
+ write_lock(&curseg->journal_rwlock);
for (i = 0; i < sits_in_cursum(journal); i++) {
unsigned int segno;
bool dirtied;
@@ -4148,7 +4148,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
}
update_sits_in_cursum(journal, -i);
- up_write(&curseg->journal_rwsem);
+ write_unlock(&curseg->journal_rwlock);
}
/*
@@ -4204,7 +4204,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
to_journal = false;
if (to_journal) {
- down_write(&curseg->journal_rwsem);
+ write_lock(&curseg->journal_rwlock);
} else {
page = get_next_sit_page(sbi, start_segno);
raw_sit = page_address(page);
@@ -4251,7 +4251,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
if (to_journal)
- up_write(&curseg->journal_rwsem);
+ write_unlock(&curseg->journal_rwlock);
else
f2fs_put_page(page, 1);
@@ -4432,7 +4432,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
if (!array[i].sum_blk)
return -ENOMEM;
- init_rwsem(&array[i].journal_rwsem);
+ rwlock_init(&array[i].journal_rwlock);
array[i].journal = f2fs_kzalloc(sbi,
sizeof(struct f2fs_journal), GFP_KERNEL);
if (!array[i].journal)
@@ -4509,7 +4509,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
start_blk += readed;
} while (start_blk < sit_blk_cnt);
- down_read(&curseg->journal_rwsem);
+ read_lock(&curseg->journal_rwlock);
for (i = 0; i < sits_in_cursum(journal); i++) {
unsigned int old_valid_blocks;
@@ -4551,7 +4551,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
old_valid_blocks;
}
}
- up_read(&curseg->journal_rwsem);
+ read_unlock(&curseg->journal_rwlock);
if (!err && total_node_blocks != valid_node_count(sbi)) {
f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 050230c70a53..afb9041dc2e6 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -306,7 +306,7 @@ struct victim_selection {
struct curseg_info {
struct mutex curseg_mutex; /* lock for consistency */
struct f2fs_summary_block *sum_blk; /* cached summary block */
- struct rw_semaphore journal_rwsem; /* protect journal area */
+ rwlock_t journal_rwlock; /* protect journal area */
struct f2fs_journal *journal; /* cached journal info */
unsigned char alloc_type; /* current allocation type */
unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */
--
2.32.0.402.g57bb445576-goog
On 2021/7/22 9:41, Jaegeuk Kim wrote:
> This tries to fix priority inversion in the below condition resulting in
> long checkpoint delay.
>
> f2fs_get_node_info()
> - nat_tree_lock
> -> sleep in journal_rwsem
> checkpoint
> - waiting for nat_tree_lock
>
> Signed-off-by: Jaegeuk Kim <[email protected]>
> ---
> fs/f2fs/node.c | 16 ++++++++--------
> fs/f2fs/segment.c | 22 +++++++++++-----------
> fs/f2fs/segment.h | 2 +-
> 3 files changed, 20 insertions(+), 20 deletions(-)
>
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 0be9e2d7120e..821a40e02fb4 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -567,13 +567,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
> memset(&ne, 0, sizeof(struct f2fs_nat_entry));
>
> /* Check current segment summary */
> - down_read(&curseg->journal_rwsem);
> + read_lock(&curseg->journal_rwlock);
It seem journal_rwsem covers too many operations, e.g.
- scan_curseg_cache
- add_free_nid
- f2fs_kmem_cache_alloc
- kmem_cache_alloc(__GFP_NOFAIL)
Thanks,
> i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
> if (i >= 0) {
> ne = nat_in_journal(journal, i);
> node_info_from_raw_nat(ni, &ne);
> }
> - up_read(&curseg->journal_rwsem);
> + read_unlock(&curseg->journal_rwlock);
> if (i >= 0) {
> up_read(&nm_i->nat_tree_lock);
> goto cache;
> @@ -2338,7 +2338,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi)
> struct f2fs_journal *journal = curseg->journal;
> int i;
>
> - down_read(&curseg->journal_rwsem);
> + read_lock(&curseg->journal_rwlock);
> for (i = 0; i < nats_in_cursum(journal); i++) {
> block_t addr;
> nid_t nid;
> @@ -2350,7 +2350,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi)
> else
> remove_free_nid(sbi, nid);
> }
> - up_read(&curseg->journal_rwsem);
> + read_unlock(&curseg->journal_rwlock);
> }
>
> static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
> @@ -2799,7 +2799,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> struct f2fs_journal *journal = curseg->journal;
> int i;
>
> - down_write(&curseg->journal_rwsem);
> + write_lock(&curseg->journal_rwlock);
> for (i = 0; i < nats_in_cursum(journal); i++) {
> struct nat_entry *ne;
> struct f2fs_nat_entry raw_ne;
> @@ -2831,7 +2831,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> __set_nat_cache_dirty(nm_i, ne);
> }
> update_nats_in_cursum(journal, -i);
> - up_write(&curseg->journal_rwsem);
> + write_unlock(&curseg->journal_rwlock);
> }
>
> static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> @@ -2906,7 +2906,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> to_journal = false;
>
> if (to_journal) {
> - down_write(&curseg->journal_rwsem);
> + write_lock(&curseg->journal_rwlock);
> } else {
> page = get_next_nat_page(sbi, start_nid);
> if (IS_ERR(page))
> @@ -2946,7 +2946,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> }
>
> if (to_journal) {
> - up_write(&curseg->journal_rwsem);
> + write_unlock(&curseg->journal_rwlock);
> } else {
> __update_nat_bits(sbi, start_nid, page);
> f2fs_put_page(page, 1);
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index f9b7fb785e1d..c397c1c271ec 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -2442,9 +2442,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
>
> mutex_lock(&curseg->curseg_mutex);
>
> - down_read(&curseg->journal_rwsem);
> + read_lock(&curseg->journal_rwlock);
> memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
> - up_read(&curseg->journal_rwsem);
> + read_unlock(&curseg->journal_rwlock);
>
> memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
> memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
> @@ -3876,9 +3876,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
> mutex_lock(&curseg->curseg_mutex);
>
> /* update journal info */
> - down_write(&curseg->journal_rwsem);
> + write_lock(&curseg->journal_rwlock);
> memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
> - up_write(&curseg->journal_rwsem);
> + write_unlock(&curseg->journal_rwlock);
>
> memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
> memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
> @@ -4136,7 +4136,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
> struct f2fs_journal *journal = curseg->journal;
> int i;
>
> - down_write(&curseg->journal_rwsem);
> + write_lock(&curseg->journal_rwlock);
> for (i = 0; i < sits_in_cursum(journal); i++) {
> unsigned int segno;
> bool dirtied;
> @@ -4148,7 +4148,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
> add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
> }
> update_sits_in_cursum(journal, -i);
> - up_write(&curseg->journal_rwsem);
> + write_unlock(&curseg->journal_rwlock);
> }
>
> /*
> @@ -4204,7 +4204,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> to_journal = false;
>
> if (to_journal) {
> - down_write(&curseg->journal_rwsem);
> + write_lock(&curseg->journal_rwlock);
> } else {
> page = get_next_sit_page(sbi, start_segno);
> raw_sit = page_address(page);
> @@ -4251,7 +4251,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> }
>
> if (to_journal)
> - up_write(&curseg->journal_rwsem);
> + write_unlock(&curseg->journal_rwlock);
> else
> f2fs_put_page(page, 1);
>
> @@ -4432,7 +4432,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
> array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
> if (!array[i].sum_blk)
> return -ENOMEM;
> - init_rwsem(&array[i].journal_rwsem);
> + rwlock_init(&array[i].journal_rwlock);
> array[i].journal = f2fs_kzalloc(sbi,
> sizeof(struct f2fs_journal), GFP_KERNEL);
> if (!array[i].journal)
> @@ -4509,7 +4509,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
> start_blk += readed;
> } while (start_blk < sit_blk_cnt);
>
> - down_read(&curseg->journal_rwsem);
> + read_lock(&curseg->journal_rwlock);
> for (i = 0; i < sits_in_cursum(journal); i++) {
> unsigned int old_valid_blocks;
>
> @@ -4551,7 +4551,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
> old_valid_blocks;
> }
> }
> - up_read(&curseg->journal_rwsem);
> + read_unlock(&curseg->journal_rwlock);
>
> if (!err && total_node_blocks != valid_node_count(sbi)) {
> f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> index 050230c70a53..afb9041dc2e6 100644
> --- a/fs/f2fs/segment.h
> +++ b/fs/f2fs/segment.h
> @@ -306,7 +306,7 @@ struct victim_selection {
> struct curseg_info {
> struct mutex curseg_mutex; /* lock for consistency */
> struct f2fs_summary_block *sum_blk; /* cached summary block */
> - struct rw_semaphore journal_rwsem; /* protect journal area */
> + rwlock_t journal_rwlock; /* protect journal area */
> struct f2fs_journal *journal; /* cached journal info */
> unsigned char alloc_type; /* current allocation type */
> unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */
>
This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.
f2fs_get_node_info()
- nat_tree_lock
-> sleep to grab journal_rwsem by contention
checkpoint
- waiting for nat_tree_lock
In order to let checkpoint go, let's release nat_tree_lock, if there's a
journal_rwsem contention.
Signed-off-by: Jaegeuk Kim <[email protected]>
---
Change log from v1:
- drop rwlock but take retry logic to reduce lock-holding time
fs/f2fs/node.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..c60ba4179bb2 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
int i;
ni->nid = nid;
-
+retry:
/* Check nat cache */
down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
@@ -564,10 +564,16 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
return 0;
}
- memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+ /*
+ * Check current segment summary by trying to grab journal_rwsem first.
+ * This sem is on the critical path on the checkpoint requiring the above
+ * nat_tree_lock. Therefore, we should retry, if we failed to grab here.
+ */
+ if (!down_read_trylock(&curseg->journal_rwsem)) {
+ up_read(&nm_i->nat_tree_lock);
+ goto retry;
+ }
- /* Check current segment summary */
- down_read(&curseg->journal_rwsem);
i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
--
2.32.0.432.gabb21c7263-goog
On 2021/7/23 1:44, Jaegeuk Kim wrote:
> This tries to fix priority inversion in the below condition resulting in
> long checkpoint delay.
>
> f2fs_get_node_info()
> - nat_tree_lock
> -> sleep to grab journal_rwsem by contention
>
> checkpoint
> - waiting for nat_tree_lock
>
> In order to let checkpoint go, let's release nat_tree_lock, if there's a
> journal_rwsem contention.
Write lock of nat_tree_lock is held from many places, how about just
retrying unlock/lock only if checkpoint() is flushing nat blocks?
---
fs/f2fs/f2fs.h | 1 +
fs/f2fs/node.c | 8 ++++++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ad8f99d7235f..05f41a15fda4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -877,6 +877,7 @@ struct f2fs_nm_info {
spinlock_t nat_list_lock; /* protect clean nat entry list */
unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
unsigned int nat_blocks; /* # of nat blocks */
+ bool flushing_nat; /* indicate checkpoint() is flushing nat blocks */
/* free node ids management */
struct radix_tree_root free_nid_root;/* root of the free_nid cache */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index c60ba4179bb2..2caa171a68f8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -569,7 +569,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
* This sem is on the critical path on the checkpoint requiring the above
* nat_tree_lock. Therefore, we should retry, if we failed to grab here.
*/
- if (!down_read_trylock(&curseg->journal_rwsem)) {
+ if (!down_read_trylock(&curseg->journal_rwsem) && nm_i->flushing_nat) {
up_read(&nm_i->nat_tree_lock);
goto retry;
}
@@ -2981,6 +2981,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
LIST_HEAD(sets);
int err = 0;
+ nm_i->flushing_nat = true;
+
/*
* during unmount, let's flush nat_bits before checking
* nat_cnt[DIRTY_NAT].
@@ -2992,7 +2994,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
if (!nm_i->nat_cnt[DIRTY_NAT])
- return 0;
+ goto out;
down_write(&nm_i->nat_tree_lock);
@@ -3026,6 +3028,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
up_write(&nm_i->nat_tree_lock);
/* Allow dirty nats by node block allocation in write_begin */
+out:
+ nm_i->flushing_nat = false;
return err;
}
--
2.22.1
This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.
f2fs_get_node_info()
- nat_tree_lock
-> sleep to grab journal_rwsem by contention
checkpoint
- waiting for nat_tree_lock
In order to let checkpoint go, let's release nat_tree_lock, if there's a
journal_rwsem contention.
Signed-off-by: Daeho Jeong <[email protected]>
Signed-off-by: Jaegeuk Kim <[email protected]>
---
Change log from v2:
- don't bother checkpoint
fs/f2fs/node.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..b26642daa3d2 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
int i;
ni->nid = nid;
-
+retry:
/* Check nat cache */
down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
@@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
return 0;
}
- memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+ /*
+ * Check current segment summary by trying to grab journal_rwsem first.
+ * This sem is on the critical path on the checkpoint requiring the above
+ * nat_tree_lock. Therefore, we should retry, if we failed to grab here
+ * while not bothering checkpoint.
+ */
+ if (rwsem_is_locked(&sbi->cp_global_sem)) {
+ down_read(&curseg->journal_rwsem);
+ } else if (!down_read_trylock(&curseg->journal_rwsem)) {
+ up_read(&nm_i->nat_tree_lock);
+ goto retry;
+ }
- /* Check current segment summary */
- down_read(&curseg->journal_rwsem);
i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
--
2.32.0.432.gabb21c7263-goog
On 2021/7/23 10:35, Jaegeuk Kim wrote:
> This tries to fix priority inversion in the below condition resulting in
> long checkpoint delay.
>
> f2fs_get_node_info()
> - nat_tree_lock
> -> sleep to grab journal_rwsem by contention
>
> checkpoint
> - waiting for nat_tree_lock
>
> In order to let checkpoint go, let's release nat_tree_lock, if there's a
> journal_rwsem contention.
>
> Signed-off-by: Daeho Jeong <[email protected]>
> Signed-off-by: Jaegeuk Kim <[email protected]>
> ---
> Change log from v2:
> - don't bother checkpoint
>
> fs/f2fs/node.c | 17 +++++++++++++----
> 1 file changed, 13 insertions(+), 4 deletions(-)
>
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 0be9e2d7120e..b26642daa3d2 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
> int i;
>
> ni->nid = nid;
> -
> +retry:
> /* Check nat cache */
> down_read(&nm_i->nat_tree_lock);
> e = __lookup_nat_cache(nm_i, nid);
> @@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
> return 0;
> }
>
> - memset(&ne, 0, sizeof(struct f2fs_nat_entry));
> + /*
> + * Check current segment summary by trying to grab journal_rwsem first.
> + * This sem is on the critical path on the checkpoint requiring the above
> + * nat_tree_lock. Therefore, we should retry, if we failed to grab here
> + * while not bothering checkpoint.
> + */
> + if (rwsem_is_locked(&sbi->cp_global_sem)) {
You mean: if (!rwsem_is_locked()) ?
IMO, once CP is processing, all readers who needs read nat will be blocked...
Thanks,
> + down_read(&curseg->journal_rwsem);
> + } else if (!down_read_trylock(&curseg->journal_rwsem)) {
> + up_read(&nm_i->nat_tree_lock);
> + goto retry;
> + }
>
> - /* Check current segment summary */
> - down_read(&curseg->journal_rwsem);
> i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
> if (i >= 0) {
> ne = nat_in_journal(journal, i);
>
This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.
f2fs_get_node_info()
- nat_tree_lock
-> sleep to grab journal_rwsem by contention
checkpoint
- waiting for nat_tree_lock
In order to let checkpoint go, let's release nat_tree_lock, if there's a
journal_rwsem contention.
Signed-off-by: Daeho Jeong <[email protected]>
Signed-off-by: Jaegeuk Kim <[email protected]>
---
Change log from v3:
- fix wrong condition check
fs/f2fs/node.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..c945a9730f3c 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
int i;
ni->nid = nid;
-
+retry:
/* Check nat cache */
down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
@@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
return 0;
}
- memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+ /*
+ * Check current segment summary by trying to grab journal_rwsem first.
+ * This sem is on the critical path on the checkpoint requiring the above
+ * nat_tree_lock. Therefore, we should retry, if we failed to grab here
+ * while not bothering checkpoint.
+ */
+ if (!rwsem_is_locked(&sbi->cp_global_sem)) {
+ down_read(&curseg->journal_rwsem);
+ } else if (!down_read_trylock(&curseg->journal_rwsem)) {
+ up_read(&nm_i->nat_tree_lock);
+ goto retry;
+ }
- /* Check current segment summary */
- down_read(&curseg->journal_rwsem);
i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
--
2.32.0.432.gabb21c7263-goog