2022-03-08 23:52:22

by harshad shirwadkar

[permalink] [raw]
Subject: [PATCH v2 3/5] ext4: rework fast commit commit path

From: Harshad Shirwadkar <[email protected]>

This patch reworks fast commit's commit path so that the journal is no
longer locked for the entire duration of a fast commit. Instead, we lock
the journal only while marking all the eligible inodes as "committing".
This allows handles to make progress in parallel with the fast commit.

Signed-off-by: Harshad Shirwadkar <[email protected]>
---
fs/ext4/fast_commit.c | 77 ++++++++++++++++++++++++++-----------------
fs/jbd2/journal.c | 2 --
2 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index be8c5b3456ec..eedcf8b4d47b 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -287,20 +287,30 @@ void ext4_fc_del(struct inode *inode)
(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
return;

-restart:
spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
return;
}

- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
- }
-
- if (!list_empty(&ei->i_fc_list))
- list_del_init(&ei->i_fc_list);
+ /*
+ * Since ext4_fc_del is called from ext4_evict_inode while having a
+ * handle open, there is no need for us to wait here even if a fast
+ * commit is going on. That is because, if this inode is being
+ * committed, ext4_mark_inode_dirty would have waited for inode commit
+ * operation to finish before we come here. So, by the time we come
+ * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
+ * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
+ * here.
+ *
+ * We may come here without any handles open in the "no_delete" case of
+ * ext4_evict_inode as well. However, if that happens, we first mark the
+ * file system as fast commit ineligible anyway. So, even in that case,
+ * it is okay to remove the inode from the fc list.
+ */
+ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+ && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+ list_del_init(&ei->i_fc_list);

/*
* Since this inode is getting removed, let's also remove all FC
@@ -323,8 +333,6 @@ void ext4_fc_del(struct inode *inode)
fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
kfree(fc_dentry->fcd_name.name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
-
- return;
}

/*
@@ -964,19 +972,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)

spin_lock(&sbi->s_fc_lock);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
- while (atomic_read(&ei->i_fc_updates)) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&ei->i_fc_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&ei->i_fc_updates)) {
- spin_unlock(&sbi->s_fc_lock);
- schedule();
- spin_lock(&sbi->s_fc_lock);
- }
- finish_wait(&ei->i_fc_wait, &wait);
- }
spin_unlock(&sbi->s_fc_lock);
ret = jbd2_submit_inode_data(ei->jinode);
if (ret)
@@ -998,13 +993,9 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal)

spin_lock(&sbi->s_fc_lock);
list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- spin_lock(&pos->i_fc_lock);
if (!ext4_test_inode_state(&pos->vfs_inode,
- EXT4_STATE_FC_COMMITTING)) {
- spin_unlock(&pos->i_fc_lock);
+ EXT4_STATE_FC_COMMITTING))
continue;
- }
- spin_unlock(&pos->i_fc_lock);
spin_unlock(&sbi->s_fc_lock);

ret = jbd2_wait_inode_data(journal, pos->jinode);
@@ -1093,6 +1084,16 @@ static int ext4_fc_perform_commit(journal_t *journal)
int ret = 0;
u32 crc = 0;

+ /* Lock the journal */
+ jbd2_journal_lock_updates(journal);
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+ jbd2_journal_unlock_updates(journal);
+
ret = ext4_fc_submit_inode_data_all(journal);
if (ret)
return ret;
@@ -1143,6 +1144,18 @@ static int ext4_fc_perform_commit(journal_t *journal)
ret = ext4_fc_write_inode(inode, &crc);
if (ret)
goto out;
+ ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
+ smp_mb();
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+#endif
spin_lock(&sbi->s_fc_lock);
}
spin_unlock(&sbi->s_fc_lock);
@@ -1276,13 +1289,17 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
spin_lock(&sbi->s_fc_lock);
list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
i_fc_list) {
- list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
if (iter->i_sync_tid <= tid)
ext4_fc_reset_inode(&iter->vfs_inode);
- /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
smp_mb();
+ list_del_init(&iter->i_fc_list);
#if (BITS_PER_LONG < 64)
wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c2cf74b01ddb..06b885628b1c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -757,7 +757,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
}
journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
write_unlock(&journal->j_state_lock);
- jbd2_journal_lock_updates(journal);

return 0;
}
@@ -769,7 +768,6 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit);
*/
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
- jbd2_journal_unlock_updates(journal);
if (journal->j_fc_cleanup_callback)
journal->j_fc_cleanup_callback(journal, 0, tid);
write_lock(&journal->j_state_lock);
--
2.35.1.616.g0bdcbb4464-goog


2022-03-09 12:49:02

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH v2 3/5] ext4: rework fast commit commit path

On Tue 08-03-22 08:33:17, Harshad Shirwadkar wrote:
> From: Harshad Shirwadkar <[email protected]>
>
> This patch reworks fast commit's commit path to remove locking the
> journal for the entire duration of a fast commit. Instead, we only lock
> the journal while marking all the eligible inodes as "committing". This
> allows handles to make progress in parallel with the fast commit.
>
> Signed-off-by: Harshad Shirwadkar <[email protected]>

The patch looks good. Feel free to add:

Reviewed-by: Jan Kara <[email protected]>

Honza


> ---
> fs/ext4/fast_commit.c | 77 ++++++++++++++++++++++++++-----------------
> fs/jbd2/journal.c | 2 --
> 2 files changed, 47 insertions(+), 32 deletions(-)
>
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index be8c5b3456ec..eedcf8b4d47b 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -287,20 +287,30 @@ void ext4_fc_del(struct inode *inode)
> (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
> return;
>
> -restart:
> spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
> if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
> spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
> return;
> }
>
> - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
> - ext4_fc_wait_committing_inode(inode);
> - goto restart;
> - }
> -
> - if (!list_empty(&ei->i_fc_list))
> - list_del_init(&ei->i_fc_list);
> + /*
> + * Since ext4_fc_del is called from ext4_evict_inode while having a
> + * handle open, there is no need for us to wait here even if a fast
> + * commit is going on. That is because, if this inode is being
> + * committed, ext4_mark_inode_dirty would have waited for inode commit
> + * operation to finish before we come here. So, by the time we come
> + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
> + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
> + * here.
> + *
> + * We may come here without any handles open in the "no_delete" case of
> + * ext4_evict_inode as well. However, if that happens, we first mark the
> + * file system as fast commit ineligible anyway. So, even in that case,
> + * it is okay to remove the inode from the fc list.
> + */
> + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
> + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
> + list_del_init(&ei->i_fc_list);
>
> /*
> * Since this inode is getting removed, let's also remove all FC
> @@ -323,8 +333,6 @@ void ext4_fc_del(struct inode *inode)
> fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
> kfree(fc_dentry->fcd_name.name);
> kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
> -
> - return;
> }
>
> /*
> @@ -964,19 +972,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
>
> spin_lock(&sbi->s_fc_lock);
> list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
> - while (atomic_read(&ei->i_fc_updates)) {
> - DEFINE_WAIT(wait);
> -
> - prepare_to_wait(&ei->i_fc_wait, &wait,
> - TASK_UNINTERRUPTIBLE);
> - if (atomic_read(&ei->i_fc_updates)) {
> - spin_unlock(&sbi->s_fc_lock);
> - schedule();
> - spin_lock(&sbi->s_fc_lock);
> - }
> - finish_wait(&ei->i_fc_wait, &wait);
> - }
> spin_unlock(&sbi->s_fc_lock);
> ret = jbd2_submit_inode_data(ei->jinode);
> if (ret)
> @@ -998,13 +993,9 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal)
>
> spin_lock(&sbi->s_fc_lock);
> list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> - spin_lock(&pos->i_fc_lock);
> if (!ext4_test_inode_state(&pos->vfs_inode,
> - EXT4_STATE_FC_COMMITTING)) {
> - spin_unlock(&pos->i_fc_lock);
> + EXT4_STATE_FC_COMMITTING))
> continue;
> - }
> - spin_unlock(&pos->i_fc_lock);
> spin_unlock(&sbi->s_fc_lock);
>
> ret = jbd2_wait_inode_data(journal, pos->jinode);
> @@ -1093,6 +1084,16 @@ static int ext4_fc_perform_commit(journal_t *journal)
> int ret = 0;
> u32 crc = 0;
>
> + /* Lock the journal */
> + jbd2_journal_lock_updates(journal);
> + spin_lock(&sbi->s_fc_lock);
> + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> + ext4_set_inode_state(&iter->vfs_inode,
> + EXT4_STATE_FC_COMMITTING);
> + }
> + spin_unlock(&sbi->s_fc_lock);
> + jbd2_journal_unlock_updates(journal);
> +
> ret = ext4_fc_submit_inode_data_all(journal);
> if (ret)
> return ret;
> @@ -1143,6 +1144,18 @@ static int ext4_fc_perform_commit(journal_t *journal)
> ret = ext4_fc_write_inode(inode, &crc);
> if (ret)
> goto out;
> + ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
> + /*
> + * Make sure clearing of EXT4_STATE_FC_COMMITTING is
> + * visible before we send the wakeup. Pairs with implicit
> + * barrier in prepare_to_wait() in ext4_fc_track_inode().
> + */
> + smp_mb();
> +#if (BITS_PER_LONG < 64)
> + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
> +#else
> + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
> +#endif
> spin_lock(&sbi->s_fc_lock);
> }
> spin_unlock(&sbi->s_fc_lock);
> @@ -1276,13 +1289,17 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
> spin_lock(&sbi->s_fc_lock);
> list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
> i_fc_list) {
> - list_del_init(&iter->i_fc_list);
> ext4_clear_inode_state(&iter->vfs_inode,
> EXT4_STATE_FC_COMMITTING);
> if (iter->i_sync_tid <= tid)
> ext4_fc_reset_inode(&iter->vfs_inode);
> - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
> + /*
> + * Make sure clearing of EXT4_STATE_FC_COMMITTING is
> + * visible before we send the wakeup. Pairs with implicit
> + * barrier in prepare_to_wait() in ext4_fc_track_inode().
> + */
> smp_mb();
> + list_del_init(&iter->i_fc_list);
> #if (BITS_PER_LONG < 64)
> wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
> #else
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index c2cf74b01ddb..06b885628b1c 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -757,7 +757,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
> }
> journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
> write_unlock(&journal->j_state_lock);
> - jbd2_journal_lock_updates(journal);
>
> return 0;
> }
> @@ -769,7 +768,6 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit);
> */
> static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
> {
> - jbd2_journal_unlock_updates(journal);
> if (journal->j_fc_cleanup_callback)
> journal->j_fc_cleanup_callback(journal, 0, tid);
> write_lock(&journal->j_state_lock);
> --
> 2.35.1.616.g0bdcbb4464-goog
>
--
Jan Kara <[email protected]>
SUSE Labs, CR