From: Harshad Shirwadkar <[email protected]>
If the inode that's being requested to track using ext4_fc_track_inode
is being committed, then wait until the inode finishes the commit.
Signed-off-by: Harshad Shirwadkar <[email protected]>
---
fs/ext4/ext4_jbd2.c | 12 ++++++++++++
fs/ext4/ext4_jbd2.h | 13 ++++---------
fs/ext4/fast_commit.c | 28 ++++++++++++++++++++++++++++
fs/ext4/inode.c | 3 ++-
4 files changed, 46 insertions(+), 10 deletions(-)
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3477a16d08ae..7fa301b0a35a 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -106,6 +106,18 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
GFP_NOFS, type, line);
}
+handle_t *__ext4_journal_start(struct inode *inode, unsigned int line,
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds)
+{
+ handle_t *handle = __ext4_journal_start_sb(inode->i_sb, line,
+ type, blocks, rsv_blocks,
+ revoke_creds);
+ if (ext4_handle_valid(handle) && !IS_ERR(handle))
+ ext4_fc_track_inode(handle, inode);
+ return handle;
+}
+
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
{
struct super_block *sb;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index db2ae4a2b38d..e408622fe896 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -302,6 +302,10 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
return ext4_free_metadata_revoke_credits(sb, 8);
}
+handle_t *__ext4_journal_start(struct inode *inode, unsigned int line,
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds);
+
#define ext4_journal_start_sb(sb, type, nblocks) \
__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
ext4_trans_default_revoke_credits(sb))
@@ -318,15 +322,6 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
__ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \
(revoke_creds))
-static inline handle_t *__ext4_journal_start(struct inode *inode,
- unsigned int line, int type,
- int blocks, int rsv_blocks,
- int revoke_creds)
-{
- return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
- rsv_blocks, revoke_creds);
-}
-
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__func__, __LINE__, (handle))
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 9913de655b61..be8c5b3456ec 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -551,8 +551,14 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
return 0;
}
+/*
+ * Track inode as part of the next fast commit. If the inode is being
+ * committed, this function will wait for the commit to finish.
+ */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ wait_queue_head_t *wq;
int ret;
if (S_ISDIR(inode->i_mode))
@@ -564,6 +570,28 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
return;
}
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ }
+
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(inode, ret);
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 531a94f48637..7a01f5bd377c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -629,6 +629,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* with create == 1 flag.
*/
down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_fc_track_inode(handle, inode);
/*
* We need to check for EXT4 here because migrate
@@ -5690,7 +5691,6 @@ int ext4_mark_iloc_dirty(handle_t *handle,
put_bh(iloc->bh);
return -EIO;
}
- ext4_fc_track_inode(handle, inode);
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
@@ -5727,6 +5727,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
brelse(iloc->bh);
iloc->bh = NULL;
}
+ ext4_fc_track_inode(handle, inode);
}
ext4_std_error(inode->i_sb, err);
return err;
--
2.35.1.616.g0bdcbb4464-goog
On Tue 08-03-22 08:33:16, Harshad Shirwadkar wrote:
> From: Harshad Shirwadkar <[email protected]>
>
> If the inode that's being requested to track using ext4_fc_track_inode
> is being committed, then wait until the inode finishes the commit.
>
> Signed-off-by: Harshad Shirwadkar <[email protected]>
One comment below...
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -106,6 +106,18 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
> GFP_NOFS, type, line);
> }
>
> +handle_t *__ext4_journal_start(struct inode *inode, unsigned int line,
> + int type, int blocks, int rsv_blocks,
> + int revoke_creds)
> +{
> + handle_t *handle = __ext4_journal_start_sb(inode->i_sb, line,
> + type, blocks, rsv_blocks,
> + revoke_creds);
> + if (ext4_handle_valid(handle) && !IS_ERR(handle))
> + ext4_fc_track_inode(handle, inode);
> + return handle;
> +}
> +
Please fix fs/ext4/inline.c rather than papering over the problem like
this. Because it is just a landmine waiting to explode in some strange
cornercase when someone does not call ext4_journal_start() but other handle
starting function.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Hmm, after removing ext4_fc_track_inode() from ext4_journal_start(), I
see one deadlock - there are some places in code where
ext4_mark_inode_dirty gets called while holding i_data_sem. Commit
path requires i_data_sem to commit inode data (via ext4_map_blocks).
So, if an inode is being committed, ext4_mark_inode_dirty may start
waiting for the inode to commit while holding i_data_sem and the
commit path may wait on i_data_sem. The right way to fix this is to
call ext4_fc_track_inode in such places before acquiring i_data_sem in
write mode. But that would mean we sprinkle code with more
ext4_fc_track_inode() calls which is something that I preferably
wanted to avoid.
This makes me wonder though, for fast commits, is it a terrible idea
to extend the meaning of ext4_journal_start() from "start a new
handle" to "start a new handle with an intention to modify the passed
inode"? Most of the handles modify only one inode, and for other
places where we do modify multiple inodes, ext4_reserve_inode_write()
would ensure that those inodes are tracked as well. All of the
existing places where inode gets modified after grabbing i_data_sem,
i_data_sem is grabbed only after starting the handle. This would take
care of the deadlock mentioned above and similar deadlocks. Another
advantage with doing this is that developers wouldn't need to worry
about adding explicit ext4_fc_track_inode() calls for future changes.
If we decide to do this, we would need to do a thorough code review to
ensure that the above rule is followed everywhere. But note that
ext4_fc_track_inode() is idempotent, so it doesn't matter if this
function gets called multiple times in the same handle. So to avoid
breaking fast commits, we can be super careful and in the first
version, we can have ext4_fc_track_range() calls in
ext4_reserve_inode_dirty(), ext4_journal_start(), inline.c and in
handles where i_data_sem gets acquired in write mode. We can then
carefully evaluate each code path and remove redundant
ext4_fc_track_range() calls.
What do you think?
- Harshad
On Thu, 10 Mar 2022 at 20:17, harshad shirwadkar
<[email protected]> wrote:
>
> Thanks for the reviews Jan! I'll update inline.c as you mentioned in
> the next version.
>
> - Harshad
>
> On Wed, 9 Mar 2022 at 02:14, Jan Kara <[email protected]> wrote:
> >
> > On Tue 08-03-22 08:33:16, Harshad Shirwadkar wrote:
> > > From: Harshad Shirwadkar <[email protected]>
> > >
> > > If the inode that's being requested to track using ext4_fc_track_inode
> > > is being committed, then wait until the inode finishes the commit.
> > >
> > > Signed-off-by: Harshad Shirwadkar <[email protected]>
> >
> > One comment below...
> >
> > > --- a/fs/ext4/ext4_jbd2.c
> > > +++ b/fs/ext4/ext4_jbd2.c
> > > @@ -106,6 +106,18 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
> > > GFP_NOFS, type, line);
> > > }
> > >
> > > +handle_t *__ext4_journal_start(struct inode *inode, unsigned int line,
> > > + int type, int blocks, int rsv_blocks,
> > > + int revoke_creds)
> > > +{
> > > + handle_t *handle = __ext4_journal_start_sb(inode->i_sb, line,
> > > + type, blocks, rsv_blocks,
> > > + revoke_creds);
> > > + if (ext4_handle_valid(handle) && !IS_ERR(handle))
> > > + ext4_fc_track_inode(handle, inode);
> > > + return handle;
> > > +}
> > > +
> >
> > Please fix fs/ext4/inline.c rather than papering over the problem like
> > this. Because it is just a landmine waiting to explode in some strange
> > cornercase when someone does not call ext4_journal_start() but other handle
> > starting function.
> >
> > Honza
> > --
> > Jan Kara <[email protected]>
> > SUSE Labs, CR
Thanks for the reviews Jan! I'll update inline.c as you mentioned in
the next version.
- Harshad
On Wed, 9 Mar 2022 at 02:14, Jan Kara <[email protected]> wrote:
>
> On Tue 08-03-22 08:33:16, Harshad Shirwadkar wrote:
> > From: Harshad Shirwadkar <[email protected]>
> >
> > If the inode that's being requested to track using ext4_fc_track_inode
> > is being committed, then wait until the inode finishes the commit.
> >
> > Signed-off-by: Harshad Shirwadkar <[email protected]>
>
> One comment below...
>
> > --- a/fs/ext4/ext4_jbd2.c
> > +++ b/fs/ext4/ext4_jbd2.c
> > @@ -106,6 +106,18 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
> > GFP_NOFS, type, line);
> > }
> >
> > +handle_t *__ext4_journal_start(struct inode *inode, unsigned int line,
> > + int type, int blocks, int rsv_blocks,
> > + int revoke_creds)
> > +{
> > + handle_t *handle = __ext4_journal_start_sb(inode->i_sb, line,
> > + type, blocks, rsv_blocks,
> > + revoke_creds);
> > + if (ext4_handle_valid(handle) && !IS_ERR(handle))
> > + ext4_fc_track_inode(handle, inode);
> > + return handle;
> > +}
> > +
>
> Please fix fs/ext4/inline.c rather than papering over the problem like
> this. Because it is just a landmine waiting to explode in some strange
> cornercase when someone does not call ext4_journal_start() but other handle
> starting function.
>
> Honza
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR
Sorry for delayed reply, I've got dragged into other things and somewhat
forgot about this...
On Fri 11-03-22 00:25:48, harshad shirwadkar wrote:
> Hmm, after removing ext4_fc_track_inode() from ext4_journal_start(), I
> see one deadlock - there are some places in code where
> ext4_mark_inode_dirty gets called while holding i_data_sem. Commit
> path requires i_data_sem to commit inode data (via ext4_map_blocks).
> So, if an inode is being committed, ext4_mark_inode_dirty may start
> waiting for the inode to commit while holding i_data_sem and the
> commit path may wait on i_data_sem.
Indeed, that is a problem.
> The right way to fix this is to
> call ext4_fc_track_inode in such places before acquiring i_data_sem in
> write mode. But that would mean we sprinkle code with more
> ext4_fc_track_inode() calls which is something that I preferably
> wanted to avoid.
So I agree calling ext4_fc_track_inode() from ext4_reserve_inode_write()
isn't going to fly as is. We need to find a better way of doing things.
> This makes me wonder though, for fast commits, is it a terrible idea
> to extend the meaning of ext4_journal_start() from "start a new
> handle" to "start a new handle with an intention to modify the passed
> inode"? Most of the handles modify only one inode, and for other
> places where we do modify multiple inodes, ext4_reserve_inode_write()
> would ensure that those inodes are tracked as well.
Well but to avoid deadlocks like you've described above you would have to
start tracking inode with explicit ext4_fc_track_inode() calls before
grabbing i_data_sem. ext4_reserve_inode_write() would be only useful for an
assertion check that indeed the inode is already tracked.
> All of the
> existing places where inode gets modified after grabbing i_data_sem,
> i_data_sem is grabbed only after starting the handle. This would take
> care of the deadlock mentioned above and similar deadlocks. Another
> advantage with doing this is that developers wouldn't need to worry
> about adding explicit ext4_fc_track_inode() calls for future changes.
Well, at least for the simple case where only one inode is modified. But I
agree that's a majority.
> If we decide to do this, we would need to do a thorough code review to
> ensure that the above rule is followed everywhere. But note that
> ext4_fc_track_inode() is idempotent, so it doesn't matter if this
> function gets called multiple times in the same handle. So to avoid
> breaking fast commits, we can be super careful and in the first
> version, we can have ext4_fc_track_range() calls in
> ext4_reserve_inode_dirty(), ext4_journal_start(), inline.c and in
> handles where i_data_sem gets acquired in write mode. We can then
> carefully evaluate each code path and remove redundant
> ext4_fc_track_range() calls.
As I wrote above, if we go this path, I'd be for ext4_fc_track_inode()
calls in ext4_journal_start() and then adding explicit calls to
ext4_fc_track_inode() where additionally needed and have only assertion
checks in ext4_reserve_inode_dirty() and other places which modify inode
metadata, to catch places which need explicit calls to
ext4_fc_track_inode(). That way we won't have any hidden deadlocks in the
code waiting to happen.
I have also another proposal for consideration how we could handle this.
Mostly branstorming now: We could also drop the need for fastcommit code to
acquire i_data_sem during commit. We could use just information in extent
status tree to provide block mapping for the fastcommit code (that does not
need i_data_sem). The only thing is we'd need to make sure modified extents
from the status tree are not evicted from memory before the appropriate
transaction commits so that they are available for appropriate fastcommit -
for that we'd probably need to add TID of the transaction that last
modified an extent and add check into the shrinker to avoid shrinking
uncommitted extents. As a bonus, we could now add to fastcommit only
extents with appropriate TID and thus save on extent logging for sparsely
modified inode.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Sorry for the late reply, I was on vacation and getting to this just now.
On Mon, 21 Mar 2022 at 07:06, Jan Kara <[email protected]> wrote:
>
> Sorry for delayed reply, I've got dragged into other things and somewhat
> forgot about this...
>
> On Fri 11-03-22 00:25:48, harshad shirwadkar wrote:
> > Hmm, after removing ext4_fc_track_inode() from ext4_journal_start(), I
> > see one deadlock - there are some places in code where
> > ext4_mark_inode_dirty gets called while holding i_data_sem. Commit
> > path requires i_data_sem to commit inode data (via ext4_map_blocks).
> > So, if an inode is being committed, ext4_mark_inode_dirty may start
> > waiting for the inode to commit while holding i_data_sem and the
> > commit path may wait on i_data_sem.
>
> Indeed, that is a problem.
>
> > The right way to fix this is to
> > call ext4_fc_track_inode in such places before acquiring i_data_sem in
> > write mode. But that would mean we sprinkle code with more
> > ext4_fc_track_inode() calls which is something that I preferably
> > wanted to avoid.
>
> So I agree calling ext4_fc_track_inode() from ext4_reserve_inode_write()
> isn't going to fly as is. We need to find a better way of doing things.
>
> > This makes me wonder though, for fast commits, is it a terrible idea
> > to extend the meaning of ext4_journal_start() from "start a new
> > handle" to "start a new handle with an intention to modify the passed
> > inode"? Most of the handles modify only one inode, and for other
> > places where we do modify multiple inodes, ext4_reserve_inode_write()
> > would ensure that those inodes are tracked as well.
>
> Well but to avoid deadlocks like you've described above you would have to
> start tracking inode with explicit ext4_fc_track_inode() calls before
> grabbing i_data_sem. ext4_reserve_inode_write() would be only useful for an
> assertion check that indeed the inode is already tracked.
>
> > All of the
> > existing places where inode gets modified after grabbing i_data_sem,
> > i_data_sem is grabbed only after starting the handle. This would take
> > care of the deadlock mentioned above and similar deadlocks. Another
> > advantage with doing this is that developers wouldn't need to worry
> > about adding explicit ext4_fc_track_inode() calls for future changes.
>
> Well, at least for the simple case where only one inode is modified. But I
> agree that's a majority.
>
> > If we decide to do this, we would need to do a thorough code review to
> > ensure that the above rule is followed everywhere. But note that
> > ext4_fc_track_inode() is idempotent, so it doesn't matter if this
> > function gets called multiple times in the same handle. So to avoid
> > breaking fast commits, we can be super careful and in the first
> > version, we can have ext4_fc_track_range() calls in
> > ext4_reserve_inode_dirty(), ext4_journal_start(), inline.c and in
> > handles where i_data_sem gets acquired in write mode. We can then
> > carefully evaluate each code path and remove redundant
> > ext4_fc_track_range() calls.
>
> As I wrote above, if we go this path, I'd be for ext4_fc_track_inode()
> calls in ext4_journal_start() and then adding explicit calls to
> ext4_fc_track_inode() where additionally needed and have only assertion
> checks in ext4_reserve_inode_dirty() and other places which modify inode
> metadata, to catch places which need explicit calls to
> ext4_fc_track_inode(). That way we won't have any hidden deadlocks in the
> code waiting to happen.
Okay sounds good, so based on your response, I'll rework the patch
series to make following changes:
1) Add ext4_fc_track_inode() calls in ext4_journal_start()
2) Add ext4_fc_track_inode calls in places where more than one inode
are changed in a handle and / or i_data_sem is being acquired.
3) Add an assertion in ext4_reserve_inode_dirty() to check if the
inode on which write is being reserved is already being tracked.
Does that sound good?
>
> I have also another proposal for consideration how we could handle this.
> Mostly branstorming now: We could also drop the need for fastcommit code to
> acquire i_data_sem during commit. We could use just information in extent
> status tree to provide block mapping for the fastcommit code (that does not
> need i_data_sem). The only thing is we'd need to make sure modified extents
> from the status tree are not evicted from memory before the appropriate
> transaction commits so that they are available for appropriate fastcommit -
> for that we'd probably need to add TID of the transaction that last
> modified an extent and add check into the shrinker to avoid shrinking
> uncommitted extents. As a bonus, we could now add to fastcommit only
> extents with appropriate TID and thus save on extent logging for sparsely
> modified inode.
Yeah, I like this idea. I'll add that as a TODO in the code just so
that we don't lose it.
Thanks,
Harshad
>
> Honza
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR