2021-03-17 08:42:44

by Haotian Li

[permalink] [raw]
Subject: [PATCH v3] e2fsck: Avoid changes on recovery flags when jbd2_journal_recover() failed

jbd2_journal_recover() may fail when some error occers such
as ENOMEM and EIO. However, jsb->s_start is still cleared
by func e2fsck_journal_release(). This may break consistency
between metadata and data in disk. Sometimes, failure in
jbd2_journal_recover() is temporary but retry e2fsck will
skip the journal recovery when the temporary problem is fixed.

Following harshad shirwadkar's suggestion??we add an option
"recovery_error_behavior" with default value "continue" to
e2fsck.conf. User may set it to "retry" or "exit" to adopt
different behavior when such journal recovery errors occur.

Reported-by: Liangyun <[email protected]>
Signed-off-by: Haotian Li <[email protected]>
Signed-off-by: Zhiqiang Liu <[email protected]>
Reviewed-by: Harshad Shirwadkar <[email protected]>
---
e2fsck/e2fsck.h | 11 +++++++++++
e2fsck/journal.c | 33 +++++++++++++++++++++++++++++++--
e2fsck/unix.c | 13 ++++++++++++-
3 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h
index 15d043ee..22f9ad11 100644
--- a/e2fsck/e2fsck.h
+++ b/e2fsck/e2fsck.h
@@ -451,6 +451,9 @@ struct e2fsck_struct {

/* Fast commit replay state */
struct e2fsck_fc_replay_state fc_replay_state;
+
+ /* Behavior when journal recovery fails */
+ int recovery_error_behavior;
};

/* Data structures to evaluate whether an extent tree needs rebuilding. */
@@ -474,6 +477,14 @@ typedef struct region_struct *region_t;
extern int e2fsck_strnlen(const char * s, int count);
#endif

+/* Different behaviors when journal recovery fails */
+#define RECOVERY_ERROR_CONTINUE 0
+#define RECOVERY_ERROR_RETRY 1
+#define RECOVERY_ERROR_EXIT 2
+
+/* Journal retry times if RECOVERY_ERROR_RETRY is set*/
+#define RECOVERY_TIMES_LIMIT 3
+
/*
* Procedure declarations
*/
diff --git a/e2fsck/journal.c b/e2fsck/journal.c
index a425bbd1..c1c6f6ee 100644
--- a/e2fsck/journal.c
+++ b/e2fsck/journal.c
@@ -1600,11 +1600,26 @@ no_has_journal:
return retval;
}

+static void set_recovery_error_behavior(e2fsck_t ctx, const char *recovery_behavior)
+{
+ if (!recovery_behavior) {
+ ctx->recovery_error_behavior = RECOVERY_ERROR_CONTINUE;
+ return;
+ }
+ if (strcmp(recovery_behavior, "retry") == 0)
+ ctx->recovery_error_behavior = RECOVERY_ERROR_RETRY;
+ else if (strcmp(recovery_behavior, "exit") == 0)
+ ctx->recovery_error_behavior = RECOVERY_ERROR_EXIT;
+ else
+ ctx->recovery_error_behavior = RECOVERY_ERROR_CONTINUE;
+}
+
static errcode_t recover_ext3_journal(e2fsck_t ctx)
{
struct problem_context pctx;
journal_t *journal;
errcode_t retval;
+ char *recovery_behavior = 0;

clear_problem_context(&pctx);

@@ -1629,8 +1644,12 @@ static errcode_t recover_ext3_journal(e2fsck_t ctx)
goto errout;

retval = -jbd2_journal_recover(journal);
- if (retval)
+ if (retval) {
+ profile_get_string(ctx->profile, "options", "recovery_error_behavior",
+ 0, "continue", &recovery_behavior);
+ set_recovery_error_behavior(ctx, recovery_behavior);
goto errout;
+ }

if (journal->j_failed_commit) {
pctx.ino = journal->j_failed_commit;
@@ -1645,7 +1664,15 @@ errout:
jbd2_journal_destroy_revoke(journal);
jbd2_journal_destroy_revoke_record_cache();
jbd2_journal_destroy_revoke_table_cache();
- e2fsck_journal_release(ctx, journal, 1, 0);
+ if (retval == 0 || ctx->recovery_error_behavior == RECOVERY_ERROR_CONTINUE)
+ e2fsck_journal_release(ctx, journal, 1, 0);
+ if (retval && ctx->recovery_error_behavior == RECOVERY_ERROR_EXIT) {
+ ctx->fs->flags &= ~EXT2_FLAG_VALID;
+ com_err(ctx->program_name, 0,
+ _("Journal recovery failed "
+ "on %s\n"), ctx->device_name);
+ fatal_error(ctx, 0);
+ }
return retval;
}

@@ -1697,6 +1724,8 @@ errcode_t e2fsck_run_ext3_journal(e2fsck_t ctx)

/* Set the superblock flags */
e2fsck_clear_recover(ctx, recover_retval != 0);
+ if (recover_retval != 0 && ctx->recovery_error_behavior == RECOVERY_ERROR_RETRY)
+ ext2fs_set_feature_journal_needs_recovery(ctx->fs->super);

/*
* Do one last sanity check, and propagate journal->s_errno to
diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index c5f9e441..2c605bff 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -1068,6 +1068,8 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
if (c)
ctx->options |= E2F_OPT_ICOUNT_FULLMAP;

+ ctx->recovery_error_behavior = RECOVERY_ERROR_CONTINUE;
+
if (ctx->readahead_kb == ~0ULL) {
profile_get_integer(ctx->profile, "options",
"readahead_mem_pct", 0, -1, &c);
@@ -1776,6 +1778,7 @@ failure:
"doing a read-only filesystem check.\n"));
io_channel_flush(ctx->fs->io);
} else {
+ int recovery_retry_times = 0;
if (ctx->flags & E2F_FLAG_RESTARTED) {
/*
* Whoops, we attempted to run the
@@ -1788,7 +1791,15 @@ failure:
"on %s\n"), ctx->device_name);
fatal_error(ctx, 0);
}
- retval = e2fsck_run_ext3_journal(ctx);
+ while (recovery_retry_times++ < RECOVERY_TIMES_LIMIT) {
+ retval = e2fsck_run_ext3_journal(ctx);
+ if (retval && ctx->recovery_error_behavior == RECOVERY_ERROR_RETRY) {
+ log_out(ctx, _("Try to recovery Journal again in %s\n"),
+ ctx->device_name);
+ } else {
+ break;
+ }
+ }
if (retval == EFSBADCRC) {
log_out(ctx, _("Journal checksum error "
"found in %s\n"),
--
2.23.0