2015-06-08 01:33:14

by Joseph Qi

[permalink] [raw]
Subject: [PATCH] jbd2: fix ocfs2 corrupt when updating journal superblock fails

If updating the journal superblock fails after journal data has been flushed,
the error is ignored, and the caller is misled into treating it as a normal case.
In ocfs2, the checkpoint will be treated as successful and the other node
can get the lock to update. Since the sb_start is still pointing to the
old log block, the journal data will be rewritten during journal recovery
by the other node. Thus the new updates will be overwritten and ocfs2
becomes corrupted.
So we have to return the error, and ocfs2_commit_cache will take care of
the error and prevent the other node from doing updates until it has
recovered the journal first.

Reported-by: Yiwen Jiang <[email protected]>
Signed-off-by: Joseph Qi <[email protected]>
Tested-by: Yiwen Jiang <[email protected]>
Cc: Junxiao Bi <[email protected]>
Cc: <[email protected]>
---
fs/jbd2/checkpoint.c | 5 ++---
fs/jbd2/journal.c | 37 ++++++++++++++++++++++++++++++-------
include/linux/jbd2.h | 4 ++--
3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 988b32e..82e5b7d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
unsigned long blocknr;

if (is_journal_aborted(journal))
- return 1;
+ return -EIO;

if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
return 1;
@@ -407,8 +407,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);

- __jbd2_update_log_tail(journal, first_tid, blocknr);
- return 0;
+ return __jbd2_update_log_tail(journal, first_tid, blocknr);
}


diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b96bd80..6b33a42 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -885,9 +885,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
*
* Requires j_checkpoint_mutex
*/
-void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
unsigned long freed;
+ int ret;

BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));

@@ -897,7 +898,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+ if (ret)
+ goto out;
+
write_lock(&journal->j_state_lock);
freed = block - journal->j_tail;
if (block < journal->j_tail)
@@ -913,6 +917,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
journal->j_tail_sequence = tid;
journal->j_tail = block;
write_unlock(&journal->j_state_lock);
+
+out:
+ return ret;
}

/*
@@ -1331,7 +1338,7 @@ static int journal_reset(journal_t *journal)
return jbd2_journal_start_thread(journal);
}

-static void jbd2_write_superblock(journal_t *journal, int write_op)
+static int jbd2_write_superblock(journal_t *journal, int write_op)
{
struct buffer_head *bh = journal->j_sb_buffer;
journal_superblock_t *sb = journal->j_superblock;
@@ -1370,7 +1377,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
printk(KERN_ERR "JBD2: Error %d detected when updating "
"journal superblock for %s.\n", ret,
journal->j_devname);
+ jbd2_journal_abort(journal, ret);
}
+
+ return ret;
}

/**
@@ -1383,10 +1393,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
* Update a journal's superblock information about log tail and write it to
* disk, waiting for the IO to complete.
*/
-void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
unsigned long tail_block, int write_op)
{
journal_superblock_t *sb = journal->j_superblock;
+ int ret;

BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
@@ -1395,13 +1406,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
sb->s_sequence = cpu_to_be32(tail_tid);
sb->s_start = cpu_to_be32(tail_block);

- jbd2_write_superblock(journal, write_op);
+ ret = jbd2_write_superblock(journal, write_op);
+ if (ret)
+ goto out;

/* Log is no longer empty */
write_lock(&journal->j_state_lock);
WARN_ON(!sb->s_sequence);
journal->j_flags &= ~JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
+
+out:
+ return ret;
}

/**
@@ -1950,7 +1966,13 @@ int jbd2_journal_flush(journal_t *journal)
return -EIO;

mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_cleanup_journal_tail(journal);
+ if (!err) {
+ err = jbd2_cleanup_journal_tail(journal);
+ if (err < 0) {
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ goto out;
+ }
+ }

/* Finally, mark the journal as really needing no recovery.
* This sets s_start==0 in the underlying superblock, which is
@@ -1966,7 +1988,8 @@ int jbd2_journal_flush(journal_t *journal)
J_ASSERT(journal->j_head == journal->j_tail);
J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
write_unlock(&journal->j_state_lock);
- return 0;
+out:
+ return err;
}

/**
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 20e7f78..edb640a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal);
int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
unsigned long *block);
-void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
+int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);

/* Commit management */
@@ -1157,7 +1157,7 @@ extern int jbd2_journal_recover (journal_t *journal);
extern int jbd2_journal_wipe (journal_t *, int);
extern int jbd2_journal_skip_recovery (journal_t *);
extern void jbd2_journal_update_sb_errno(journal_t *);
-extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t,
+extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t,
unsigned long, int);
extern void __jbd2_journal_abort_hard (journal_t *);
extern void jbd2_journal_abort (journal_t *, int);
--
1.8.4.3




2015-06-19 14:48:31

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] jbd2: fix ocfs2 corrupt when updating journal superblock fails

This patch caused test ext4/306 to fail, because it caused resize2fs
to fail. The problem is that jbd2_cleanup_journal_tail() will return
1 if there is nothing to cleanup, and a negative error number if there
is an error. Unfortunately, this patch hunk:

On Mon, Jun 08, 2015 at 09:31:12AM +0800, Joseph Qi wrote:
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index b96bd80..6b33a42 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -1950,7 +1966,13 @@ int jbd2_journal_flush(journal_t *journal)
> return -EIO;
>
> mutex_lock(&journal->j_checkpoint_mutex);
> - jbd2_cleanup_journal_tail(journal);
> + if (!err) {
> + err = jbd2_cleanup_journal_tail(journal);
> + if (err < 0) {
> + mutex_unlock(&journal->j_checkpoint_mutex);
> + goto out;
> + }
> + }

... would let the non-negative return value leak out to
jbd2_journal_flush(), and its callers are *not* prepared to handle the
non-negative return value (since jbd2_journal_flush wasn't doing this
before.)

I've fixed this by adding a "err = 0;" after the if statement.

- Ted

2015-06-23 00:50:08

by Joseph Qi

[permalink] [raw]
Subject: Re: [PATCH] jbd2: fix ocfs2 corrupt when updating journal superblock fails

Hi Ted,
Thanks for pointing out this issue and fixing it. Returning 1 may
break callers that can only handle zero or negative return values.
I am sorry for not considering this before.

On 2015/6/19 22:48, Theodore Ts'o wrote:
> This patch caused test ext4/306 to fail, because it caused resize2fs
> to fail. The problem is that jbd2_cleanup_journal_tail() will return
> 1 if there is nothing to cleanup, and a negative error number if there
> is an error. Unfortunately, this patch hunk:
>
> On Mon, Jun 08, 2015 at 09:31:12AM +0800, Joseph Qi wrote:
>> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
>> index b96bd80..6b33a42 100644
>> --- a/fs/jbd2/journal.c
>> +++ b/fs/jbd2/journal.c
>> @@ -1950,7 +1966,13 @@ int jbd2_journal_flush(journal_t *journal)
>> return -EIO;
>>
>> mutex_lock(&journal->j_checkpoint_mutex);
>> - jbd2_cleanup_journal_tail(journal);
>> + if (!err) {
>> + err = jbd2_cleanup_journal_tail(journal);
>> + if (err < 0) {
>> + mutex_unlock(&journal->j_checkpoint_mutex);
>> + goto out;
>> + }
>> + }
>
> ... would let the non-negative return value leak out to
> jbd2_journal_flush(), and its callers are *not* prepared to handle the
> non-negative return value (since jbd2_journal_flush wasn't doing this
> before.)
>
> I've fixed this by adding a "err = 0;" after the if statement.
>
> - Ted
>
> .
>