If the journal doesn't abort when it gets an IO error in file data
blocks, the file data corruption will spread silently. Because
most applications and commands do buffered writes without fsync(),
they don't notice the IO error. That is scary for mission-critical
systems. On the other hand, if the journal aborts whenever it gets
an IO error in file data blocks, the system will easily become
inoperable. So this patch introduces a filesystem option to
determine whether it aborts the journal or just calls printk() when
it gets an IO error in file data.
If you mount an ext3 fs with the data_err=abort option, it aborts on a
file data write error. If you mount it with data_err=ignore, it doesn't
abort; it just calls printk(). data_err=abort is the default, because
people have used this error handling policy for three years.
Signed-off-by: Hidehiro Kawai <[email protected]>
---
Documentation/filesystems/ext3.txt | 5 +++++
fs/ext3/super.c | 18 ++++++++++++++++++
fs/jbd/commit.c | 2 ++
include/linux/ext3_fs.h | 2 ++
include/linux/jbd.h | 3 +++
5 files changed, 30 insertions(+)
Index: linux-2.6.27-rc1/Documentation/filesystems/ext3.txt
===================================================================
--- linux-2.6.27-rc1.orig/Documentation/filesystems/ext3.txt
+++ linux-2.6.27-rc1/Documentation/filesystems/ext3.txt
@@ -96,6 +96,11 @@ errors=remount-ro(*) Remount the filesys
errors=continue Keep going on a filesystem error.
errors=panic Panic and halt the machine if an error occurs.
+data_err=abort(*) Abort the journal if an error occurs in a file
+ data buffer in ordered mode.
+data_err=ignore Just print an error message if an error occurs
+ in a file data buffer in ordered mode.
+
grpid Give objects the same group ID as their creator.
bsdgroups
Index: linux-2.6.27-rc1/fs/ext3/super.c
===================================================================
--- linux-2.6.27-rc1.orig/fs/ext3/super.c
+++ linux-2.6.27-rc1/fs/ext3/super.c
@@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_
else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
seq_puts(seq, ",data=writeback");
+ if (!test_opt(sb, DATA_ERR_ABORT))
+ seq_puts(seq, ",data_err=ignore");
+
ext3_show_quota_options(seq, sb);
return 0;
@@ -754,6 +757,7 @@ enum {
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -796,6 +800,8 @@ static match_table_t tokens = {
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
+ {Opt_data_err_abort, "data_err=abort"},
+ {Opt_data_err_ignore, "data_err=ignore"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -1011,6 +1017,12 @@ static int parse_options (char *options,
sbi->s_mount_opt |= data_opt;
}
break;
+ case Opt_data_err_abort:
+ set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
+ case Opt_data_err_ignore:
+ clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -1600,6 +1612,8 @@ static int ext3_fill_super (struct super
else
set_opt(sbi->s_mount_opt, ERRORS_RO);
+ set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -1986,6 +2000,10 @@ static void ext3_init_journal_params(str
journal->j_flags |= JFS_BARRIER;
else
journal->j_flags &= ~JFS_BARRIER;
+ if (test_opt(sb, DATA_ERR_ABORT))
+ journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+ else
+ journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
spin_unlock(&journal->j_state_lock);
}
Index: linux-2.6.27-rc1/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.27-rc1.orig/include/linux/ext3_fs.h
+++ linux-2.6.27-rc1/include/linux/ext3_fs.h
@@ -380,6 +380,8 @@ struct ext3_inode {
#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
+ * error in ordered mode */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
Index: linux-2.6.27-rc1/include/linux/jbd.h
===================================================================
--- linux-2.6.27-rc1.orig/include/linux/jbd.h
+++ linux-2.6.27-rc1/include/linux/jbd.h
@@ -816,6 +816,9 @@ struct journal_s
#define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */
#define JFS_LOADED 0x010 /* The journal superblock has been loaded */
#define JFS_BARRIER 0x020 /* Use IDE barriers */
+#define JFS_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file
+ * data write error in ordered
+ * mode */
/*
* Function declarations for the journaling transaction and buffer
Index: linux-2.6.27-rc1/fs/jbd/commit.c
===================================================================
--- linux-2.6.27-rc1.orig/fs/jbd/commit.c
+++ linux-2.6.27-rc1/fs/jbd/commit.c
@@ -482,6 +482,8 @@ void journal_commit_transaction(journal_
printk(KERN_WARNING
"JBD: Detected IO errors while flushing file data "
"on %s\n", bdevname(journal->j_fs_dev, b));
+ if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+ journal_abort(journal, err);
err = 0;
}
In ordered mode, if a file data buffer being dirtied exists in
the committing transaction, we write the buffer to the disk, move
it from the committing transaction to the running transaction,
then dirty it. But we must not remove the buffer from the
committing transaction when the buffer couldn't be written out;
otherwise the commit would miss the error and the committing
transaction would not abort.
This patch adds an error check before removing the buffer from the
committing transaction.
Signed-off-by: Hidehiro Kawai <[email protected]>
Acked-by: Jan Kara <[email protected]>
---
This patch is the same as patch 2/5 of possible filesystem corruption
fixes (take 2). It can be found at:
http://kerneltrap.org/mailarchive/linux-kernel/2008/6/2/2002144
fs/jbd/transaction.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
Index: linux-2.6.27-rc1/fs/jbd/transaction.c
===================================================================
--- linux-2.6.27-rc1.orig/fs/jbd/transaction.c
+++ linux-2.6.27-rc1/fs/jbd/transaction.c
@@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle,
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
struct journal_head *jh;
+ int ret = 0;
if (is_handle_aborted(handle))
- return 0;
+ return ret;
jh = journal_add_journal_head(bh);
JBUFFER_TRACE(jh, "entry");
@@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle,
time if it is redirtied */
}
- /* journal_clean_data_list() may have got there first */
+ /*
+ * We cannot remove the buffer with io error from the
+ * committing transaction, because otherwise it would
+ * miss the error and the commit would not abort.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ ret = -EIO;
+ goto no_journal;
+ }
+
if (jh->b_transaction != NULL) {
JBUFFER_TRACE(jh, "unfile from commit");
__journal_temp_unlink_buffer(jh);
@@ -1108,7 +1118,7 @@ no_journal:
}
JBUFFER_TRACE(jh, "exit");
journal_put_journal_head(jh);
- return 0;
+ return ret;
}
/**
On Tue, Jul 29, 2008 at 10:52 PM, Hidehiro Kawai
<[email protected]> wrote:
> If the journal doesn't abort when it gets an IO error in file data
> blocks, the file data corruption will spread silently. Because
> most of applications and commands do buffered writes without fsync(),
> they don't notice the IO error. It's scary for mission critical
> systems. On the other hand, if the journal aborts whenever it gets
> an IO error in file data blocks, the system will easily become
> inoperable. So this patch introduces a filesystem option to
> determine whether it aborts the journal or just call printk() when
> it gets an IO error in file data.
>
> If you mount a ext3 fs with data_err=abort option, it aborts on file
> data write error. If you mount it with data_err=ignore, it doesn't
> abort, just call printk(). data_err=abort is default, because
> people have used this error handling policy for three years.
Hidehiro,
Thanks for making this configurable!
But given how surprised many of us were when we found out that
jbd/ext3 has been aborting on file data block errors, isn't this our
chance to correct that long-standing oversight? Shouldn't the default
be data_err=ignore? Or would changing this behavior cause more harm
than good?
I don't feel strongly either way, having the "data_err" option makes
this issue moot for me, but I figured I'd raise the question (in the
interest of review).
Mike
On Jul 30, 2008 11:14 -0400, Mike Snitzer wrote:
> On Tue, Jul 29, 2008 at 10:52 PM, Hidehiro Kawai
> <[email protected]> wrote:
> > If the journal doesn't abort when it gets an IO error in file data
> > blocks, the file data corruption will spread silently. Because
> > most of applications and commands do buffered writes without fsync(),
> > they don't notice the IO error. It's scary for mission critical
> > systems. On the other hand, if the journal aborts whenever it gets
> > an IO error in file data blocks, the system will easily become
> > inoperable. So this patch introduces a filesystem option to
> > determine whether it aborts the journal or just call printk() when
> > it gets an IO error in file data.
> >
> > If you mount a ext3 fs with data_err=abort option, it aborts on file
> > data write error. If you mount it with data_err=ignore, it doesn't
> > abort, just call printk(). data_err=abort is default, because
> > people have used this error handling policy for three years.
>
> Thanks for making this configurable!
>
> But given how surprised many of us were when we found out that
> jbd/ext3 has been aborting on file data blocks isn't this our chance
> to correct that long-standing oversight? Shouldn't the default be
> data_err=ignore? Or would changing this behavior cause more harm than
> good?
>
> I don't feel strongly either way, having the "data_err" option makes
> this issue moot for me, but I figured I'd raise the question (in the
> interest of review).
Yes, good point. I don't think any of the ext3 maintainers were aware
that the 3-years-old patch had introduced "abort on data error" behaviour.
The default for ext4 is only now going to errors=remount-ro from
errors=continue (as it is on ext2/3) so I think it is inconsistent to
have the journal abort on data errors when the filesystem itself does not.
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
Andreas Dilger wrote:
> On Jul 30, 2008 11:14 -0400, Mike Snitzer wrote:
>
>>On Tue, Jul 29, 2008 at 10:52 PM, Hidehiro Kawai
>><[email protected]> wrote:
>>>If you mount a ext3 fs with data_err=abort option, it aborts on file
>>>data write error. If you mount it with data_err=ignore, it doesn't
>>>abort, just call printk(). data_err=abort is default, because
>>>people have used this error handling policy for three years.
>>
>>Thanks for making this configurable!
>>
>>But given how surprised many of us were when we found out that
>>jbd/ext3 has been aborting on file data blocks isn't this our chance
>>to correct that long-standing oversight? Shouldn't the default be
>>data_err=ignore? Or would changing this behavior cause more harm than
>>good?
I asked people at Japanese server vendors which default they prefer,
and they agreed on data_err=abort. But that may not hold for
all users around the world.
>>I don't feel strongly either way, having the "data_err" option makes
>>this issue moot for me, but I figured I'd raise the question (in the
>>interest of review).
>
> Yes, good point. I don't think any of the ext3 maintainers were aware
> that the 3-years-old patch had introduced "abort on data error" behaviour.
> The default for ext4 is only now going to errors=remount-ro from
> errors=continue (as it is on ext2/3) so I think it is inconsistent to
> have the journal abort on data errors when the filesystem itself does not.
That's a good point. Well, how about setting the default depending on
the "errors" option? That is, the default would be data_err=ignore
with errors=continue and data_err=abort with errors=remount-ro/panic.
If that is confusing, I don't mind if the default is simply
data_err=ignore.
Thanks,
--
Hidehiro Kawai
Hitachi, Systems Development Laboratory
Linux Technology Center
Here is the 'data_err=ignore is the default' version of the PATCH 1/2.
Thanks,
Hidehiro Kawai
Subject: [PATCH 1/2] ext3: add an option to control error handling on file data
If the journal doesn't abort when it gets an IO error in file data
blocks, the file data corruption will spread silently. Because
most applications and commands do buffered writes without fsync(),
they don't notice the IO error. That is scary for mission-critical
systems. On the other hand, if the journal aborts whenever it gets
an IO error in file data blocks, the system will easily become
inoperable. So this patch introduces a filesystem option to
determine whether it aborts the journal or just calls printk() when
it gets an IO error in file data.
If you mount an ext3 fs with the data_err=abort option, it aborts on a
file data write error. If you mount it with data_err=ignore, it doesn't
abort; it just calls printk(). data_err=ignore is the default.
Signed-off-by: Hidehiro Kawai <[email protected]>
---
Documentation/filesystems/ext3.txt | 5 +++++
fs/ext3/super.c | 16 ++++++++++++++++
fs/jbd/commit.c | 2 ++
include/linux/ext3_fs.h | 2 ++
include/linux/jbd.h | 3 +++
5 files changed, 28 insertions(+)
Index: linux-2.6.27-rc1/Documentation/filesystems/ext3.txt
===================================================================
--- linux-2.6.27-rc1.orig/Documentation/filesystems/ext3.txt
+++ linux-2.6.27-rc1/Documentation/filesystems/ext3.txt
@@ -96,6 +96,11 @@ errors=remount-ro(*) Remount the filesys
errors=continue Keep going on a filesystem error.
errors=panic Panic and halt the machine if an error occurs.
+data_err=ignore(*) Just print an error message if an error occurs
+ in a file data buffer in ordered mode.
+data_err=abort Abort the journal if an error occurs in a file
+ data buffer in ordered mode.
+
grpid Give objects the same group ID as their creator.
bsdgroups
Index: linux-2.6.27-rc1/fs/ext3/super.c
===================================================================
--- linux-2.6.27-rc1.orig/fs/ext3/super.c
+++ linux-2.6.27-rc1/fs/ext3/super.c
@@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_
else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
seq_puts(seq, ",data=writeback");
+ if (test_opt(sb, DATA_ERR_ABORT))
+ seq_puts(seq, ",data_err=abort");
+
ext3_show_quota_options(seq, sb);
return 0;
@@ -754,6 +757,7 @@ enum {
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -796,6 +800,8 @@ static match_table_t tokens = {
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
+ {Opt_data_err_abort, "data_err=abort"},
+ {Opt_data_err_ignore, "data_err=ignore"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -1011,6 +1017,12 @@ static int parse_options (char *options,
sbi->s_mount_opt |= data_opt;
}
break;
+ case Opt_data_err_abort:
+ set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
+ case Opt_data_err_ignore:
+ clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -1986,6 +1998,10 @@ static void ext3_init_journal_params(str
journal->j_flags |= JFS_BARRIER;
else
journal->j_flags &= ~JFS_BARRIER;
+ if (test_opt(sb, DATA_ERR_ABORT))
+ journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+ else
+ journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
spin_unlock(&journal->j_state_lock);
}
Index: linux-2.6.27-rc1/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.27-rc1.orig/include/linux/ext3_fs.h
+++ linux-2.6.27-rc1/include/linux/ext3_fs.h
@@ -380,6 +380,8 @@ struct ext3_inode {
#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
+ * error in ordered mode */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
Index: linux-2.6.27-rc1/include/linux/jbd.h
===================================================================
--- linux-2.6.27-rc1.orig/include/linux/jbd.h
+++ linux-2.6.27-rc1/include/linux/jbd.h
@@ -816,6 +816,9 @@ struct journal_s
#define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */
#define JFS_LOADED 0x010 /* The journal superblock has been loaded */
#define JFS_BARRIER 0x020 /* Use IDE barriers */
+#define JFS_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file
+ * data write error in ordered
+ * mode */
/*
* Function declarations for the journaling transaction and buffer
Index: linux-2.6.27-rc1/fs/jbd/commit.c
===================================================================
--- linux-2.6.27-rc1.orig/fs/jbd/commit.c
+++ linux-2.6.27-rc1/fs/jbd/commit.c
@@ -482,6 +482,8 @@ void journal_commit_transaction(journal_
printk(KERN_WARNING
"JBD: Detected IO errors while flushing file data "
"on %s\n", bdevname(journal->j_fs_dev, b));
+ if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+ journal_abort(journal, err);
err = 0;
}