LinuxLists.cc - [PATCH 0/2] Add inode checksum support to ext4

2011-04-06 22:44:12

Subject: [PATCH 0/2] Add inode checksum support to ext4

Hi all,

I spent last week analyzing a client's corrupted ext3 image to see if I could
determine what had gone wrong and caused the filesystem to blow apart. As best
as I could tell, a data block got miswritten into a different sector ... which
happened to be an indirect block. Some time later the indirect block, which
now pointed at one of the inode tables (among other things that shouldn't ever
become file data) was loaded as part of a file write, which caused that inode
table to be blown to smithereens. Just for fun I tried reading from one of
these busted-inode files and ... failed to encounter any errors. Somehow, they
didn't find it funny that ext3 would read block numbers from a table with the
contents "ibm.com" with a straight face. Fortunately there were backups. :)

The client at this point asked if ext4 would do a better job of sanity
checking, which got me to wonder why ext4 checksums block groups but not
inodes. It's on Ted's todo list, but apparently nobody wrote any patch, so I
did. The following two patches are a first draft of adding inode checksum
support to both the kernel driver and to the various e2fsprogs.

If you have an existing ext* fs with 256-byte inodes, you ought to be able to
"tune2fs -O inode_csum /dev/XXX", fsck /dev/XXX, and mount the filesystem with
checksumming enabled. It seems to work for me (i386/x86-64), but I'm looking
for comments for improvement and perhaps some more testing (ppc64 is still
building). This inode checksum feature is not enabled by default.

--D

2011-04-06 22:45:48

by djwong

[permalink] [raw]

Subject: [PATCH 1/2] ext4: Calculate and verify inode checksums

This patch introduces to ext4 the ability to calculate and verify inode
checksums. This requires the use of a new ro compatibility flag and some
accompanying e2fsprogs patches to provide the relevant features in tune2fs and
e2fsck.

Signed-off-by: Darrick J. Wong <[email protected]>
---

fs/ext4/ext4.h | 6 ++++--
fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b..8815928 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -617,7 +617,7 @@ struct ext4_inode {
} masix2;
} osd2; /* OS dependent 2 */
__le16 i_extra_isize;
- __le16 i_pad1;
+ __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */
__le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */
__le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */
__le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */
@@ -1338,6 +1338,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
+#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM 0x0400

#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1364,7 +1365,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM)

/*
* Default values for user and/or group using reserved blocks
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1a86282..dc76f19 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,6 +42,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/crc16.h>

#include "ext4_jbd2.h"
#include "xattr.h"
@@ -52,6 +53,40 @@

#define MPAGE_DA_EXTENT_TAIL 0x01

+static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int offset = offsetof(struct ext4_inode, i_checksum);
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __u16 crc = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
+ le16_to_cpu(raw->i_extra_isize) >= 4) {
+ crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+ crc = crc16(crc, (__u8 *)&inum, sizeof(inum));
+ crc = crc16(crc, (__u8 *)raw, offset);
+ offset += sizeof(raw->i_checksum); /* skip checksum */
+ /* for checksum of struct ext4_inode do the rest...*/
+ if (ei->i_extra_isize > 4)
+ crc = crc16(crc, (__u8 *)raw + offset,
+ ei->i_extra_isize - 4);
+ }
+
+ return cpu_to_le16(crc);
+}
+
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw)
+{
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
+ (raw->i_checksum != ext4_inode_csum(inode, raw)))
+ return 0;
+
+ return 1;
+}
+
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
@@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
struct inode *inode;
- journal_t *journal = EXT4_SB(sb)->s_journal;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ journal_t *journal = sbi->s_journal;
long ret;
int block;

@@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
} else
ei->i_extra_isize = 0;

+ if (!ext4_inode_csum_verify(inode, raw_inode)) {
+ EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)",
+ le16_to_cpu(ext4_inode_csum(inode, raw_inode)),
+ le16_to_cpu(raw_inode->i_checksum));
+ ret = -EIO;
+ goto bad_inode;
+ }
+
EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
@@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_version_hi =
cpu_to_le32(inode->i_version >> 32);
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum))
+ raw_inode->i_checksum =
+ ext4_inode_csum(inode, raw_inode);
}

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

2011-04-06 22:47:35

by djwong

[permalink] [raw]

Subject: [PATCH 2/2] e2fsprogs: Add support for toggling, verifying, and fixing inode checksums

This patch adds to tune2fs the ability to toggle the inode checksum rocompat
feature flag, to e2fsck the ability to verify and correct inode checksums, and
to debugfs the ability to dump inode checksums.

Signed-off-by: Darrick J. Wong <[email protected]>
---

debugfs/debugfs.c | 6 +++++
e2fsck/pass1.c | 9 ++++++-
e2fsck/problem.c | 5 ++++
e2fsck/problem.h | 3 ++
lib/blkid/probe.h | 1 +
lib/e2p/feature.c | 2 ++
lib/ext2fs/csum.c | 59 +++++++++++++++++++++++++++++++++++++++++++++
lib/ext2fs/ext2_err.et.in | 3 ++
lib/ext2fs/ext2_fs.h | 3 ++
lib/ext2fs/ext2fs.h | 9 ++++++-
lib/ext2fs/inode.c | 17 +++++++++++++
lib/ext2fs/swapfs.c | 2 ++
misc/tune2fs.c | 18 ++++++++++++--
13 files changed, 132 insertions(+), 5 deletions(-)

diff --git a/debugfs/debugfs.c b/debugfs/debugfs.c
index e441bc5..182ac6a 100644
--- a/debugfs/debugfs.c
+++ b/debugfs/debugfs.c
@@ -509,6 +509,12 @@ static void internal_dump_inode_extra(FILE *out,
inode->i_extra_isize);
return;
}
+
+ if (current_fs->super->s_feature_ro_compat &
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM &&
+ inode->i_extra_isize > 4)
+ fprintf(out, "Inode checksum: %x\n", inode->i_checksum);
+
storage_size = EXT2_INODE_SIZE(current_fs->super) -
EXT2_GOOD_OLD_INODE_SIZE -
inode->i_extra_isize;
diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c
index 67dd986..58f24bb 100644
--- a/e2fsck/pass1.c
+++ b/e2fsck/pass1.c
@@ -365,7 +365,7 @@ static void check_inode_extra_space(e2fsck_t ctx, struct problem_context *pctx)
inode->i_extra_isize);
#endif
/* i_extra_isize must cover i_extra_isize + i_pad1 at least */
- min = sizeof(inode->i_extra_isize) + sizeof(inode->i_pad1);
+ min = sizeof(inode->i_extra_isize) + sizeof(inode->i_checksum);
max = EXT2_INODE_SIZE(sb) - EXT2_GOOD_OLD_INODE_SIZE;
/*
* For now we will allow i_extra_isize to be 0, but really
@@ -729,6 +729,13 @@ void e2fsck_pass1(e2fsck_t ctx)
}
}

+ /* Check for invalid inode checksum */
+ if (!ext2fs_inode_csum_verify(fs, ino,
+ (struct ext2_inode_large *)inode) &&
+ fix_problem(ctx, PR_1_INODE_CSUM_INVALID, &pctx))
+ e2fsck_write_inode_full(ctx, ino, inode,
+ sizeof(struct ext2_inode_large), "pass1");
+
/*
* Test for incorrect extent flag settings.
*
diff --git a/e2fsck/problem.c b/e2fsck/problem.c
index 8f0b211..6785fa3 100644
--- a/e2fsck/problem.c
+++ b/e2fsck/problem.c
@@ -890,6 +890,11 @@ static struct e2fsck_problem problem_table[] = {
"(size %Is, lblk %r)\n"),
PROMPT_CLEAR, PR_PREEN_OK },

+ /* Fast symlink has EXTENTS_FL set */
+ { PR_1_INODE_CSUM_INVALID,
+ N_("inode %i checksum invalid. "),
+ PROMPT_FIX, PR_PREEN_OK },
+
/* Pass 1b errors */

/* Pass 1B: Rescan for duplicate/bad blocks */
diff --git a/e2fsck/problem.h b/e2fsck/problem.h
index 7c4c156..388d153 100644
--- a/e2fsck/problem.h
+++ b/e2fsck/problem.h
@@ -520,6 +520,9 @@ struct problem_context {
/* EOFBLOCKS flag set when not necessary */
#define PR_1_EOFBLOCKS_FL_SET 0x010060

+/* inode checksum invalid */
+#define PR_1_INODE_CSUM_INVALID 0x010061
+
/*
* Pass 1b errors
*/
diff --git a/lib/blkid/probe.h b/lib/blkid/probe.h
index 37e80ef..e01a689 100644
--- a/lib/blkid/probe.h
+++ b/lib/blkid/probe.h
@@ -110,6 +110,7 @@ struct ext2_super_block {
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
+#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM 0x0400

/* for s_feature_incompat */
#define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002
diff --git a/lib/e2p/feature.c b/lib/e2p/feature.c
index 16fba53..876f0e1 100644
--- a/lib/e2p/feature.c
+++ b/lib/e2p/feature.c
@@ -59,6 +59,8 @@ static struct feature feature_list[] = {
"quota" },
{ E2P_FEATURE_RO_INCOMPAT, EXT4_FEATURE_RO_COMPAT_BIGALLOC,
"bigalloc"},
+ { E2P_FEATURE_RO_INCOMPAT, EXT4_FEATURE_RO_COMPAT_INODE_CSUM,
+ "inode_csum"},

{ E2P_FEATURE_INCOMPAT, EXT2_FEATURE_INCOMPAT_COMPRESSION,
"compression" },
diff --git a/lib/ext2fs/csum.c b/lib/ext2fs/csum.c
index 58e2971..3621bd7 100644
--- a/lib/ext2fs/csum.c
+++ b/lib/ext2fs/csum.c
@@ -29,6 +29,65 @@
#define STATIC static
#endif

+__u16 ext2fs_inode_csum(ext2_filsys fs, ext2_ino_t inum,
+ struct ext2_inode_large *inode)
+{
+ int offset = offsetof(struct ext2_inode_large, i_checksum);
+ int extra_size = inode->i_extra_isize;
+ size_t size = fs->super->s_inode_size;
+ __u16 crc = 0;
+
+ if (fs->super->s_feature_ro_compat &
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM &&
+ extra_size >= 4) {
+#ifdef WORDS_BIGENDIAN
+ struct ext4_inode_large swabinode;
+
+ /* Have to swab back to little-endian to do the checksum */
+ memcpy(&swabinode, inode, size);
+ ext2fs_swap_inode_full(fs, inode, inode, 0, size);
+ desc = &swabinode;
+
+ inum = ext2fs_swab32(inum);
+#endif
+ crc = ext2fs_crc16(~0, fs->super->s_uuid,
+ sizeof(fs->super->s_uuid));
+ crc = ext2fs_crc16(crc, &inum, sizeof(inum));
+ crc = ext2fs_crc16(crc, inode, offset);
+ offset += sizeof(inode->i_checksum); /* skip checksum */
+ /* for checksum of struct ext4_group_desc do the rest...*/
+ if (extra_size > 4) {
+ crc = ext2fs_crc16(crc, (char *)inode + offset,
+ extra_size - 4);
+ }
+ }
+
+ return crc;
+}
+
+int ext2fs_inode_csum_verify(ext2_filsys fs, ext2_ino_t inum,
+ struct ext2_inode_large *inode)
+{
+ if (EXT2_HAS_RO_COMPAT_FEATURE(fs->super,
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
+ (inode->i_checksum != ext2fs_inode_csum(fs, inum, inode)))
+ return 0;
+
+ return 1;
+}
+
+void ext2fs_inode_csum_set(ext2_filsys fs, ext2_ino_t inum,
+ struct ext2_inode_large *inode)
+{
+ if (!EXT2_HAS_RO_COMPAT_FEATURE(fs->super,
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM))
+ return;
+
+ /* ext2fs_bg_checksum_set() sets the actual checksum field but
+ * does not calculate the checksum itself. */
+ inode->i_checksum = ext2fs_inode_csum(fs, inum, inode);
+}
+
STATIC __u16 ext2fs_group_desc_csum(ext2_filsys fs, dgrp_t group)
{
__u16 crc = 0;
diff --git a/lib/ext2fs/ext2_err.et.in b/lib/ext2fs/ext2_err.et.in
index 995ddc3..cc9ac3d 100644
--- a/lib/ext2fs/ext2_err.et.in
+++ b/lib/ext2fs/ext2_err.et.in
@@ -422,4 +422,7 @@ ec EXT2_NO_MTAB_FILE,
ec EXT2_ET_CANT_USE_LEGACY_BITMAPS,
"Filesystem too large to use legacy bitmaps"

+ec EXT2_ET_INODE_CSUM_INVALID,
+ "Inode checksum is invalid"
+
end
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index a89e33b..dfedd4e 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -416,7 +416,7 @@ struct ext2_inode_large {
} hurd2;
} osd2; /* OS dependent 2 */
__u16 i_extra_isize;
- __u16 i_pad1;
+ __u16 i_checksum;
__u32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */
__u32 i_mtime_extra; /* extra Modification time (nsec << 2 | epoch) */
__u32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */
@@ -677,6 +677,7 @@ struct ext2_super_block {
#define EXT4_FEATURE_RO_COMPAT_HAS_SNAPSHOT 0x0080
#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
+#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM 0x0400

#define EXT2_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index d3eb31d..cbfa5a1 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -553,7 +553,8 @@ typedef struct ext2_icount *ext2_icount_t;
EXT2_FEATURE_RO_COMPAT_LARGE_FILE|\
EXT4_FEATURE_RO_COMPAT_DIR_NLINK|\
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE|\
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM)
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM |\
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM)

/*
* These features are only allowed if EXT2_FLAG_SOFTSUPP_FEATURES is passed
@@ -858,6 +859,12 @@ extern int ext2fs_super_and_bgd_loc(ext2_filsys fs,
extern void ext2fs_update_dynamic_rev(ext2_filsys fs);

/* csum.c */
+extern __u16 ext2fs_inode_csum(ext2_filsys fs, ext2_ino_t inum,
+ struct ext2_inode_large *inode);
+extern void ext2fs_inode_csum_set(ext2_filsys fs, ext2_ino_t inum,
+ struct ext2_inode_large *inode);
+extern int ext2fs_inode_csum_verify(ext2_filsys fs, ext2_ino_t inum,
+ struct ext2_inode_large *inode);
extern void ext2fs_group_desc_csum_set(ext2_filsys fs, dgrp_t group);
extern int ext2fs_group_desc_csum_verify(ext2_filsys fs, dgrp_t group);
extern errcode_t ext2fs_set_gdt_csum(ext2_filsys fs);
diff --git a/lib/ext2fs/inode.c b/lib/ext2fs/inode.c
index a762dbc..5c377b5 100644
--- a/lib/ext2fs/inode.c
+++ b/lib/ext2fs/inode.c
@@ -608,6 +608,10 @@ errcode_t ext2fs_read_inode_full(ext2_filsys fs, ext2_ino_t ino,
(struct ext2_inode_large *) inode,
0, bufsize);
#endif
+ if (bufsize >= offsetof(struct ext2_inode_large, i_ctime_extra) &&
+ !ext2fs_inode_csum_verify(fs, ino,
+ (struct ext2_inode_large *)inode))
+ return EXT2_ET_INODE_CSUM_INVALID;

/* Update the inode cache */
fs->icache->cache_last = (fs->icache->cache_last + 1) %
@@ -675,6 +679,19 @@ errcode_t ext2fs_write_inode_full(ext2_filsys fs, ext2_ino_t ino,
w_inode = &temp_inode;
memset(w_inode, 0, length);

+ if (fs->super->s_feature_ro_compat &
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM &&
+ ((struct ext2_inode_large *)inode)->i_extra_isize > 4) {
+ if (bufsize < offsetof(struct ext2_inode_large,
+ i_ctime_extra)) {
+ fprintf(stderr, "Underflow, inode %lu bufsize %u\n",
+ ino, bufsize);
+ abort();
+ }
+ ext2fs_inode_csum_set(fs, ino,
+ (struct ext2_inode_large *)inode);
+ }
+
#ifdef WORDS_BIGENDIAN
ext2fs_swap_inode_full(fs, w_inode,
(struct ext2_inode_large *) inode,
diff --git a/lib/ext2fs/swapfs.c b/lib/ext2fs/swapfs.c
index 3a43c6c..c601d8f 100644
--- a/lib/ext2fs/swapfs.c
+++ b/lib/ext2fs/swapfs.c
@@ -279,6 +279,8 @@ void ext2fs_swap_inode_full(ext2_filsys fs, struct ext2_inode_large *t,
return;
}

+ t->i_checksum = ext2fs_swab16(f->i_checksum);
+
i = sizeof(struct ext2_inode) + extra_isize + sizeof(__u32);
if (bufsize < (int) i)
return; /* no space for EA magic */
diff --git a/misc/tune2fs.c b/misc/tune2fs.c
index bcada11..957a19c 100644
--- a/misc/tune2fs.c
+++ b/misc/tune2fs.c
@@ -130,7 +130,8 @@ static __u32 ok_features[3] = {
EXT4_FEATURE_RO_COMPAT_DIR_NLINK|
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE|
EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
- EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER
+ EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER |
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM
};

static __u32 clear_ok_features[3] = {
@@ -146,7 +147,8 @@ static __u32 clear_ok_features[3] = {
EXT4_FEATURE_RO_COMPAT_HUGE_FILE|
EXT4_FEATURE_RO_COMPAT_DIR_NLINK|
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE|
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM
};

/*
@@ -448,6 +450,18 @@ static void update_feature_set(ext2_filsys fs, char *features)
}

if (FEATURE_ON(E2P_FEATURE_RO_INCOMPAT,
+ EXT4_FEATURE_RO_COMPAT_INODE_CSUM)) {
+ if (sb->s_inode_size < ext4_offsetof(struct ext2_inode_large,
+ i_ctime_extra)) {
+ fputs(_("Inode checksums are only supported when "
+ "inodes are larger than 128 bytes.\n"),
+ stderr);
+ exit(1);
+ }
+ request_fsck_afterwards(fs);
+ }
+
+ if (FEATURE_ON(E2P_FEATURE_RO_INCOMPAT,
EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
for (i = 0; i < fs->group_desc_count; i++) {
gd = ext2fs_group_desc(fs, fs->group_desc, i);

2011-04-07 00:52:51

by Sunil Mushran

[permalink] [raw]

Subject: Re: [PATCH 1/2] ext4: Calculate and verify inode checksums

On 04/06/2011 03:45 PM, Darrick J. Wong wrote:
> This patch introduces to ext4 the ability to calculate and verify inode
> checksums. This requires the use of a new ro compatibility flag and some
> accompanying e2fsprogs patches to provide the relevant features in tune2fs and
> e2fsck.
>
> Signed-off-by: Darrick J. Wong<[email protected]>
> ---
>
> fs/ext4/ext4.h | 6 ++++--
> fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 4daaf2b..8815928 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -617,7 +617,7 @@ struct ext4_inode {
> } masix2;
> } osd2; /* OS dependent 2 */
> __le16 i_extra_isize;
> - __le16 i_pad1;
> + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */
> __le32 i_ctime_extra; /* extra Change time (nsec<< 2 | epoch) */
> __le32 i_mtime_extra; /* extra Modification time(nsec<< 2 | epoch) */
> __le32 i_atime_extra; /* extra Access time (nsec<< 2 | epoch) */
> @@ -1338,6 +1338,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
> #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
> #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
> #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
> +#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM 0x0400
>
> #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
> #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
> @@ -1364,7 +1365,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
> EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
> EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
> EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
> - EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
> + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)
>
> /*
> * Default values for user and/or group using reserved blocks
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 1a86282..dc76f19 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -42,6 +42,7 @@
> #include<linux/printk.h>
> #include<linux/slab.h>
> #include<linux/ratelimit.h>
> +#include<linux/crc16.h>
>
> #include "ext4_jbd2.h"
> #include "xattr.h"
> @@ -52,6 +53,40 @@
>
> #define MPAGE_DA_EXTENT_TAIL 0x01
>
> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> + struct ext4_inode_info *ei = EXT4_I(inode);
> + int offset = offsetof(struct ext4_inode, i_checksum);
> + __le32 inum = cpu_to_le32(inode->i_ino);
> + __u16 crc = 0;
> +
> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
> + le16_to_cpu(raw->i_extra_isize)>= 4) {
> + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
> + crc = crc16(crc, (__u8 *)&inum, sizeof(inum));
> + crc = crc16(crc, (__u8 *)raw, offset);
> + offset += sizeof(raw->i_checksum); /* skip checksum */
> + /* for checksum of struct ext4_inode do the rest...*/
> + if (ei->i_extra_isize> 4)
> + crc = crc16(crc, (__u8 *)raw + offset,
> + ei->i_extra_isize - 4);
> + }
> +
> + return cpu_to_le16(crc);
> +}
> +
> +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw)
> +{
> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
> + (raw->i_checksum != ext4_inode_csum(inode, raw)))
> + return 0;
> +
> + return 1;
> +}
> +
> static inline int ext4_begin_ordered_truncate(struct inode *inode,
> loff_t new_size)
> {
> @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> struct ext4_inode *raw_inode;
> struct ext4_inode_info *ei;
> struct inode *inode;
> - journal_t *journal = EXT4_SB(sb)->s_journal;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + journal_t *journal = sbi->s_journal;
> long ret;
> int block;
>
> @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> } else
> ei->i_extra_isize = 0;
>
> + if (!ext4_inode_csum_verify(inode, raw_inode)) {
> + EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)",
> + le16_to_cpu(ext4_inode_csum(inode, raw_inode)),
> + le16_to_cpu(raw_inode->i_checksum));
> + ret = -EIO;
> + goto bad_inode;
> + }
> +
> EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
> EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
> EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
> @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle,
> raw_inode->i_version_hi =
> cpu_to_le32(inode->i_version>> 32);
> raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
> +
> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
> + EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum))
> + raw_inode->i_checksum =
> + ext4_inode_csum(inode, raw_inode);
> }
>
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

You may want to look into jbd2 buffer triggers. struct jbd2_buffer_trigger_type
Instead of computing checksum on every update, it allows one to compute it
just before the journal write. More efficient.

2011-04-07 16:40:12

by djwong

[permalink] [raw]

Subject: Re: [PATCH 1/2] ext4: Calculate and verify inode checksums

On Wed, Apr 06, 2011 at 05:52:43PM -0700, Sunil Mushran wrote:
> On 04/06/2011 03:45 PM, Darrick J. Wong wrote:
>> This patch introduces to ext4 the ability to calculate and verify inode
>> checksums. This requires the use of a new ro compatibility flag and some
>> accompanying e2fsprogs patches to provide the relevant features in tune2fs and
>> e2fsck.
>>
>> Signed-off-by: Darrick J. Wong<[email protected]>
>> ---
>>
>> fs/ext4/ext4.h | 6 ++++--
>> fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 2 files changed, 55 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 4daaf2b..8815928 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -617,7 +617,7 @@ struct ext4_inode {
>> } masix2;
>> } osd2; /* OS dependent 2 */
>> __le16 i_extra_isize;
>> - __le16 i_pad1;
>> + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */
>> __le32 i_ctime_extra; /* extra Change time (nsec<< 2 | epoch) */
>> __le32 i_mtime_extra; /* extra Modification time(nsec<< 2 | epoch) */
>> __le32 i_atime_extra; /* extra Access time (nsec<< 2 | epoch) */
>> @@ -1338,6 +1338,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>> #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
>> #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
>> #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
>> +#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM 0x0400
>>
>> #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
>> #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
>> @@ -1364,7 +1365,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>> EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
>> EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
>> EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
>> - EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
>> + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
>> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)
>>
>> /*
>> * Default values for user and/or group using reserved blocks
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 1a86282..dc76f19 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -42,6 +42,7 @@
>> #include<linux/printk.h>
>> #include<linux/slab.h>
>> #include<linux/ratelimit.h>
>> +#include<linux/crc16.h>
>>
>> #include "ext4_jbd2.h"
>> #include "xattr.h"
>> @@ -52,6 +53,40 @@
>>
>> #define MPAGE_DA_EXTENT_TAIL 0x01
>>
>> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
>> +{
>> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>> + struct ext4_inode_info *ei = EXT4_I(inode);
>> + int offset = offsetof(struct ext4_inode, i_checksum);
>> + __le32 inum = cpu_to_le32(inode->i_ino);
>> + __u16 crc = 0;
>> +
>> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
>> + le16_to_cpu(raw->i_extra_isize)>= 4) {
>> + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
>> + crc = crc16(crc, (__u8 *)&inum, sizeof(inum));
>> + crc = crc16(crc, (__u8 *)raw, offset);
>> + offset += sizeof(raw->i_checksum); /* skip checksum */
>> + /* for checksum of struct ext4_inode do the rest...*/
>> + if (ei->i_extra_isize> 4)
>> + crc = crc16(crc, (__u8 *)raw + offset,
>> + ei->i_extra_isize - 4);
>> + }
>> +
>> + return cpu_to_le16(crc);
>> +}
>> +
>> +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw)
>> +{
>> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
>> + (raw->i_checksum != ext4_inode_csum(inode, raw)))
>> + return 0;
>> +
>> + return 1;
>> +}
>> +
>> static inline int ext4_begin_ordered_truncate(struct inode *inode,
>> loff_t new_size)
>> {
>> @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>> struct ext4_inode *raw_inode;
>> struct ext4_inode_info *ei;
>> struct inode *inode;
>> - journal_t *journal = EXT4_SB(sb)->s_journal;
>> + struct ext4_sb_info *sbi = EXT4_SB(sb);
>> + journal_t *journal = sbi->s_journal;
>> long ret;
>> int block;
>>
>> @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>> } else
>> ei->i_extra_isize = 0;
>>
>> + if (!ext4_inode_csum_verify(inode, raw_inode)) {
>> + EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)",
>> + le16_to_cpu(ext4_inode_csum(inode, raw_inode)),
>> + le16_to_cpu(raw_inode->i_checksum));
>> + ret = -EIO;
>> + goto bad_inode;
>> + }
>> +
>> EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
>> EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
>> EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
>> @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle,
>> raw_inode->i_version_hi =
>> cpu_to_le32(inode->i_version>> 32);
>> raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
>> +
>> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
>> + EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum))
>> + raw_inode->i_checksum =
>> + ext4_inode_csum(inode, raw_inode);
>> }
>>
>> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>
> You may want to look into jbd2 buffer triggers. struct jbd2_buffer_trigger_type
> Instead of computing checksum on every update, it allows one to compute it
> just before the journal write. More efficient.

Yes, I see that jbd2 has triggers, looks like a nifty feature. I suppose if I
went with that approach I'd still have to calculate the checksum in
ext4_do_update_inode in the nojournal case, and in the journal case I'd write a
trigger that would figure out which inodes in a given buffer are actually dirty
and compute their checksums.

That said, I haven't really quantified the performance impact of this naive
approach yet, so I wonder -- did you see a similar scenario with ocfs2, and
what kind of performance increase did you get by adapting the code to use the
jbd2 trigger? If there's potentially a large increase, it would be interesting
to apply the same conversion to the group descriptor checksumming code too.

--D

2011-04-07 17:11:12

by Sunil Mushran

[permalink] [raw]

Subject: Re: [PATCH 1/2] ext4: Calculate and verify inode checksums

On 04/07/2011 09:40 AM, Darrick J. Wong wrote:
> Yes, I see that jbd2 has triggers, looks like a nifty feature. I suppose if I
> went with that approach I'd still have to calculate the checksum in
> ext4_do_update_inode in the nojournal case, and in the journal case I'd write a
> trigger that would figure out which inodes in a given buffer are actually dirty
> and compute their checksums.

Yes, you could use a in-mem flag (set in update, cleared in trigger) to
identify dirty inodes. The discussion on stable pages could be relevant for
the nojournal case.

>
> That said, I haven't really quantified the performance impact of this naive
> approach yet, so I wonder -- did you see a similar scenario with ocfs2, and
> what kind of performance increase did you get by adapting the code to use the
> jbd2 trigger? If there's potentially a large increase, it would be interesting
> to apply the same conversion to the group descriptor checksumming code too.

Joel Becker may remember the overhead. He wrote the patch. That said we have few
differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code
is in fs/ocfs2/blockcheck.c.

As in, you may want to generate ext4 specific numbers.

2011-04-08 08:58:30

by Andreas Dilger

[permalink] [raw]

Subject: Re: [PATCH 1/2] ext4: Calculate and verify inode checksums

On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote:
> This patch introduces to ext4 the ability to calculate and verify inode
> checksums. This requires the use of a new ro compatibility flag and some
> accompanying e2fsprogs patches to provide the relevant features in tune2fs and
> e2fsck.
>
> Signed-off-by: Darrick J. Wong <[email protected]>
> ---
>
> fs/ext4/ext4.h | 6 ++++--
> fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 4daaf2b..8815928 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -617,7 +617,7 @@ struct ext4_inode {
> } masix2;
> } osd2; /* OS dependent 2 */
> __le16 i_extra_isize;
> - __le16 i_pad1;
> + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */

In previous discussions, we thought about using part/all of the l_i_reserved2 field to store the inode checksum. The inode checksum is important enough to warrant a field in the core inode, so that it also works on upgraded filesystems.

> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> + struct ext4_inode_info *ei = EXT4_I(inode);
> + int offset = offsetof(struct ext4_inode, i_checksum);
> + __le32 inum = cpu_to_le32(inode->i_ino);
> + __u16 crc = 0;
> +
> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
> + le16_to_cpu(raw->i_extra_isize) >= 4) {

If this field remains in the large part of the inode, then this check is incorrect. i_extra_isize is itself only valid if (s_inode_size is > EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode.

Also, instead of hard-coding ">= 4" here, it would be better to use EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum). Probably no reason to put "ei" on the stack at all.

> + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
> + crc = crc16(crc, (__u8 *)&inum, sizeof(inum));
> + crc = crc16(crc, (__u8 *)raw, offset);
> + offset += sizeof(raw->i_checksum); /* skip checksum */
> + /* for checksum of struct ext4_inode do the rest...*/
> + if (ei->i_extra_isize > 4)
> + crc = crc16(crc, (__u8 *)raw + offset,
> + ei->i_extra_isize - 4);

> + }
> +
> + return cpu_to_le16(crc);
> +}
> +
> +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw)
> +{
> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
> + (raw->i_checksum != ext4_inode_csum(inode, raw)))
> + return 0;
> +
> + return 1;
> +}
> +
> static inline int ext4_begin_ordered_truncate(struct inode *inode,
> loff_t new_size)
> {
> @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> struct ext4_inode *raw_inode;
> struct ext4_inode_info *ei;
> struct inode *inode;
> - journal_t *journal = EXT4_SB(sb)->s_journal;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + journal_t *journal = sbi->s_journal;
> long ret;
> int block;
>
> @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> } else
> ei->i_extra_isize = 0;
>
> + if (!ext4_inode_csum_verify(inode, raw_inode)) {
> + EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)",
> + le16_to_cpu(ext4_inode_csum(inode, raw_inode)),
> + le16_to_cpu(raw_inode->i_checksum));
> + ret = -EIO;
> + goto bad_inode;
> + }
> +
> EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
> EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
> EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
> @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle,
> raw_inode->i_version_hi =
> cpu_to_le32(inode->i_version >> 32);
> raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
> +
> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
> + EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum))
> + raw_inode->i_checksum =
> + ext4_inode_csum(inode, raw_inode);
> }
>
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

Cheers, Andreas

2011-04-08 09:14:08

by Andreas Dilger

[permalink] [raw]

Subject: Re: [PATCH 2/2] e2fsprogs: Add support for toggling, verifying, and fixing inode checksums

On 2011-04-06, at 4:47 PM, Darrick J. Wong wrote:
> This patch adds to tune2fs the ability to toggle the inode checksum rocompat
> feature flag, to e2fsck the ability to verify and correct inode checksums, and
> to debugfs the ability to dump inode checksums.
>
> Signed-off-by: Darrick J. Wong <[email protected]>
> ---
> @@ -729,6 +729,13 @@ void e2fsck_pass1(e2fsck_t ctx)
> + /* Check for invalid inode checksum */
> + if (!ext2fs_inode_csum_verify(fs, ino,
> + (struct ext2_inode_large *)inode) &&
> + fix_problem(ctx, PR_1_INODE_CSUM_INVALID, &pctx))
> + e2fsck_write_inode_full(ctx, ino, inode,
> + sizeof(struct ext2_inode_large), "pass1");

If we just correct the checksum when it is found to be incorrect, then there is relatively little benefit in having it at all? The default action in this case would likely be to declare the inode invalid and clears it, but there also needs to be a fallback option that declares the only checksum invalid and corrects it.

Do you have an e2fsck testcase for this code, to show that it detects/fixes inodes with data corruption, and to fix the checksums after the ROCOMPAT flag is set the first time?

With the "ibadness" patch in our tree, the bad checksum should be a significant factor in marking the inode as garbage, but possibly not enough to have it thrown out if there are no other errors in the inode.

> @@ -890,6 +890,11 @@ static struct e2fsck_problem problem_table[] = {
> "(size %Is, lblk %r)\n"),
> PROMPT_CLEAR, PR_PREEN_OK },
>
> + /* Fast symlink has EXTENTS_FL set */
> + { PR_1_INODE_CSUM_INVALID,
> + N_("inode %i checksum invalid. "),

The comment for each problem should exactly mirror the text that is printed. In this case, you haven't used the abbreviations "@i" and "@n", which would normally make it much harder to search for this error string in the code, but also simplifies the translation of the message.

Cheers, Andreas

2011-04-08 18:50:29

by Joel Becker

[permalink] [raw]

Subject: Re: [PATCH 1/2] ext4: Calculate and verify inode checksums

On Thu, Apr 07, 2011 at 10:10:52AM -0700, Sunil Mushran wrote:
> On 04/07/2011 09:40 AM, Darrick J. Wong wrote:
> >That said, I haven't really quantified the performance impact of this naive
> >approach yet, so I wonder -- did you see a similar scenario with ocfs2, and
> >what kind of performance increase did you get by adapting the code to use the
> >jbd2 trigger? If there's potentially a large increase, it would be interesting
> >to apply the same conversion to the group descriptor checksumming code too.
>
> Joel Becker may remember the overhead. He wrote the patch. That said we have few
> differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code
> is in fs/ocfs2/blockcheck.c.

ocfs2 does the journal access/journal dirty cycle a lot more
than extN. I think you'd want to generate your own numbers.

Joel

--

"Conservative, n. A statesman who is enamoured of existing evils,
as distinguished from the Liberal, who wishes to replace them
with others."
- Ambrose Bierce, The Devil's Dictionary

http://www.jlbec.org/
[email protected]

2011-04-08 19:12:54

by djwong

[permalink] [raw]

Subject: Re: [PATCH 1/2] ext4: Calculate and verify inode checksums

On Fri, Apr 08, 2011 at 02:58:27AM -0600, Andreas Dilger wrote:
> On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote:
> > This patch introduces to ext4 the ability to calculate and verify inode
> > checksums. This requires the use of a new ro compatibility flag and some
> > accompanying e2fsprogs patches to provide the relevant features in tune2fs and
> > e2fsck.
> >
> > Signed-off-by: Darrick J. Wong <[email protected]>
> > ---
> >
> > fs/ext4/ext4.h | 6 ++++--
> > fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> > 2 files changed, 55 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index 4daaf2b..8815928 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -617,7 +617,7 @@ struct ext4_inode {
> > } masix2;
> > } osd2; /* OS dependent 2 */
> > __le16 i_extra_isize;
> > - __le16 i_pad1;
> > + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */
>
> In previous discussions, we thought about using part/all of the l_i_reserved2
> field to store the inode checksum. The inode checksum is important enough to
> warrant a field in the core inode, so that it also works on upgraded
> filesystems.

Hmm, yes, I better like using that field too, it'll simplify the code. Does
anyone know which of the s_creator_os codes map to the Linux format? I'm going
to guess 1 and 2 don't... for now I suppose the mount code can check for a
Linux creator code if the inode checksum feature is set, and fail the mount if
it finds 1 or 2. I'm not sure which version of the osd2 union FreeBSD or
"Lites" use.

> > +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
> > +{
> > + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> > + struct ext4_inode_info *ei = EXT4_I(inode);
> > + int offset = offsetof(struct ext4_inode, i_checksum);
> > + __le32 inum = cpu_to_le32(inode->i_ino);
> > + __u16 crc = 0;
> > +
> > + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
> > + le16_to_cpu(raw->i_extra_isize) >= 4) {
>
> If this field remains in the large part of the inode, then this check is
> incorrect. i_extra_isize is itself only valid if (s_inode_size is >
> EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode.
>
> Also, instead of hard-coding ">= 4" here, it would be better to use
> EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum). Probably no reason to
> put "ei" on the stack at all.

As always, thank you for the feedback!

--D

2011-04-08 19:25:41

by djwong

[permalink] [raw]

Subject: Re: [PATCH 2/2] e2fsprogs: Add support for toggling, verifying, and fixing inode checksums

On Fri, Apr 08, 2011 at 03:14:04AM -0600, Andreas Dilger wrote:
> On 2011-04-06, at 4:47 PM, Darrick J. Wong wrote:
> > This patch adds to tune2fs the ability to toggle the inode checksum rocompat
> > feature flag, to e2fsck the ability to verify and correct inode checksums, and
> > to debugfs the ability to dump inode checksums.
> >
> > Signed-off-by: Darrick J. Wong <[email protected]>
> > ---
> > @@ -729,6 +729,13 @@ void e2fsck_pass1(e2fsck_t ctx)
> > + /* Check for invalid inode checksum */
> > + if (!ext2fs_inode_csum_verify(fs, ino,
> > + (struct ext2_inode_large *)inode) &&
> > + fix_problem(ctx, PR_1_INODE_CSUM_INVALID, &pctx))
> > + e2fsck_write_inode_full(ctx, ino, inode,
> > + sizeof(struct ext2_inode_large), "pass1");
>
> If we just correct the checksum when it is found to be incorrect, then there
> is relatively little benefit in having it at all? The default action in this
> case would likely be to declare the inode invalid and clears it, but there
> also needs to be a fallback option that declares the only checksum invalid
> and corrects it.
>
> Do you have an e2fsck testcase for this code, to show that it detects/fixes
> inodes with data corruption, and to fix the checksums after the ROCOMPAT flag
> is set the first time?

Not yet; I suspected that some clarification of exactly that issue was needed.
It looks to me that in general the checksum will be zero for the "flag is
enabled but no checksum has yet been provided" case, and nonzero in the "inode
is corrupt" case. So if e2fsck sees zero it'd first ask to correct the
checksum, and if it sees nonzero it'll first ask to clear the inode. If the
user answers no to the first question, e2fsck can then propose the second
option.

> With the "ibadness" patch in our tree, the bad checksum should be a
> significant factor in marking the inode as garbage, but possibly not enough
> to have it thrown out if there are no other errors in the inode.

Or e2fsck could use that heuristic; which tree is the ibadness patch in?
Google shows a patch from 2008, but no recent discussion.

Something along the lines of: if the inode is not very bad, ask first to fix
the checksum and second to clear the inode; if the inode seems bad, ask first
to clear it and second to fix the checksum.

> > @@ -890,6 +890,11 @@ static struct e2fsck_problem problem_table[] = {
> > "(size %Is, lblk %r)\n"),
> > PROMPT_CLEAR, PR_PREEN_OK },
> >
> > + /* Fast symlink has EXTENTS_FL set */
> > + { PR_1_INODE_CSUM_INVALID,
> > + N_("inode %i checksum invalid. "),
>
> The comment for each problem should exactly mirror the text that is printed.
> In this case, you haven't used the abbreviations "@i" and "@n", which would
> normally make it much harder to search for this error string in the code, but
> also simplifies the translation of the message.

Oops, comment blooper that was a thinko on my part. What would the @n be for?

--D

2011-04-08 19:28:38

by Mingming Cao

[permalink] [raw]

Subject: Re: [PATCH 0/2] Add inode checksum support to ext4

On Wed, 2011-04-06 at 15:44 -0700, Darrick J. Wong wrote:
> Hi all,
>
> I spent last week analyzing a client's corrupted ext3 image to see if I could
> determine what had gone wrong and caused the filesystem to blow apart. As best
> as I could tell, a data block got miswritten into a different sector ... which
> happened to be an indirect block. Some time later the indirect block, which
> now pointed at one of the inode tables (among other things that shouldn't ever
> become file data) was loaded as part of a file write, which caused that inode
> table to be blown to smithereens. Just for fun I tried reading from one of
> these busted-inode files and ... failed to encounter any errors. Somehow, they
> didn't find it funny that ext3 would read block numbers from a table with the
> contents "ibm.com" with a straight face. Fortunately there were backups. :)
>
> The client at this point asked if ext4 would do a better job of sanity
> checking, which got me to wonder why ext4 checksums block groups but not
> inodes. It's on Ted's todo list, but apparently nobody wrote any patch, so I
> did. The following two patches are a first draft of adding inode checksum
> support to both the kernel driver and to the various e2fsprogs.
>

We had some discussion about this week at SF (at the ext4 bof at the
linux colloboration summit). Beyond checksumming the inode itself, it
would be more useful to checksum the extent indexing blocks, as the ext3
corruption actually happen at the indirect block.

The idea is to reduce the eh_max (the max # of extents stored in this
block) to save some space to store the checksums in the block,

/*
* Each block (leaves and indexes), even inode-stored has header.
*/
struct ext4_extent_header {
__le16 eh_magic; /* probably will support different
formats */
__le16 eh_entries; /* number of valid entries */
__le16 eh_max; /* capacity of store in entries */
__le16 eh_depth; /* has tree real underlying blocks? */
__le32 eh_generation; /* generation of the tree */
};
This would make us a RO feature to checksum the leaves and indexes
blocks too.

> If you have an existing ext* fs with 256-byte inodes, you ought to be able to
> "tune2fs -O inode_csum /dev/XXX", fsck /dev/XXX, and mount the filesystem with
> checksumming enabled. It seems to work for me (i386/x86-64), but I'm looking
> for comments for improvement and perhaps some more testing (ppc64 is still
> building). This inode checksum feature is not enabled by default.
>
> --D
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2011-04-08 19:30:36

by djwong

[permalink] [raw]

Subject: Re: [PATCH 1/2] ext4: Calculate and verify inode checksums

On Fri, Apr 08, 2011 at 11:50:13AM -0700, Joel Becker wrote:
> On Thu, Apr 07, 2011 at 10:10:52AM -0700, Sunil Mushran wrote:
> > On 04/07/2011 09:40 AM, Darrick J. Wong wrote:
> > >That said, I haven't really quantified the performance impact of this naive
> > >approach yet, so I wonder -- did you see a similar scenario with ocfs2, and
> > >what kind of performance increase did you get by adapting the code to use the
> > >jbd2 trigger? If there's potentially a large increase, it would be interesting
> > >to apply the same conversion to the group descriptor checksumming code too.
> >
> > Joel Becker may remember the overhead. He wrote the patch. That said we have few
> > differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code
> > is in fs/ocfs2/blockcheck.c.

Heh, yes, ext4 uses a fairly simple crc16 and the inodes are (most likely) not
block sized.

> ocfs2 does the journal access/journal dirty cycle a lot more
> than extN. I think you'd want to generate your own numbers.

Ok, I ran both the mailserver ffsb profile and a quick-and-dumb test that tried
to dirty inodes as fast as it could. On both a regular disk, an SSD, and a
loopmounted ext4 on a tmpfs I couldn't really see much of a performance
difference at all. I'll see about giving this a try once I get the field
location and e2fsck behavior more firmly resolved, though I suspect I won't see
much gain.

--D

2011-04-08 20:17:17

by Joel Becker

[permalink] [raw]

Subject: Re: [PATCH 0/2] Add inode checksum support to ext4

On Fri, Apr 08, 2011 at 12:27:48PM -0700, Mingming Cao wrote:
> We had some discussion about this week at SF (at the ext4 bof at the
> linux colloboration summit). Beyond checksumming the inode itself, it
> would be more useful to checksum the extent indexing blocks, as the ext3
> corruption actually happen at the indirect block.

You want to checksum all metadata. ocfs2 takes advantage of
this because we can just EIO any bad metadata and continue to serve the
rest of the filesystem.

Joel

--

"Time is an illusion, lunchtime doubly so."
-Douglas Adams

http://www.jlbec.org/
[email protected]

2011-04-08 22:49:56

by Andreas Dilger

[permalink] [raw]

Subject: Re: [PATCH 1/2] ext4: Calculate and verify inode checksums

On 2011-04-08, at 1:12 PM, Darrick J. Wong wrote:
> On Fri, Apr 08, 2011 at 02:58:27AM -0600, Andreas Dilger wrote:
>> On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote:
>>> @@ -617,7 +617,7 @@ struct ext4_inode {
>>> } masix2;
>>> } osd2; /* OS dependent 2 */
>>> __le16 i_extra_isize;
>>> - __le16 i_pad1;
>>> + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */
>>
>> In previous discussions, we thought about using part/all of the l_i_reserved2
>> field to store the inode checksum. The inode checksum is important enough to
>> warrant a field in the core inode, so that it also works on upgraded
>> filesystems.
>
> Hmm, yes, I better like using that field too, it'll simplify the code. Does
> anyone know which of the s_creator_os codes map to the Linux format? I'm going
> to guess 1 and 2 don't... for now I suppose the mount code can check for a
> Linux creator code if the inode checksum feature is set, and fail the mount if
> it finds 1 or 2. I'm not sure which version of the osd2 union FreeBSD or
> "Lites" use.

Actually, the e2fsprogs code has already removed the "masix" part of the union years ago, and I'm not even sure why the "hurd" part of the union remains in the kernel code. I think only Hurd is using the "h_i_author" field, so I suspect all that is needed is to refuse tune2fs to enable EXT4_FEATURE_RO_COMPAT_INODE_CSUM if s_creator_os == EXT2_OS_HURD (1).

The kernel shouldn't really care about this field - if the checksum feature isn't enabled, then this field will be ignored, and if the checksum feature IS enabled it could only have been done by mke2fs, or tune2fs which should check if it is allowed based on s_creator_os.

>>> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
>>> +{
>>> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>>> + struct ext4_inode_info *ei = EXT4_I(inode);
>>> + int offset = offsetof(struct ext4_inode, i_checksum);
>>> + __le32 inum = cpu_to_le32(inode->i_ino);
>>> + __u16 crc = 0;
>>> +
>>> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>>> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
>>> + le16_to_cpu(raw->i_extra_isize) >= 4) {
>>
>> If this field remains in the large part of the inode, then this check is
>> incorrect. i_extra_isize is itself only valid if (s_inode_size is >
>> EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode.
>>
>> Also, instead of hard-coding ">= 4" here, it would be better to use
>> EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum). Probably no reason to
>> put "ei" on the stack at all.
>
> As always, thank you for the feedback!
>
> --D

Cheers, Andreas

2011-04-08 23:13:18

by Andreas Dilger

[permalink] [raw]

Subject: Re: [PATCH 2/2] e2fsprogs: Add support for toggling, verifying, and fixing inode checksums

On 2011-04-08, at 1:25 PM, Darrick J. Wong wrote:
> On Fri, Apr 08, 2011 at 03:14:04AM -0600, Andreas Dilger wrote:
>> Do you have an e2fsck testcase for this code, to show that it detects/fixes
>> inodes with data corruption, and to fix the checksums after the ROCOMPAT flag
>> is set the first time?
>
> Not yet; I suspected that some clarification of exactly that issue was needed.
> It looks to me that in general the checksum will be zero for the "flag is
> enabled but no checksum has yet been provided" case, and nonzero in the "inode
> is corrupt" case. So if e2fsck sees zero it'd first ask to correct the
> checksum, and if it sees nonzero it'll first ask to clear the inode. If the
> user answers no to the first question, e2fsck can then propose the second
> option.

Seems reasonable, though it is possible that the inode checksums can also become invalid due to changing the filesystem UUID. This should probably be handled by tune2fs when the UUID is changed, with an extra prompt if INODE_CSUM is enabled to indicate that the conversion may take a long time.

Looking at the checksum algorithm you used, the inode checksum does not change if the inode is relocated due to resize (i.e. it uses the inode number and not the underlying block number). This is convenient, and does not impact the correctness in any way - if the wrong block is read/written then the inode number used in the checksum will not match either.

>> With the "ibadness" patch in our tree, the bad checksum should be a
>> significant factor in marking the inode as garbage, but possibly not enough
>> to have it thrown out if there are no other errors in the inode.
>
> Or e2fsck could use that heuristic; which tree is the ibadness patch in?
> Google shows a patch from 2008, but no recent discussion.

There is a relatively up-to-date version at
http://git.whamcloud.com/?p=tools/e2fsprogs.git;a=blob_plain;f=patches/e2fsprogs-ibadness-counter.patch;hb=8dd11ed9bdf0914d57d78d0c387bd21f747c1d29

> Something along the lines of: if the inode is not very bad, ask first to fix
> the checksum and second to clear the inode; if the inode seems bad, ask first
> to clear it and second to fix the checksum.

Yes, that is what I was thinking. The real question is why the checksum would be bad in the case of no other "badness"? If it is due to the UUID, that should be handled when the UUID is changed, and if it is due to a misplaced write (i.e. bad inode number) then it will help us to distinguish between the "real" inode and the misplaced "bad" inode.

>>> @@ -890,6 +890,11 @@ static struct e2fsck_problem problem_table[] = {
>>> "(size %Is, lblk %r)\n"),
>>> PROMPT_CLEAR, PR_PREEN_OK },
>>>
>>> + /* Fast symlink has EXTENTS_FL set */
>>> + { PR_1_INODE_CSUM_INVALID,
>>> + N_("inode %i checksum invalid. "),
>>
>> The comment for each problem should exactly mirror the text that is printed.
>> In this case, you haven't used the abbreviations "@i" and "@n", which would
>> normally make it much harder to search for this error string in the code, but
>> also simplifies the translation of the message.
>
> Oops, comment blooper that was a thinko on my part. What would the @n be for?

@i is "inode", @n is "invalid", per e2fsck/message.c.

Cheers, Andreas

2011-04-12 02:05:31

by djwong

[permalink] [raw]

Subject: Re: [PATCH 2/2] e2fsprogs: Add support for toggling, verifying, and fixing inode checksums

On Fri, Apr 08, 2011 at 05:13:13PM -0600, Andreas Dilger wrote:
> On 2011-04-08, at 1:25 PM, Darrick J. Wong wrote:
> > On Fri, Apr 08, 2011 at 03:14:04AM -0600, Andreas Dilger wrote:
> >> Do you have an e2fsck testcase for this code, to show that it detects/fixes
> >> inodes with data corruption, and to fix the checksums after the ROCOMPAT flag
> >> is set the first time?
> >
> > Not yet; I suspected that some clarification of exactly that issue was needed.
> > It looks to me that in general the checksum will be zero for the "flag is
> > enabled but no checksum has yet been provided" case, and nonzero in the "inode
> > is corrupt" case. So if e2fsck sees zero it'd first ask to correct the
> > checksum, and if it sees nonzero it'll first ask to clear the inode. If the
> > user answers no to the first question, e2fsck can then propose the second
> > option.
>
> Seems reasonable, though it is possible that the inode checksums can also
> become invalid due to changing the filesystem UUID. This should probably be
> handled by tune2fs when the UUID is changed, with an extra prompt if
> INODE_CSUM is enabled to indicate that the conversion may take a long time.

Yes, sounds reasonable. I guess we'd have to verify all the inode checksums
before changing the UUID, change the UUID, and then set new checksums. If the
pre-verification step fails, demand e2fsck and don't write anything.

> Looking at the checksum algorithm you used, the inode checksum does not
> change if the inode is relocated due to resize (i.e. it uses the inode number
> and not the underlying block number). This is convenient, and does not
> impact the correctness in any way - if the wrong block is read/written then
> the inode number used in the checksum will not match either.
>
> >> With the "ibadness" patch in our tree, the bad checksum should be a
> >> significant factor in marking the inode as garbage, but possibly not enough
> >> to have it thrown out if there are no other errors in the inode.
> >
> > Or e2fsck could use that heuristic; which tree is the ibadness patch in?
> > Google shows a patch from 2008, but no recent discussion.
>
> There is a relatively up-to-date version at
> http://git.whamcloud.com/?p=tools/e2fsprogs.git;a=blob_plain;f=patches/e2fsprogs-ibadness-counter.patch;hb=8dd11ed9bdf0914d57d78d0c387bd21f747c1d29

Ok, I'll pull that into my tree.

> > Something along the lines of: if the inode is not very bad, ask first to fix
> > the checksum and second to clear the inode; if the inode seems bad, ask first
> > to clear it and second to fix the checksum.
>
> Yes, that is what I was thinking. The real question is why the checksum
> would be bad in the case of no other "badness"? If it is due to the UUID,
> that should be handled when the UUID is changed, and if it is due to a
> misplaced write (i.e. bad inode number) then it will help us to distinguish
> between the "real" inode and the misplaced "bad" inode.

Agreed. I discovered another problem is that there seem to be a number of
places where e2fsck loads only the first 128 bytes out of an inode, checks it,
and then writes out a "corrected" 128 byte inode. Obviously e2fsck needs to be
changed to read in the full inode size (whatever that is) and write out the
same amount, though this will probably result in a lot of e2fsck code churn.

I added the ability for e2fsck to zero out the checksum if it finds a Linux
ext* fs and the checksum feature disabled.

Mingming was also wondering if we ought to save some rocompat bits and combine
all the current checksumming proposals (extent tree, bitmaps) under one
rocompat bit? Sounds like a decent idea to me.

By the way, I've been uploading my notes about on-disk layout to the wiki:
https://ext4.wiki.kernel.org/index.php/Ext4_Disk_Layout

(Not sure if it's 100% clueful, but there we go.)

--D

>
> >>> @@ -890,6 +890,11 @@ static struct e2fsck_problem problem_table[] = {
> >>> "(size %Is, lblk %r)\n"),
> >>> PROMPT_CLEAR, PR_PREEN_OK },
> >>>
> >>> + /* Fast symlink has EXTENTS_FL set */
> >>> + { PR_1_INODE_CSUM_INVALID,
> >>> + N_("inode %i checksum invalid. "),
> >>
> >> The comment for each problem should exactly mirror the text that is printed.
> >> In this case, you haven't used the abbreviations "@i" and "@n", which would
> >> normally make it much harder to search for this error string in the code, but
> >> also simplifies the translation of the message.
> >
> > Oops, comment blooper that was a thinko on my part. What would the @n be for?
>
> @i is "inode", @n is "invalid", per e2fsck/message.c.
>
> Cheers, Andreas
>
>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2011-04-20 17:41:08

by Andi Kleen

[permalink] [raw]

Subject: Re: [PATCH 0/2] Add inode checksum support to ext4

"Darrick J. Wong" <[email protected]> writes:

FWIW I had a similar idea quite some time ago and implemented checksums for
superblocks and inodes. I never posted it because I didn't get around
to write the e2fsprogs support and do a lot of performance testing.
There was one result that showed a slow down in quick tests, but
I suspect it was a fluke and probably needs to be redone. In other
tests it was neutral.

Anyways here's the old patch if anyone is interested (against a ~.35ish kernel)
I can forward port it if there's interest.

IMHO it's a good idea but it should be done for the super blocks too
(and ideally for all objects, although unfortunately that breaks
the disk format)

The locking for stable buffers is still something that needs to be
double checked.

-Andi

commit 8ece10cfa4b148075dbb93ca65855a7e2aad7b07
Author: Andi Kleen <[email protected]>
Date: Mon May 31 18:27:29 2010 +0200

EXT4: Add checksums to the super block and the inodes

Currently when a on-disk structure is corrupted ext4 usually
detects it only by some internal inconsistency, but this can happen
too late or give confusing error messages.

One way to detect corruptions sooner is to use checksums. This
means the file system can stop using a corrupted object ASAP, not
follow any potentially corrupted pointers to other blocks,
and also give a clear message on what happened.

Potentially it can also be used to limit read only mounts and
forced fscks. When the extent of a corruption can be detected
accurately using checksums and only force errors on that
particular object (this is right now not enabled though
because there are still too many objects with missing checksums)

There's a general trend in file systems recently to add metadata
checksums, this just makes ext4 catch up a bit (in addition
to the already existing transaction and block group checksums)

Another advantage is that the checksum can also check for misplaced
writes (by including the intended block location) and objects
left over from before mkfs (by including the file system UUID).

Adding checksums to all metadata in ext4 would likely need some
incompatible format changes, but it's relatively straight-forward to add
them to inodes and the super block at least. This patch-kit does this.

Again it only handles some meta-data objects. This is not a full
user-data checksumming feature or not even a "all metadata objects"
feature.

I didn't do a lot of research into different CRC functions, but simply
used the same one as btrfs and iSCSI (crc32c). ocfs2 uses ECCs
that have some self-correcting ability, but that seemed overkill to me.
The standard CRC has the advantage that the CRC is accelerated by hardware
instructions on newer Intel CPUs and possibly on other platforms too.
The CRC32C function is also hard coded, although in theory it could be made
configurable per file system. But since that would lead to a multitude of
incompatible formats I decided to simply hard code for now.

Right now there is no e2fsprogs support, so fsck doesn't know about CRCs.

For now the checksums can be enabled with a simple shell script
(which is just a wrapper around debugfs):

wget ftp://firstfloor.org/pub/ak/shell/e2checksum
chmod +x e2checksum

To enable:

./e2checksum /dev/device enable

To disable:

./e2checksum /dev/device disable

This works with the current ext4 e2fsprogs without any changes.

The kernel will automatically add checksums to the file system when the
feature is turned on, as the inodes are rewritten.

WARNING: Note there is no fsck support currently, so before you run fsck better
disable the checksums. After disabling/changing/reenabling
you may also need to mount with ignore_crc until the checksums
are fixed up again.

The checksums are implemented as a read-only compat feature: this means
the a file system with checksums enabled can be read by a kernel that
does not know about checksums, but not written. Of course you can turn
off the flag again to make the file system write compatible again (but
that will put the checksums into a inconsistent state) or use the
ignore_crc mount option.

WARNING: the in-kernel upgrader relies on the checksum fields being
zero when checksums are enabled. When you turn on the feature, use the
file system, turn it off again, use it again and turn it on again the
checksums will be in a inconsistent state. Right now this can be
fixed by mounting with -o ignore_crc and then doing a find | xargs touch
on the file system.

I expect real e2fsprogs support can do that better.

The code also adds some more sanity checks on inodes to distingush zeroed
inodes from inodes with no checksums. This is currently done by enforcing
that a/m/ctime are not zero. If there are broken file systems around
where that is not true, the sanity check can be turned off (see below)

There are two new mount options:

ignore_crc Ignore the CRCs on reading but still update them
noinode_sanity Don't do new inode sanity checks

The implementation is not particularly optimized: it always recomputes
the full CRC on each inode or super block write. Some more optimizations
would be possible in this area.

BENCHMARKS

I ran kernelbench and it didn't show any significant difference between
the run with checksums and the run without.

I also tried timing fsstress from XFS/LTP, and it gave a 35% slowdown
on a disk and 5% slow down on the ramdisk for the system time
during the test. However I'm a bit suspicious of these results.
This was on a core 2.

I also tested on a Nehalem system which has CRC acceleration instructions
in the CPU.

Anyone has a better inode metadata intensive benchmark to run?

This gobbled up the last mount flag bits, but there are still some unused
holes in the middle.

The patch is definitely still experimental and needs more testing and
review and e2fsprogs support. In particular I would appreciate review
of my bh locking scheme.

Also ext4_abort() now takes the super lock, which implies that the callers
shouldn't. I think I checked all callers that it's safe, but double checking
that would be good.

Opens:
- e2fsprogs support. In this case the zero crc hack could be removed from
the super block code at least.
- Right now checksum numbers by default lead to a read-only file system remount
and are also logged as "IO error". This could be made more gentle, because
a checksum catches problems early enough that continuation might be possible.
- Incremental checksumming
- Checksums for extents and directories
- In ignore_crc mode the checksums are only rewritten when the inode is somehow
dirtied otherwise. Do this implicitely? Might be better to move this to e2fsprogs

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index e1def17..e98141a 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -359,6 +359,14 @@ nodiscard(*) commands to the underlying block device when
and sparse/thinly-provisioned LUNs, but it is off
by default until sufficient testing has been done.

+nosanity_check Don't check for zero m/c/a times when reading a inode.
+ Should not normally be needed.
+
+ignore_crc Ignore checksum failures while reading the super block
+ or inodes, but still update the checksums on writing
+ (if you don't want to update the checksums, clear the
+ checksum feature bit in the super block)
+
Data Mode
=========
There are 3 different data modes:
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 19a4de5..3a99769 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -611,6 +611,7 @@ struct ext4_inode {
__le32 i_crtime; /* File Creation time */
__le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
__le32 i_version_hi; /* high 32 bits for 64-bit version */
+ __le32 i_crc; /* CRC32 for this inode */
};

struct move_extent {
@@ -836,6 +837,12 @@ struct ext4_inode_info {
*/
tid_t i_sync_tid;
tid_t i_datasync_tid;
+
+ /*
+ * Protect raw inode modifications, mostly to ensure
+ * stability for checksumming.
+ */
+ spinlock_t raw_inode_lock;
};

/*
@@ -881,10 +888,12 @@ struct ext4_inode_info {
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
+#define EXT4_MOUNT_IGNORE_CRC 0x4000000 /* Ignore CRCs on read */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INODE_SANITY 0x80000000 /* Inode sanity check */

#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -1003,7 +1012,8 @@ struct ext4_super_block {
__u8 s_reserved_char_pad2;
__le16 s_reserved_pad;
__le64 s_kbytes_written; /* nr of lifetime kilobytes written */
- __u32 s_reserved[160]; /* Padding to the end of the block */
+ __le32 s_crc; /* CRC32 for this super block */
+ __u32 s_reserved[159]; /* Padding to the end of the block */
};

#ifdef __KERNEL__
@@ -1051,6 +1061,7 @@ struct ext4_sb_info {
u32 s_hash_seed[4];
int s_def_hash_version;
int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
+ u8 s_uuid[16];
struct percpu_counter s_freeblocks_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
@@ -1265,6 +1276,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
+#define EXT4_FEATURE_RO_COMPAT_SBI_CRC 0x0080 /* sb and inode CRCs */

#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1291,7 +1303,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+ EXT4_FEATURE_RO_COMPAT_SBI_CRC)

/*
* Default values for user and/or group using reserved blocks
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 42272d6..8ba2f24 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/crc32c.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -72,6 +73,141 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

+#define ZERO_MAGIC 1
+
+static __le32 inode_crc(struct super_block *sb, struct ext4_inode *raw_inode, long ino)
+{
+ struct ext4_csum_header {
+ __le64 ino;
+ __le64 pad;
+ u8 uuid[16];
+ } __attribute__((aligned)) hdr;
+ u32 crc;
+
+ /*
+ * First CRC in the inode number and the file system UUID
+ * to detect inodes from before the last mkfs and misplaced inode
+ * writes.
+ */
+ hdr.ino = cpu_to_le64(ino);
+ memcpy(&hdr.uuid, EXT4_SB(sb)->s_uuid, 16);
+ hdr.pad = 0;
+
+ raw_inode->i_crc = 0;
+ crc = crc32c(~0U, &hdr, sizeof(struct ext4_csum_header));
+ /* Could CRC precompute the common zero tail? (if it's really common) */
+ crc = crc32c(crc, raw_inode, EXT4_INODE_SIZE(sb));
+ if (crc == 0)
+ crc = ZERO_MAGIC;
+ return cpu_to_le32(crc);
+}
+
+/*
+ * Update CRC in a inode, including all additional fields after the
+ * standard inode structure.
+ *
+ * This relies on the raw_inode_lock protecting against all writes
+ * to the raw inode state in the bh. Right now the JBD locking
+ * is not good enough for that.
+ *
+ * This always precomputes the full checksum. In principle it would
+ * be possible to be more clever and do incremental changes at least
+ * for some state changes.
+ */
+static void inode_update_crc(struct super_block *sb, struct ext4_inode *raw_inode,
+ long ino)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
+ return;
+ raw_inode->i_crc = inode_crc(sb, raw_inode, ino);
+}
+
+/*
+ * This is needed because nfsd might try to access dead inodes
+ * the test is that same one that e2fsck uses
+ * NeilBrown 1999oct15
+ */
+static int inode_deleted(struct super_block *sb, struct ext4_inode *raw_inode)
+{
+ if (raw_inode->i_links_count == 0) {
+ if (raw_inode->i_mode == 0 ||
+ !(EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS))
+ return -ESTALE;
+ /*
+ * The only unlinked inodes we let through here have
+ * valid i_mode and are being read by the orphan
+ * recovery code: that's fine, we're about to complete
+ * the process of deleting those.
+ */
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Does this checksum-less inode look like a valid inode?
+ * Do a few sanity checks.
+ */
+static int inode_sanity_check(struct super_block *sb, struct ext4_inode *raw_inode, char **msg)
+{
+ int err = inode_deleted(sb, raw_inode);
+ if (err)
+ return err;
+ if (!test_opt(sb, INODE_SANITY))
+ return 0;
+ if (raw_inode->i_mode == 0 ||
+ raw_inode->i_atime == 0 ||
+ raw_inode->i_ctime == 0 ||
+ raw_inode->i_mtime == 0) {
+ *msg = "inode has invalid zero times";
+ return -1;
+ }
+ /* More sanity checks? */
+ return 0;
+}
+
+/*
+ * Check CRC for a newly read inode.
+ */
+static int inode_check_crc(struct super_block *sb, struct ext4_inode *raw_inode,
+ long ino, char **msg)
+{
+ __le32 crc, check;
+
+ /*
+ * Special case: zero CRC. This is handled like no checksum yet,
+ * otherwise tune2fs would need to rewrite all inodes when CRCs
+ * are turned on. The CRC will be updated when the inode
+ * is written out.
+ *
+ * This however means that if zeroes are blasted over the inodes
+ * we would think the checksum is not there. So instead do
+ * some special sanity checks in the other fields when this happens,
+ * to catch this case.
+ * This is not 100% fool proof, but should be reasonable.
+ * When the checksum function returns a real zero we turn that
+ * into a one to avoid ambiguity.
+ *
+ * The sanity check is done unconditionally even if the checksum passed
+ * because it's cheap enough.
+ */
+ crc = raw_inode->i_crc;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC) && crc
+ && !test_opt(sb, IGNORE_CRC)) {
+ check = inode_crc(sb, raw_inode, ino);
+ if (check != crc) {
+ *msg = "inode has invalid checksum";
+ /*
+ * Restore bad CRC so that if the inode is reread it'll fail
+ * the check again.
+ */
+ raw_inode->i_crc = crc;
+ return -1;
+ }
+ }
+ return inode_sanity_check(sb, raw_inode, msg);
+}
+
/*
* Work out how many blocks we need to proceed with the next chunk of a
* truncate transaction.
@@ -4996,6 +5132,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
+ char *msg = NULL;

inode = iget_locked(sb, ino);
if (!inode)
@@ -5010,6 +5147,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
+
+ /*
+ * Relies on the inode lock to protect the raw_inode bh contents.
+ */
+ ret = inode_check_crc(sb, raw_inode, ino, &msg);
+ if (ret < 0) {
+ /*
+ * Here would be the place to send a "read other mirror"
+ * retry hint to the block layer.
+ */
+ brelse(iloc.bh);
+ if (ret != -ESTALE)
+ ext4_error(sb,
+ "%s: inode=%lu, block=%llu", msg,
+ ino,
+ (unsigned long long)iloc.bh->b_blocknr);
+ goto bad_inode;
+ }
+ ret = 0;
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
@@ -5022,23 +5179,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_state_flags = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
- /* We now have enough fields to check if the inode was active or not.
- * This is needed because nfsd might try to access dead inodes
- * the test is that same one that e2fsck uses
- * NeilBrown 1999oct15
- */
- if (inode->i_nlink == 0) {
- if (inode->i_mode == 0 ||
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
- /* this inode is deleted */
- ret = -ESTALE;
- goto bad_inode;
- }
- /* The only unlinked inodes we let through here have
- * valid i_mode and are being read by the orphan
- * recovery code: that's fine, we're about to complete
- * the process of deleting those. */
- }
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
@@ -5232,11 +5372,19 @@ static int ext4_do_update_inode(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc)
{
+ struct super_block *sb = inode->i_sb;
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;

+ /*
+ * Protect the on disk inode against parallel modification
+ * until we compute the checksum and pass the resulting block
+ * to JBD, which protects it then.
+ */
+ spin_lock(&ei->raw_inode_lock);
+
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
@@ -5291,18 +5439,23 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
EXT4_SB(sb)->s_es->s_rev_level ==
cpu_to_le32(EXT4_GOOD_OLD_REV)) {
+ spin_unlock(&ei->raw_inode_lock);
/* If this is the first large file
* created, add a flag to the superblock.
*/
err = ext4_journal_get_write_access(handle,
EXT4_SB(sb)->s_sbh);
- if (err)
+ if (err) {
+ spin_lock(&ei->raw_inode_lock);
goto out_brelse;
+ }
ext4_update_dynamic_rev(sb);
EXT4_SET_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
sb->s_dirt = 1;
ext4_handle_sync(handle);
+ spin_lock(&ei->raw_inode_lock);
+ inode_update_crc(sb, raw_inode, inode->i_ino);
err = ext4_handle_dirty_metadata(handle, NULL,
EXT4_SB(sb)->s_sbh);
}
@@ -5330,6 +5483,7 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(inode->i_version >> 32);
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
+ inode_update_crc(sb, raw_inode, inode->i_ino);

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -5339,6 +5493,7 @@ static int ext4_do_update_inode(handle_t *handle,

ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
+ spin_unlock(&ei->raw_inode_lock);
brelse(bh);
ext4_std_error(inode->i_sb, err);
return err;
@@ -5759,6 +5914,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
return 0;

+ spin_lock(&EXT4_I(inode)->raw_inode_lock);
+
raw_inode = ext4_raw_inode(&iloc);

header = IHDR(inode, raw_inode);
@@ -5770,10 +5927,12 @@ static int ext4_expand_extra_isize(struct inode *inode,
memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
new_extra_isize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ spin_unlock(&EXT4_I(inode)->raw_inode_lock);
return 0;
}

/* try to expand with EAs present */
+ spin_unlock(&EXT4_I(inode)->raw_inode_lock);
return ext4_expand_extra_isize_ea(inode, new_extra_isize,
raw_inode, handle);
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4e8983a..4012753 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,6 +39,7 @@
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
+#include <linux/crc32c.h>
#include <asm/uaccess.h>

#include "ext4.h"
@@ -70,6 +71,8 @@ static void ext4_write_super(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static int ext4_get_sb(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data, struct vfsmount *mnt);
+static void ext4_super_update_crc(struct super_block *sb,
+ struct ext4_super_block *es);

#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext3_fs_type = {
@@ -342,7 +345,14 @@ static void ext4_handle_error(struct super_block *sb)
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
sb->s_flags |= MS_RDONLY;
}
+ /*
+ * Unfortunately must take the lock here, to make sure there
+ * is consistent super state for the checksum. This is very easy to get
+ * wrong in the caller.
+ */
+ lock_super(sb);
ext4_commit_super(sb, 1);
+ unlock_super(sb);
if (test_opt(sb, ERRORS_PANIC))
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
@@ -531,7 +541,9 @@ __acquires(bitlock)
if (test_opt(sb, ERRORS_CONT)) {
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ lock_super(sb);
ext4_commit_super(sb, 0);
+ unlock_super(sb);
return;
}
ext4_unlock_group(sb, grp);
@@ -659,9 +671,12 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
- if (err < 0)
+ if (err < 0) {
+ unlock_super(sb);
ext4_abort(sb, __func__,
"Couldn't clean up the journal");
+ lock_super(sb);
+ }
}

ext4_release_system_zone(sb);
@@ -758,6 +773,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_da_metadata_calc_len = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+ spin_lock_init(&(ei->raw_inode_lock));
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
#endif
@@ -1161,6 +1177,7 @@ enum {
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard,
+ Opt_noinode_sanity, Opt_ignore_crc,
};

static const match_table_t tokens = {
@@ -1231,6 +1248,8 @@ static const match_table_t tokens = {
{Opt_dioread_lock, "dioread_lock"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
+ {Opt_noinode_sanity, "noinode_sanity"},
+ {Opt_ignore_crc, "ignore_crc"},
{Opt_err, NULL},
};

@@ -1408,6 +1427,12 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_orlov:
clear_opt(sbi->s_mount_opt, OLDALLOC);
break;
+ case Opt_noinode_sanity:
+ clear_opt(sbi->s_mount_opt, INODE_SANITY);
+ break;
+ case Opt_ignore_crc:
+ set_opt(sbi->s_mount_opt, IGNORE_CRC);
+ break;
#ifdef CONFIG_EXT4_FS_XATTR
case Opt_user_xattr:
set_opt(sbi->s_mount_opt, XATTR_USER);
@@ -1946,6 +1971,7 @@ static int ext4_check_descriptors(struct super_block *sb)
}

ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
+ ext4_super_update_crc(sb, sbi->s_es);
sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
return 1;
}
@@ -2431,6 +2457,50 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 1;
}

+/* could be removed for SBs once e2fsutils are fixed to always compute
+ the CRC when the feature is turned on. */
+#define ZERO_MAGIC 1
+
+/*
+ * Manage CRC32 for the superblock.
+ */
+static int ext4_super_check_crc(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ u32 crc, check;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
+ return 0;
+ crc = le32_to_cpu(es->s_crc);
+ if (crc == 0 || test_opt(sb, IGNORE_CRC)) {
+ ext4_msg(sb, KERN_INFO, "super block checksum ignored");
+ return 0;
+ }
+ es->s_crc = 0;
+ check = crc32c(~0U, es, sizeof(struct ext4_super_block));
+ if (check == 0)
+ check = ZERO_MAGIC;
+ if (check != crc)
+ return -1;
+ /* Remove me */
+ ext4_msg(sb, KERN_INFO, "super block checksum passed");
+ return 0;
+}
+
+static void ext4_super_update_crc(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ u32 crc;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
+ return;
+ es->s_crc = 0;
+ crc = crc32c(~0U, es, sizeof(struct ext4_super_block));
+ if (crc == 0)
+ crc = ZERO_MAGIC;
+ es->s_crc = cpu_to_le32(crc);
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
@@ -2554,6 +2624,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;

set_opt(sbi->s_mount_opt, BARRIER);
+ set_opt(sbi->s_mount_opt, INODE_SANITY);

/*
* enable delayed allocation by default
@@ -2585,6 +2656,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;

+ if (ext4_super_check_crc(sb, es) < 0) {
+ ext4_msg(sb, KERN_ERR, "Invalid checksum in super block");
+ goto failed_mount;
+ }
+
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);

if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2675,6 +2751,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)

for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
+ memcpy(sbi->s_uuid, es->s_uuid, 16);
sbi->s_def_hash_version = es->s_def_hash_version;
i = le32_to_cpu(es->s_flags);
if (i & EXT2_FLAGS_UNSIGNED_HASH)
@@ -3393,6 +3470,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
&EXT4_SB(sb)->s_freeinodes_counter));
sb->s_dirt = 0;
+ /* can be removed if this is properly journaled, see
+ * http://bugzilla.kernel.org/show_bug.cgi?id=14354 */
+ ext4_super_update_crc(sb, es);
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
if (sync) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 0433800..fbba95a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1171,6 +1171,7 @@ retry:

free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
if (free >= new_extra_isize) {
+ spin_lock(&EXT4_I(inode)->raw_inode_lock);
entry = IFIRST(header);
ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
- new_extra_isize, (void *)raw_inode +
@@ -1179,6 +1180,7 @@ retry:
inode->i_sb->s_blocksize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
error = 0;
+ spin_unlock(&EXT4_I(inode)->raw_inode_lock);
goto cleanup;
}

@@ -1301,6 +1303,7 @@ retry:
if (error)
goto cleanup;

+ spin_lock(&EXT4_I(inode)->raw_inode_lock);
entry = IFIRST(header);
if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
shift_bytes = new_extra_isize;
@@ -1312,6 +1315,7 @@ retry:
EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
(void *)header, total_ino - entry_size,
inode->i_sb->s_blocksize);
+ spin_unlock(&EXT4_I(inode)->raw_inode_lock);

extra_isize += shift_bytes;
new_extra_isize -= shift_bytes;

--
[email protected] -- Speaking for myself only

2011-04-20 22:54:21

by djwong

[permalink] [raw]

Subject: Re: [PATCH 0/2] Add inode checksum support to ext4

On Wed, Apr 20, 2011 at 10:40:26AM -0700, Andi Kleen wrote:
> "Darrick J. Wong" <[email protected]> writes:
>
> FWIW I had a similar idea quite some time ago and implemented checksums for
> superblocks and inodes. I never posted it because I didn't get around
> to write the e2fsprogs support and do a lot of performance testing.
> There was one result that showed a slow down in quick tests, but
> I suspect it was a fluke and probably needs to be redone. In other
> tests it was neutral.

I've also seen comparable results between having inode checksums and not having
them. Unfortunately, like you said, modifying e2fsprogs is really what's
slowing this down right now-- there are a lot of places that assume inode_size
= 128 and therefore only read/write that much.

> Anyways here's the old patch if anyone is interested (against a ~.35ish kernel)
> I can forward port it if there's interest.
>
> IMHO it's a good idea but it should be done for the super blocks too
> (and ideally for all objects, although unfortunately that breaks
> the disk format)

I think I've seen some proposals for checksumming the bitmaps and the extent
tree nodes. It might be worth it to save some rocompat bits and combine them
all into one big(ger) rocompat flag. I guess it wouldn't be too hard to throw
in superblock checksumming too. :)

> The locking for stable buffers is still something that needs to be
> double checked.

> -Andi
>
> commit 8ece10cfa4b148075dbb93ca65855a7e2aad7b07
> Author: Andi Kleen <[email protected]>
> Date: Mon May 31 18:27:29 2010 +0200
>
> EXT4: Add checksums to the super block and the inodes
>
> Currently when a on-disk structure is corrupted ext4 usually
> detects it only by some internal inconsistency, but this can happen
> too late or give confusing error messages.
>
> One way to detect corruptions sooner is to use checksums. This
> means the file system can stop using a corrupted object ASAP, not
> follow any potentially corrupted pointers to other blocks,
> and also give a clear message on what happened.
>
> Potentially it can also be used to limit read only mounts and
> forced fscks. When the extent of a corruption can be detected
> accurately using checksums and only force errors on that
> particular object (this is right now not enabled though
> because there are still too many objects with missing checksums)
>
> There's a general trend in file systems recently to add metadata
> checksums, this just makes ext4 catch up a bit (in addition
> to the already existing transaction and block group checksums)
>
> Another advantage is that the checksum can also check for misplaced
> writes (by including the intended block location) and objects
> left over from before mkfs (by including the file system UUID).
>
> Adding checksums to all metadata in ext4 would likely need some
> incompatible format changes, but it's relatively straight-forward to add
> them to inodes and the super block at least. This patch-kit does this.
>
> Again it only handles some meta-data objects. This is not a full
> user-data checksumming feature or not even a "all metadata objects"
> feature.
>
> I didn't do a lot of research into different CRC functions, but simply
> used the same one as btrfs and iSCSI (crc32c). ocfs2 uses ECCs
> that have some self-correcting ability, but that seemed overkill to me.
> The standard CRC has the advantage that the CRC is accelerated by hardware
> instructions on newer Intel CPUs and possibly on other platforms too.
> The CRC32C function is also hard coded, although in theory it could be made
> configurable per file system. But since that would lead to a multitude of
> incompatible formats I decided to simply hard code for now.
>
> Right now there is no e2fsprogs support, so fsck doesn't know about CRCs.
>
> For now the checksums can be enabled with a simple shell script
> (which is just a wrapper around debugfs):
>
> wget ftp://firstfloor.org/pub/ak/shell/e2checksum
> chmod +x e2checksum
>
> To enable:
>
> ./e2checksum /dev/device enable
>
> To disable:
>
> ./e2checksum /dev/device disable
>
> This works with the current ext4 e2fsprogs without any changes.
>
> The kernel will automatically add checksums to the file system when the
> feature is turned on, as the inodes are rewritten.
>
> WARNING: Note there is no fsck support currently, so before you run fsck better
> disable the checksums. After disabling/changing/reenabling
> you may also need to mount with ignore_crc until the checksums
> are fixed up again.
>
> The checksums are implemented as a read-only compat feature: this means
> the a file system with checksums enabled can be read by a kernel that
> does not know about checksums, but not written. Of course you can turn
> off the flag again to make the file system write compatible again (but
> that will put the checksums into a inconsistent state) or use the
> ignore_crc mount option.
>
> WARNING: the in-kernel upgrader relies on the checksum fields being
> zero when checksums are enabled. When you turn on the feature, use the
> file system, turn it off again, use it again and turn it on again the
> checksums will be in a inconsistent state. Right now this can be
> fixed by mounting with -o ignore_crc and then doing a find | xargs touch
> on the file system.
>
> I expect real e2fsprogs support can do that better.
>
> The code also adds some more sanity checks on inodes to distingush zeroed
> inodes from inodes with no checksums. This is currently done by enforcing
> that a/m/ctime are not zero. If there are broken file systems around
> where that is not true, the sanity check can be turned off (see below)
>
> There are two new mount options:
>
> ignore_crc Ignore the CRCs on reading but still update them
> noinode_sanity Don't do new inode sanity checks

Hm, the usage model of my patch is tune2fs; fsck; mount. I suspect it's a good
idea to run fsck before/while turning on this feature to correct any other
errors. Though, that means that the kernel will simply -EIO anything it thinks
is corrupt.

> The implementation is not particularly optimized: it always recomputes
> the full CRC on each inode or super block write. Some more optimizations
> would be possible in this area.
>
> BENCHMARKS
>
> I ran kernelbench and it didn't show any significant difference between
> the run with checksums and the run without.
>
> I also tried timing fsstress from XFS/LTP, and it gave a 35% slowdown
> on a disk and 5% slow down on the ramdisk for the system time
> during the test. However I'm a bit suspicious of these results.
> This was on a core 2.

I'll give fsstress a try when I get back to this (Friday skunkworks project).

Either way, thanks for the patch!

--D
>
> I also tested on a Nehalem system which has CRC acceleration instructions
> in the CPU.
>
> Anyone has a better inode metadata intensive benchmark to run?
>
> This gobbled up the last mount flag bits, but there are still some unused
> holes in the middle.
>
> The patch is definitely still experimental and needs more testing and
> review and e2fsprogs support. In particular I would appreciate review
> of my bh locking scheme.
>
> Also ext4_abort() now takes the super lock, which implies that the callers
> shouldn't. I think I checked all callers that it's safe, but double checking
> that would be good.
>
> Opens:
> - e2fsprogs support. In this case the zero crc hack could be removed from
> the super block code at least.
> - Right now checksum numbers by default lead to a read-only file system remount
> and are also logged as "IO error". This could be made more gentle, because
> a checksum catches problems early enough that continuation might be possible.
> - Incremental checksumming
> - Checksums for extents and directories
> - In ignore_crc mode the checksums are only rewritten when the inode is somehow
> dirtied otherwise. Do this implicitely? Might be better to move this to e2fsprogs
>
> diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
> index e1def17..e98141a 100644
> --- a/Documentation/filesystems/ext4.txt
> +++ b/Documentation/filesystems/ext4.txt
> @@ -359,6 +359,14 @@ nodiscard(*) commands to the underlying block device when
> and sparse/thinly-provisioned LUNs, but it is off
> by default until sufficient testing has been done.
>
> +nosanity_check Don't check for zero m/c/a times when reading a inode.
> + Should not normally be needed.
> +
> +ignore_crc Ignore checksum failures while reading the super block
> + or inodes, but still update the checksums on writing
> + (if you don't want to update the checksums, clear the
> + checksum feature bit in the super block)
> +
> Data Mode
> =========
> There are 3 different data modes:
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 19a4de5..3a99769 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -611,6 +611,7 @@ struct ext4_inode {
> __le32 i_crtime; /* File Creation time */
> __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
> __le32 i_version_hi; /* high 32 bits for 64-bit version */
> + __le32 i_crc; /* CRC32 for this inode */
> };
>
> struct move_extent {
> @@ -836,6 +837,12 @@ struct ext4_inode_info {
> */
> tid_t i_sync_tid;
> tid_t i_datasync_tid;
> +
> + /*
> + * Protect raw inode modifications, mostly to ensure
> + * stability for checksumming.
> + */
> + spinlock_t raw_inode_lock;
> };
>
> /*
> @@ -881,10 +888,12 @@ struct ext4_inode_info {
> #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
> #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
> #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
> +#define EXT4_MOUNT_IGNORE_CRC 0x4000000 /* Ignore CRCs on read */
> #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
> #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
> #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
> #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
> +#define EXT4_MOUNT_INODE_SANITY 0x80000000 /* Inode sanity check */
>
> #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
> #define set_opt(o, opt) o |= EXT4_MOUNT_##opt
> @@ -1003,7 +1012,8 @@ struct ext4_super_block {
> __u8 s_reserved_char_pad2;
> __le16 s_reserved_pad;
> __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
> - __u32 s_reserved[160]; /* Padding to the end of the block */
> + __le32 s_crc; /* CRC32 for this super block */
> + __u32 s_reserved[159]; /* Padding to the end of the block */
> };
>
> #ifdef __KERNEL__
> @@ -1051,6 +1061,7 @@ struct ext4_sb_info {
> u32 s_hash_seed[4];
> int s_def_hash_version;
> int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
> + u8 s_uuid[16];
> struct percpu_counter s_freeblocks_counter;
> struct percpu_counter s_freeinodes_counter;
> struct percpu_counter s_dirs_counter;
> @@ -1265,6 +1276,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
> #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
> #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
> #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
> +#define EXT4_FEATURE_RO_COMPAT_SBI_CRC 0x0080 /* sb and inode CRCs */
>
> #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
> #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
> @@ -1291,7 +1303,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
> EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
> EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
> EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
> - EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
> + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
> + EXT4_FEATURE_RO_COMPAT_SBI_CRC)
>
> /*
> * Default values for user and/or group using reserved blocks
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 42272d6..8ba2f24 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -37,6 +37,7 @@
> #include <linux/namei.h>
> #include <linux/uio.h>
> #include <linux/bio.h>
> +#include <linux/crc32c.h>
> #include <linux/workqueue.h>
> #include <linux/kernel.h>
> #include <linux/slab.h>
> @@ -72,6 +73,141 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
> return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
> }
>
> +#define ZERO_MAGIC 1
> +
> +static __le32 inode_crc(struct super_block *sb, struct ext4_inode *raw_inode, long ino)
> +{
> + struct ext4_csum_header {
> + __le64 ino;
> + __le64 pad;
> + u8 uuid[16];
> + } __attribute__((aligned)) hdr;
> + u32 crc;
> +
> + /*
> + * First CRC in the inode number and the file system UUID
> + * to detect inodes from before the last mkfs and misplaced inode
> + * writes.
> + */
> + hdr.ino = cpu_to_le64(ino);
> + memcpy(&hdr.uuid, EXT4_SB(sb)->s_uuid, 16);
> + hdr.pad = 0;
> +
> + raw_inode->i_crc = 0;
> + crc = crc32c(~0U, &hdr, sizeof(struct ext4_csum_header));
> + /* Could CRC precompute the common zero tail? (if it's really common) */
> + crc = crc32c(crc, raw_inode, EXT4_INODE_SIZE(sb));
> + if (crc == 0)
> + crc = ZERO_MAGIC;
> + return cpu_to_le32(crc);
> +}
> +
> +/*
> + * Update CRC in a inode, including all additional fields after the
> + * standard inode structure.
> + *
> + * This relies on the raw_inode_lock protecting against all writes
> + * to the raw inode state in the bh. Right now the JBD locking
> + * is not good enough for that.
> + *
> + * This always precomputes the full checksum. In principle it would
> + * be possible to be more clever and do incremental changes at least
> + * for some state changes.
> + */
> +static void inode_update_crc(struct super_block *sb, struct ext4_inode *raw_inode,
> + long ino)
> +{
> + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
> + return;
> + raw_inode->i_crc = inode_crc(sb, raw_inode, ino);
> +}
> +
> +/*
> + * This is needed because nfsd might try to access dead inodes
> + * the test is that same one that e2fsck uses
> + * NeilBrown 1999oct15
> + */
> +static int inode_deleted(struct super_block *sb, struct ext4_inode *raw_inode)
> +{
> + if (raw_inode->i_links_count == 0) {
> + if (raw_inode->i_mode == 0 ||
> + !(EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS))
> + return -ESTALE;
> + /*
> + * The only unlinked inodes we let through here have
> + * valid i_mode and are being read by the orphan
> + * recovery code: that's fine, we're about to complete
> + * the process of deleting those.
> + */
> + return 1;
> + }
> + return 0;
> +}
> +
> +/*
> + * Does this checksum-less inode look like a valid inode?
> + * Do a few sanity checks.
> + */
> +static int inode_sanity_check(struct super_block *sb, struct ext4_inode *raw_inode, char **msg)
> +{
> + int err = inode_deleted(sb, raw_inode);
> + if (err)
> + return err;
> + if (!test_opt(sb, INODE_SANITY))
> + return 0;
> + if (raw_inode->i_mode == 0 ||
> + raw_inode->i_atime == 0 ||
> + raw_inode->i_ctime == 0 ||
> + raw_inode->i_mtime == 0) {
> + *msg = "inode has invalid zero times";
> + return -1;
> + }
> + /* More sanity checks? */
> + return 0;
> +}
> +
> +/*
> + * Check CRC for a newly read inode.
> + */
> +static int inode_check_crc(struct super_block *sb, struct ext4_inode *raw_inode,
> + long ino, char **msg)
> +{
> + __le32 crc, check;
> +
> + /*
> + * Special case: zero CRC. This is handled like no checksum yet,
> + * otherwise tune2fs would need to rewrite all inodes when CRCs
> + * are turned on. The CRC will be updated when the inode
> + * is written out.
> + *
> + * This however means that if zeroes are blasted over the inodes
> + * we would think the checksum is not there. So instead do
> + * some special sanity checks in the other fields when this happens,
> + * to catch this case.
> + * This is not 100% fool proof, but should be reasonable.
> + * When the checksum function returns a real zero we turn that
> + * into a one to avoid ambiguity.
> + *
> + * The sanity check is done unconditionally even if the checksum passed
> + * because it's cheap enough.
> + */
> + crc = raw_inode->i_crc;
> + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC) && crc
> + && !test_opt(sb, IGNORE_CRC)) {
> + check = inode_crc(sb, raw_inode, ino);
> + if (check != crc) {
> + *msg = "inode has invalid checksum";
> + /*
> + * Restore bad CRC so that if the inode is reread it'll fail
> + * the check again.
> + */
> + raw_inode->i_crc = crc;
> + return -1;
> + }
> + }
> + return inode_sanity_check(sb, raw_inode, msg);
> +}
> +
> /*
> * Work out how many blocks we need to proceed with the next chunk of a
> * truncate transaction.
> @@ -4996,6 +5132,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> journal_t *journal = EXT4_SB(sb)->s_journal;
> long ret;
> int block;
> + char *msg = NULL;
>
> inode = iget_locked(sb, ino);
> if (!inode)
> @@ -5010,6 +5147,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> if (ret < 0)
> goto bad_inode;
> raw_inode = ext4_raw_inode(&iloc);
> +
> + /*
> + * Relies on the inode lock to protect the raw_inode bh contents.
> + */
> + ret = inode_check_crc(sb, raw_inode, ino, &msg);
> + if (ret < 0) {
> + /*
> + * Here would be the place to send a "read other mirror"
> + * retry hint to the block layer.
> + */
> + brelse(iloc.bh);
> + if (ret != -ESTALE)
> + ext4_error(sb,
> + "%s: inode=%lu, block=%llu", msg,
> + ino,
> + (unsigned long long)iloc.bh->b_blocknr);
> + goto bad_inode;
> + }
> + ret = 0;
> +
> inode->i_mode = le16_to_cpu(raw_inode->i_mode);
> inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
> inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
> @@ -5022,23 +5179,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> ei->i_state_flags = 0;
> ei->i_dir_start_lookup = 0;
> ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
> - /* We now have enough fields to check if the inode was active or not.
> - * This is needed because nfsd might try to access dead inodes
> - * the test is that same one that e2fsck uses
> - * NeilBrown 1999oct15
> - */
> - if (inode->i_nlink == 0) {
> - if (inode->i_mode == 0 ||
> - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
> - /* this inode is deleted */
> - ret = -ESTALE;
> - goto bad_inode;
> - }
> - /* The only unlinked inodes we let through here have
> - * valid i_mode and are being read by the orphan
> - * recovery code: that's fine, we're about to complete
> - * the process of deleting those. */
> - }
> ei->i_flags = le32_to_cpu(raw_inode->i_flags);
> inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
> ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
> @@ -5232,11 +5372,19 @@ static int ext4_do_update_inode(handle_t *handle,
> struct inode *inode,
> struct ext4_iloc *iloc)
> {
> + struct super_block *sb = inode->i_sb;
> struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
> struct ext4_inode_info *ei = EXT4_I(inode);
> struct buffer_head *bh = iloc->bh;
> int err = 0, rc, block;
>
> + /*
> + * Protect the on disk inode against parallel modification
> + * until we compute the checksum and pass the resulting block
> + * to JBD, which protects it then.
> + */
> + spin_lock(&ei->raw_inode_lock);
> +
> /* For fields not not tracking in the in-memory inode,
> * initialise them to zero for new inodes. */
> if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
> @@ -5291,18 +5439,23 @@ static int ext4_do_update_inode(handle_t *handle,
> EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
> EXT4_SB(sb)->s_es->s_rev_level ==
> cpu_to_le32(EXT4_GOOD_OLD_REV)) {
> + spin_unlock(&ei->raw_inode_lock);
> /* If this is the first large file
> * created, add a flag to the superblock.
> */
> err = ext4_journal_get_write_access(handle,
> EXT4_SB(sb)->s_sbh);
> - if (err)
> + if (err) {
> + spin_lock(&ei->raw_inode_lock);
> goto out_brelse;
> + }
> ext4_update_dynamic_rev(sb);
> EXT4_SET_RO_COMPAT_FEATURE(sb,
> EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
> sb->s_dirt = 1;
> ext4_handle_sync(handle);
> + spin_lock(&ei->raw_inode_lock);
> + inode_update_crc(sb, raw_inode, inode->i_ino);
> err = ext4_handle_dirty_metadata(handle, NULL,
> EXT4_SB(sb)->s_sbh);
> }
> @@ -5330,6 +5483,7 @@ static int ext4_do_update_inode(handle_t *handle,
> cpu_to_le32(inode->i_version >> 32);
> raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
> }
> + inode_update_crc(sb, raw_inode, inode->i_ino);
>
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> rc = ext4_handle_dirty_metadata(handle, NULL, bh);
> @@ -5339,6 +5493,7 @@ static int ext4_do_update_inode(handle_t *handle,
>
> ext4_update_inode_fsync_trans(handle, inode, 0);
> out_brelse:
> + spin_unlock(&ei->raw_inode_lock);
> brelse(bh);
> ext4_std_error(inode->i_sb, err);
> return err;
> @@ -5759,6 +5914,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
> if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
> return 0;
>
> + spin_lock(&EXT4_I(inode)->raw_inode_lock);
> +
> raw_inode = ext4_raw_inode(&iloc);
>
> header = IHDR(inode, raw_inode);
> @@ -5770,10 +5927,12 @@ static int ext4_expand_extra_isize(struct inode *inode,
> memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
> new_extra_isize);
> EXT4_I(inode)->i_extra_isize = new_extra_isize;
> + spin_unlock(&EXT4_I(inode)->raw_inode_lock);
> return 0;
> }
>
> /* try to expand with EAs present */
> + spin_unlock(&EXT4_I(inode)->raw_inode_lock);
> return ext4_expand_extra_isize_ea(inode, new_extra_isize,
> raw_inode, handle);
> }
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 4e8983a..4012753 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -39,6 +39,7 @@
> #include <linux/ctype.h>
> #include <linux/log2.h>
> #include <linux/crc16.h>
> +#include <linux/crc32c.h>
> #include <asm/uaccess.h>
>
> #include "ext4.h"
> @@ -70,6 +71,8 @@ static void ext4_write_super(struct super_block *sb);
> static int ext4_freeze(struct super_block *sb);
> static int ext4_get_sb(struct file_system_type *fs_type, int flags,
> const char *dev_name, void *data, struct vfsmount *mnt);
> +static void ext4_super_update_crc(struct super_block *sb,
> + struct ext4_super_block *es);
>
> #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
> static struct file_system_type ext3_fs_type = {
> @@ -342,7 +345,14 @@ static void ext4_handle_error(struct super_block *sb)
> ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
> sb->s_flags |= MS_RDONLY;
> }
> + /*
> + * Unfortunately must take the lock here, to make sure there
> + * is consistent super state for the checksum. This is very easy to get
> + * wrong in the caller.
> + */
> + lock_super(sb);
> ext4_commit_super(sb, 1);
> + unlock_super(sb);
> if (test_opt(sb, ERRORS_PANIC))
> panic("EXT4-fs (device %s): panic forced after error\n",
> sb->s_id);
> @@ -531,7 +541,9 @@ __acquires(bitlock)
> if (test_opt(sb, ERRORS_CONT)) {
> EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
> es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
> + lock_super(sb);
> ext4_commit_super(sb, 0);
> + unlock_super(sb);
> return;
> }
> ext4_unlock_group(sb, grp);
> @@ -659,9 +671,12 @@ static void ext4_put_super(struct super_block *sb)
> if (sbi->s_journal) {
> err = jbd2_journal_destroy(sbi->s_journal);
> sbi->s_journal = NULL;
> - if (err < 0)
> + if (err < 0) {
> + unlock_super(sb);
> ext4_abort(sb, __func__,
> "Couldn't clean up the journal");
> + lock_super(sb);
> + }
> }
>
> ext4_release_system_zone(sb);
> @@ -758,6 +773,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
> ei->i_da_metadata_calc_len = 0;
> ei->i_delalloc_reserved_flag = 0;
> spin_lock_init(&(ei->i_block_reservation_lock));
> + spin_lock_init(&(ei->raw_inode_lock));
> #ifdef CONFIG_QUOTA
> ei->i_reserved_quota = 0;
> #endif
> @@ -1161,6 +1177,7 @@ enum {
> Opt_inode_readahead_blks, Opt_journal_ioprio,
> Opt_dioread_nolock, Opt_dioread_lock,
> Opt_discard, Opt_nodiscard,
> + Opt_noinode_sanity, Opt_ignore_crc,
> };
>
> static const match_table_t tokens = {
> @@ -1231,6 +1248,8 @@ static const match_table_t tokens = {
> {Opt_dioread_lock, "dioread_lock"},
> {Opt_discard, "discard"},
> {Opt_nodiscard, "nodiscard"},
> + {Opt_noinode_sanity, "noinode_sanity"},
> + {Opt_ignore_crc, "ignore_crc"},
> {Opt_err, NULL},
> };
>
> @@ -1408,6 +1427,12 @@ static int parse_options(char *options, struct super_block *sb,
> case Opt_orlov:
> clear_opt(sbi->s_mount_opt, OLDALLOC);
> break;
> + case Opt_noinode_sanity:
> + clear_opt(sbi->s_mount_opt, INODE_SANITY);
> + break;
> + case Opt_ignore_crc:
> + set_opt(sbi->s_mount_opt, IGNORE_CRC);
> + break;
> #ifdef CONFIG_EXT4_FS_XATTR
> case Opt_user_xattr:
> set_opt(sbi->s_mount_opt, XATTR_USER);
> @@ -1946,6 +1971,7 @@ static int ext4_check_descriptors(struct super_block *sb)
> }
>
> ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
> + ext4_super_update_crc(sb, sbi->s_es);
> sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
> return 1;
> }
> @@ -2431,6 +2457,50 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
> return 1;
> }
>
> +/* could be removed for SBs once e2fsutils are fixed to always compute
> + the CRC when the feature is turned on. */
> +#define ZERO_MAGIC 1
> +
> +/*
> + * Manage CRC32 for the superblock.
> + */
> +static int ext4_super_check_crc(struct super_block *sb,
> + struct ext4_super_block *es)
> +{
> + u32 crc, check;
> +
> + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
> + return 0;
> + crc = le32_to_cpu(es->s_crc);
> + if (crc == 0 || test_opt(sb, IGNORE_CRC)) {
> + ext4_msg(sb, KERN_INFO, "super block checksum ignored");
> + return 0;
> + }
> + es->s_crc = 0;
> + check = crc32c(~0U, es, sizeof(struct ext4_super_block));
> + if (check == 0)
> + check = ZERO_MAGIC;
> + if (check != crc)
> + return -1;
> + /* Remove me */
> + ext4_msg(sb, KERN_INFO, "super block checksum passed");
> + return 0;
> +}
> +
> +static void ext4_super_update_crc(struct super_block *sb,
> + struct ext4_super_block *es)
> +{
> + u32 crc;
> +
> + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
> + return;
> + es->s_crc = 0;
> + crc = crc32c(~0U, es, sizeof(struct ext4_super_block));
> + if (crc == 0)
> + crc = ZERO_MAGIC;
> + es->s_crc = cpu_to_le32(crc);
> +}
> +
> static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> __releases(kernel_lock)
> __acquires(kernel_lock)
> @@ -2554,6 +2624,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
>
> set_opt(sbi->s_mount_opt, BARRIER);
> + set_opt(sbi->s_mount_opt, INODE_SANITY);
>
> /*
> * enable delayed allocation by default
> @@ -2585,6 +2656,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
> goto failed_mount;
>
> + if (ext4_super_check_crc(sb, es) < 0) {
> + ext4_msg(sb, KERN_ERR, "Invalid checksum in super block");
> + goto failed_mount;
> + }
> +
> blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
>
> if (blocksize < EXT4_MIN_BLOCK_SIZE ||
> @@ -2675,6 +2751,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>
> for (i = 0; i < 4; i++)
> sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
> + memcpy(sbi->s_uuid, es->s_uuid, 16);
> sbi->s_def_hash_version = es->s_def_hash_version;
> i = le32_to_cpu(es->s_flags);
> if (i & EXT2_FLAGS_UNSIGNED_HASH)
> @@ -3393,6 +3470,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
> es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
> &EXT4_SB(sb)->s_freeinodes_counter));
> sb->s_dirt = 0;
> + /* can be removed if this is properly journaled, see
> + * http://bugzilla.kernel.org/show_bug.cgi?id=14354 */
> + ext4_super_update_crc(sb, es);
> BUFFER_TRACE(sbh, "marking dirty");
> mark_buffer_dirty(sbh);
> if (sync) {
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 0433800..fbba95a 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -1171,6 +1171,7 @@ retry:
>
> free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
> if (free >= new_extra_isize) {
> + spin_lock(&EXT4_I(inode)->raw_inode_lock);
> entry = IFIRST(header);
> ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
> - new_extra_isize, (void *)raw_inode +
> @@ -1179,6 +1180,7 @@ retry:
> inode->i_sb->s_blocksize);
> EXT4_I(inode)->i_extra_isize = new_extra_isize;
> error = 0;
> + spin_unlock(&EXT4_I(inode)->raw_inode_lock);
> goto cleanup;
> }
>
> @@ -1301,6 +1303,7 @@ retry:
> if (error)
> goto cleanup;
>
> + spin_lock(&EXT4_I(inode)->raw_inode_lock);
> entry = IFIRST(header);
> if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
> shift_bytes = new_extra_isize;
> @@ -1312,6 +1315,7 @@ retry:
> EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
> (void *)header, total_ino - entry_size,
> inode->i_sb->s_blocksize);
> + spin_unlock(&EXT4_I(inode)->raw_inode_lock);
>
> extra_isize += shift_bytes;
> new_extra_isize -= shift_bytes;
>
>
> --
> [email protected] -- Speaking for myself only

2011-04-21 00:25:26

by Andreas Dilger

[permalink] [raw]

Subject: Re: [PATCH 0/2] Add inode checksum support to ext4

On 2011-04-20, at 4:54 PM, Darrick J. Wong wrote:
> On Wed, Apr 20, 2011 at 10:40:26AM -0700, Andi Kleen wrote:
>> "Darrick J. Wong" <[email protected]> writes:
>>
>> FWIW I had a similar idea quite some time ago and implemented checksums for
>> superblocks and inodes. I never posted it because I didn't get around
>> to write the e2fsprogs support and do a lot of performance testing.
>> There was one result that showed a slow down in quick tests, but
>> I suspect it was a fluke and probably needs to be redone. In other
>> tests it was neutral.
>
> I've also seen comparable results between having inode checksums and not having
> them. Unfortunately, like you said, modifying e2fsprogs is really what's
> slowing this down right now-- there are a lot of places that assume inode_size
> = 128 and therefore only read/write that much.

I think it makes sense to include the inode checksum into the core 128-bit inode, so that it is available to all upgraded ext3 filesystems as well. Having a 16-bit checksum is probably sufficient for the 128- or 256-byte inodes, I don't know that we really need to have a full 32-bit checksum? Also, the current group descriptor checksums are already CRC-16 so it probably makes sense to stick with that.

>> Anyways here's the old patch if anyone is interested (against a ~.35ish kernel)
>> I can forward port it if there's interest.
>>
>> IMHO it's a good idea but it should be done for the super blocks too
>> (and ideally for all objects, although unfortunately that breaks
>> the disk format)
>
> I think I've seen some proposals for checksumming the bitmaps and the extent
> tree nodes. It might be worth it to save some rocompat bits and combine them
> all into one big(ger) rocompat flag. I guess it wouldn't be too hard to throw
> in superblock checksumming too. :)
>
>> The locking for stable buffers is still something that needs to be
>> double checked.
>
>
>
>> -Andi
>>
>> commit 8ece10cfa4b148075dbb93ca65855a7e2aad7b07
>> Author: Andi Kleen <[email protected]>
>> Date: Mon May 31 18:27:29 2010 +0200
>>
>> EXT4: Add checksums to the super block and the inodes
>>
>> Currently when a on-disk structure is corrupted ext4 usually
>> detects it only by some internal inconsistency, but this can happen
>> too late or give confusing error messages.
>>
>> One way to detect corruptions sooner is to use checksums. This
>> means the file system can stop using a corrupted object ASAP, not
>> follow any potentially corrupted pointers to other blocks,
>> and also give a clear message on what happened.
>>
>> Potentially it can also be used to limit read only mounts and
>> forced fscks. When the extent of a corruption can be detected
>> accurately using checksums and only force errors on that
>> particular object (this is right now not enabled though
>> because there are still too many objects with missing checksums)
>>
>> There's a general trend in file systems recently to add metadata
>> checksums, this just makes ext4 catch up a bit (in addition
>> to the already existing transaction and block group checksums)
>>
>> Another advantage is that the checksum can also check for misplaced
>> writes (by including the intended block location) and objects
>> left over from before mkfs (by including the file system UUID).
>>
>> Adding checksums to all metadata in ext4 would likely need some
>> incompatible format changes, but it's relatively straight-forward to add
>> them to inodes and the super block at least. This patch-kit does this.
>>
>> Again it only handles some meta-data objects. This is not a full
>> user-data checksumming feature or not even a "all metadata objects"
>> feature.
>>
>> I didn't do a lot of research into different CRC functions, but simply
>> used the same one as btrfs and iSCSI (crc32c). ocfs2 uses ECCs
>> that have some self-correcting ability, but that seemed overkill to me.
>> The standard CRC has the advantage that the CRC is accelerated by hardware
>> instructions on newer Intel CPUs and possibly on other platforms too.
>> The CRC32C function is also hard coded, although in theory it could be made
>> configurable per file system. But since that would lead to a multitude of
>> incompatible formats I decided to simply hard code for now.
>>
>> Right now there is no e2fsprogs support, so fsck doesn't know about CRCs.
>>
>> For now the checksums can be enabled with a simple shell script
>> (which is just a wrapper around debugfs):
>>
>> wget ftp://firstfloor.org/pub/ak/shell/e2checksum
>> chmod +x e2checksum
>>
>> To enable:
>>
>> ./e2checksum /dev/device enable
>>
>> To disable:
>>
>> ./e2checksum /dev/device disable
>>
>> This works with the current ext4 e2fsprogs without any changes.
>>
>> The kernel will automatically add checksums to the file system when the
>> feature is turned on, as the inodes are rewritten.
>>
>> WARNING: Note there is no fsck support currently, so before you run fsck better
>> disable the checksums. After disabling/changing/reenabling
>> you may also need to mount with ignore_crc until the checksums
>> are fixed up again.
>>
>> The checksums are implemented as a read-only compat feature: this means
>> the a file system with checksums enabled can be read by a kernel that
>> does not know about checksums, but not written. Of course you can turn
>> off the flag again to make the file system write compatible again (but
>> that will put the checksums into a inconsistent state) or use the
>> ignore_crc mount option.
>>
>> WARNING: the in-kernel upgrader relies on the checksum fields being
>> zero when checksums are enabled. When you turn on the feature, use the
>> file system, turn it off again, use it again and turn it on again the
>> checksums will be in a inconsistent state. Right now this can be
>> fixed by mounting with -o ignore_crc and then doing a find | xargs touch
>> on the file system.
>>
>> I expect real e2fsprogs support can do that better.
>>
>> The code also adds some more sanity checks on inodes to distingush zeroed
>> inodes from inodes with no checksums. This is currently done by enforcing
>> that a/m/ctime are not zero. If there are broken file systems around
>> where that is not true, the sanity check can be turned off (see below)
>>
>> There are two new mount options:
>>
>> ignore_crc Ignore the CRCs on reading but still update them
>> noinode_sanity Don't do new inode sanity checks
>
> Hm, the usage model of my patch is tune2fs; fsck; mount. I suspect it's a good
> idea to run fsck before/while turning on this feature to correct any other
> errors. Though, that means that the kernel will simply -EIO anything it thinks
> is corrupt.
>
>> The implementation is not particularly optimized: it always recomputes
>> the full CRC on each inode or super block write. Some more optimizations
>> would be possible in this area.
>>
>> BENCHMARKS
>>
>> I ran kernelbench and it didn't show any significant difference between
>> the run with checksums and the run without.
>>
>> I also tried timing fsstress from XFS/LTP, and it gave a 35% slowdown
>> on a disk and 5% slow down on the ramdisk for the system time
>> during the test. However I'm a bit suspicious of these results.
>> This was on a core 2.
>
> I'll give fsstress a try when I get back to this (Friday skunkworks project).
>
> Either way, thanks for the patch!
>
> --D
>>
>> I also tested on a Nehalem system which has CRC acceleration instructions
>> in the CPU.
>>
>> Anyone has a better inode metadata intensive benchmark to run?
>>
>> This gobbled up the last mount flag bits, but there are still some unused
>> holes in the middle.
>>
>> The patch is definitely still experimental and needs more testing and
>> review and e2fsprogs support. In particular I would appreciate review
>> of my bh locking scheme.
>>
>> Also ext4_abort() now takes the super lock, which implies that the callers
>> shouldn't. I think I checked all callers that it's safe, but double checking
>> that would be good.
>>
>> Opens:
>> - e2fsprogs support. In this case the zero crc hack could be removed from
>> the super block code at least.
>> - Right now checksum numbers by default lead to a read-only file system remount
>> and are also logged as "IO error". This could be made more gentle, because
>> a checksum catches problems early enough that continuation might be possible.
>> - Incremental checksumming
>> - Checksums for extents and directories
>> - In ignore_crc mode the checksums are only rewritten when the inode is somehow
>> dirtied otherwise. Do this implicitely? Might be better to move this to e2fsprogs
>>
>> diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
>> index e1def17..e98141a 100644
>> --- a/Documentation/filesystems/ext4.txt
>> +++ b/Documentation/filesystems/ext4.txt
>> @@ -359,6 +359,14 @@ nodiscard(*) commands to the underlying block device when
>> and sparse/thinly-provisioned LUNs, but it is off
>> by default until sufficient testing has been done.
>>
>> +nosanity_check Don't check for zero m/c/a times when reading a inode.
>> + Should not normally be needed.
>> +
>> +ignore_crc Ignore checksum failures while reading the super block
>> + or inodes, but still update the checksums on writing
>> + (if you don't want to update the checksums, clear the
>> + checksum feature bit in the super block)
>> +
>> Data Mode
>> =========
>> There are 3 different data modes:
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 19a4de5..3a99769 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -611,6 +611,7 @@ struct ext4_inode {
>> __le32 i_crtime; /* File Creation time */
>> __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
>> __le32 i_version_hi; /* high 32 bits for 64-bit version */
>> + __le32 i_crc; /* CRC32 for this inode */
>> };
>>
>> struct move_extent {
>> @@ -836,6 +837,12 @@ struct ext4_inode_info {
>> */
>> tid_t i_sync_tid;
>> tid_t i_datasync_tid;
>> +
>> + /*
>> + * Protect raw inode modifications, mostly to ensure
>> + * stability for checksumming.
>> + */
>> + spinlock_t raw_inode_lock;
>> };
>>
>> /*
>> @@ -881,10 +888,12 @@ struct ext4_inode_info {
>> #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
>> #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
>> #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
>> +#define EXT4_MOUNT_IGNORE_CRC 0x4000000 /* Ignore CRCs on read */
>> #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
>> #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
>> #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
>> #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
>> +#define EXT4_MOUNT_INODE_SANITY 0x80000000 /* Inode sanity check */
>>
>> #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
>> #define set_opt(o, opt) o |= EXT4_MOUNT_##opt
>> @@ -1003,7 +1012,8 @@ struct ext4_super_block {
>> __u8 s_reserved_char_pad2;
>> __le16 s_reserved_pad;
>> __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
>> - __u32 s_reserved[160]; /* Padding to the end of the block */
>> + __le32 s_crc; /* CRC32 for this super block */
>> + __u32 s_reserved[159]; /* Padding to the end of the block */
>> };
>>
>> #ifdef __KERNEL__
>> @@ -1051,6 +1061,7 @@ struct ext4_sb_info {
>> u32 s_hash_seed[4];
>> int s_def_hash_version;
>> int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
>> + u8 s_uuid[16];
>> struct percpu_counter s_freeblocks_counter;
>> struct percpu_counter s_freeinodes_counter;
>> struct percpu_counter s_dirs_counter;
>> @@ -1265,6 +1276,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
>> #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
>> #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
>> #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
>> +#define EXT4_FEATURE_RO_COMPAT_SBI_CRC 0x0080 /* sb and inode CRCs */
>>
>> #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
>> #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
>> @@ -1291,7 +1303,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
>> EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
>> EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
>> EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
>> - EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
>> + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
>> + EXT4_FEATURE_RO_COMPAT_SBI_CRC)
>>
>> /*
>> * Default values for user and/or group using reserved blocks
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 42272d6..8ba2f24 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -37,6 +37,7 @@
>> #include <linux/namei.h>
>> #include <linux/uio.h>
>> #include <linux/bio.h>
>> +#include <linux/crc32c.h>
>> #include <linux/workqueue.h>
>> #include <linux/kernel.h>
>> #include <linux/slab.h>
>> @@ -72,6 +73,141 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
>> return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
>> }
>>
>> +#define ZERO_MAGIC 1
>> +
>> +static __le32 inode_crc(struct super_block *sb, struct ext4_inode *raw_inode, long ino)
>> +{
>> + struct ext4_csum_header {
>> + __le64 ino;
>> + __le64 pad;
>> + u8 uuid[16];
>> + } __attribute__((aligned)) hdr;
>> + u32 crc;
>> +
>> + /*
>> + * First CRC in the inode number and the file system UUID
>> + * to detect inodes from before the last mkfs and misplaced inode
>> + * writes.
>> + */
>> + hdr.ino = cpu_to_le64(ino);
>> + memcpy(&hdr.uuid, EXT4_SB(sb)->s_uuid, 16);
>> + hdr.pad = 0;
>> +
>> + raw_inode->i_crc = 0;
>> + crc = crc32c(~0U, &hdr, sizeof(struct ext4_csum_header));
>> + /* Could CRC precompute the common zero tail? (if it's really common) */
>> + crc = crc32c(crc, raw_inode, EXT4_INODE_SIZE(sb));
>> + if (crc == 0)
>> + crc = ZERO_MAGIC;
>> + return cpu_to_le32(crc);
>> +}
>> +
>> +/*
>> + * Update CRC in a inode, including all additional fields after the
>> + * standard inode structure.
>> + *
>> + * This relies on the raw_inode_lock protecting against all writes
>> + * to the raw inode state in the bh. Right now the JBD locking
>> + * is not good enough for that.
>> + *
>> + * This always precomputes the full checksum. In principle it would
>> + * be possible to be more clever and do incremental changes at least
>> + * for some state changes.
>> + */
>> +static void inode_update_crc(struct super_block *sb, struct ext4_inode *raw_inode,
>> + long ino)
>> +{
>> + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
>> + return;
>> + raw_inode->i_crc = inode_crc(sb, raw_inode, ino);
>> +}
>> +
>> +/*
>> + * This is needed because nfsd might try to access dead inodes
>> + * the test is that same one that e2fsck uses
>> + * NeilBrown 1999oct15
>> + */
>> +static int inode_deleted(struct super_block *sb, struct ext4_inode *raw_inode)
>> +{
>> + if (raw_inode->i_links_count == 0) {
>> + if (raw_inode->i_mode == 0 ||
>> + !(EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS))
>> + return -ESTALE;
>> + /*
>> + * The only unlinked inodes we let through here have
>> + * valid i_mode and are being read by the orphan
>> + * recovery code: that's fine, we're about to complete
>> + * the process of deleting those.
>> + */
>> + return 1;
>> + }
>> + return 0;
>> +}
>> +
>> +/*
>> + * Does this checksum-less inode look like a valid inode?
>> + * Do a few sanity checks.
>> + */
>> +static int inode_sanity_check(struct super_block *sb, struct ext4_inode *raw_inode, char **msg)
>> +{
>> + int err = inode_deleted(sb, raw_inode);
>> + if (err)
>> + return err;
>> + if (!test_opt(sb, INODE_SANITY))
>> + return 0;
>> + if (raw_inode->i_mode == 0 ||
>> + raw_inode->i_atime == 0 ||
>> + raw_inode->i_ctime == 0 ||
>> + raw_inode->i_mtime == 0) {
>> + *msg = "inode has invalid zero times";
>> + return -1;
>> + }
>> + /* More sanity checks? */
>> + return 0;
>> +}
>> +
>> +/*
>> + * Check CRC for a newly read inode.
>> + */
>> +static int inode_check_crc(struct super_block *sb, struct ext4_inode *raw_inode,
>> + long ino, char **msg)
>> +{
>> + __le32 crc, check;
>> +
>> + /*
>> + * Special case: zero CRC. This is handled like no checksum yet,
>> + * otherwise tune2fs would need to rewrite all inodes when CRCs
>> + * are turned on. The CRC will be updated when the inode
>> + * is written out.
>> + *
>> + * This however means that if zeroes are blasted over the inodes
>> + * we would think the checksum is not there. So instead do
>> + * some special sanity checks in the other fields when this happens,
>> + * to catch this case.
>> + * This is not 100% fool proof, but should be reasonable.
>> + * When the checksum function returns a real zero we turn that
>> + * into a one to avoid ambiguity.
>> + *
>> + * The sanity check is done unconditionally even if the checksum passed
>> + * because it's cheap enough.
>> + */
>> + crc = raw_inode->i_crc;
>> + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC) && crc
>> + && !test_opt(sb, IGNORE_CRC)) {
>> + check = inode_crc(sb, raw_inode, ino);
>> + if (check != crc) {
>> + *msg = "inode has invalid checksum";
>> + /*
>> + * Restore bad CRC so that if the inode is reread it'll fail
>> + * the check again.
>> + */
>> + raw_inode->i_crc = crc;
>> + return -1;
>> + }
>> + }
>> + return inode_sanity_check(sb, raw_inode, msg);
>> +}
>> +
>> /*
>> * Work out how many blocks we need to proceed with the next chunk of a
>> * truncate transaction.
>> @@ -4996,6 +5132,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>> journal_t *journal = EXT4_SB(sb)->s_journal;
>> long ret;
>> int block;
>> + char *msg = NULL;
>>
>> inode = iget_locked(sb, ino);
>> if (!inode)
>> @@ -5010,6 +5147,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>> if (ret < 0)
>> goto bad_inode;
>> raw_inode = ext4_raw_inode(&iloc);
>> +
>> + /*
>> + * Relies on the inode lock to protect the raw_inode bh contents.
>> + */
>> + ret = inode_check_crc(sb, raw_inode, ino, &msg);
>> + if (ret < 0) {
>> + /*
>> + * Here would be the place to send a "read other mirror"
>> + * retry hint to the block layer.
>> + */
>> + brelse(iloc.bh);
>> + if (ret != -ESTALE)
>> + ext4_error(sb,
>> + "%s: inode=%lu, block=%llu", msg,
>> + ino,
>> + (unsigned long long)iloc.bh->b_blocknr);
>> + goto bad_inode;
>> + }
>> + ret = 0;
>> +
>> inode->i_mode = le16_to_cpu(raw_inode->i_mode);
>> inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
>> inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
>> @@ -5022,23 +5179,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>> ei->i_state_flags = 0;
>> ei->i_dir_start_lookup = 0;
>> ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
>> - /* We now have enough fields to check if the inode was active or not.
>> - * This is needed because nfsd might try to access dead inodes
>> - * the test is that same one that e2fsck uses
>> - * NeilBrown 1999oct15
>> - */
>> - if (inode->i_nlink == 0) {
>> - if (inode->i_mode == 0 ||
>> - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
>> - /* this inode is deleted */
>> - ret = -ESTALE;
>> - goto bad_inode;
>> - }
>> - /* The only unlinked inodes we let through here have
>> - * valid i_mode and are being read by the orphan
>> - * recovery code: that's fine, we're about to complete
>> - * the process of deleting those. */
>> - }
>> ei->i_flags = le32_to_cpu(raw_inode->i_flags);
>> inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
>> ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
>> @@ -5232,11 +5372,19 @@ static int ext4_do_update_inode(handle_t *handle,
>> struct inode *inode,
>> struct ext4_iloc *iloc)
>> {
>> + struct super_block *sb = inode->i_sb;
>> struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
>> struct ext4_inode_info *ei = EXT4_I(inode);
>> struct buffer_head *bh = iloc->bh;
>> int err = 0, rc, block;
>>
>> + /*
>> + * Protect the on disk inode against parallel modification
>> + * until we compute the checksum and pass the resulting block
>> + * to JBD, which protects it then.
>> + */
>> + spin_lock(&ei->raw_inode_lock);
>> +
>> /* For fields not not tracking in the in-memory inode,
>> * initialise them to zero for new inodes. */
>> if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
>> @@ -5291,18 +5439,23 @@ static int ext4_do_update_inode(handle_t *handle,
>> EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
>> EXT4_SB(sb)->s_es->s_rev_level ==
>> cpu_to_le32(EXT4_GOOD_OLD_REV)) {
>> + spin_unlock(&ei->raw_inode_lock);
>> /* If this is the first large file
>> * created, add a flag to the superblock.
>> */
>> err = ext4_journal_get_write_access(handle,
>> EXT4_SB(sb)->s_sbh);
>> - if (err)
>> + if (err) {
>> + spin_lock(&ei->raw_inode_lock);
>> goto out_brelse;
>> + }
>> ext4_update_dynamic_rev(sb);
>> EXT4_SET_RO_COMPAT_FEATURE(sb,
>> EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
>> sb->s_dirt = 1;
>> ext4_handle_sync(handle);
>> + spin_lock(&ei->raw_inode_lock);
>> + inode_update_crc(sb, raw_inode, inode->i_ino);
>> err = ext4_handle_dirty_metadata(handle, NULL,
>> EXT4_SB(sb)->s_sbh);
>> }
>> @@ -5330,6 +5483,7 @@ static int ext4_do_update_inode(handle_t *handle,
>> cpu_to_le32(inode->i_version >> 32);
>> raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
>> }
>> + inode_update_crc(sb, raw_inode, inode->i_ino);
>>
>> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>> rc = ext4_handle_dirty_metadata(handle, NULL, bh);
>> @@ -5339,6 +5493,7 @@ static int ext4_do_update_inode(handle_t *handle,
>>
>> ext4_update_inode_fsync_trans(handle, inode, 0);
>> out_brelse:
>> + spin_unlock(&ei->raw_inode_lock);
>> brelse(bh);
>> ext4_std_error(inode->i_sb, err);
>> return err;
>> @@ -5759,6 +5914,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
>> if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
>> return 0;
>>
>> + spin_lock(&EXT4_I(inode)->raw_inode_lock);
>> +
>> raw_inode = ext4_raw_inode(&iloc);
>>
>> header = IHDR(inode, raw_inode);
>> @@ -5770,10 +5927,12 @@ static int ext4_expand_extra_isize(struct inode *inode,
>> memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
>> new_extra_isize);
>> EXT4_I(inode)->i_extra_isize = new_extra_isize;
>> + spin_unlock(&EXT4_I(inode)->raw_inode_lock);
>> return 0;
>> }
>>
>> /* try to expand with EAs present */
>> + spin_unlock(&EXT4_I(inode)->raw_inode_lock);
>> return ext4_expand_extra_isize_ea(inode, new_extra_isize,
>> raw_inode, handle);
>> }
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 4e8983a..4012753 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -39,6 +39,7 @@
>> #include <linux/ctype.h>
>> #include <linux/log2.h>
>> #include <linux/crc16.h>
>> +#include <linux/crc32c.h>
>> #include <asm/uaccess.h>
>>
>> #include "ext4.h"
>> @@ -70,6 +71,8 @@ static void ext4_write_super(struct super_block *sb);
>> static int ext4_freeze(struct super_block *sb);
>> static int ext4_get_sb(struct file_system_type *fs_type, int flags,
>> const char *dev_name, void *data, struct vfsmount *mnt);
>> +static void ext4_super_update_crc(struct super_block *sb,
>> + struct ext4_super_block *es);
>>
>> #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
>> static struct file_system_type ext3_fs_type = {
>> @@ -342,7 +345,14 @@ static void ext4_handle_error(struct super_block *sb)
>> ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
>> sb->s_flags |= MS_RDONLY;
>> }
>> + /*
>> + * Unfortunately must take the lock here, to make sure there
>> + * is consistent super state for the checksum. This is very easy to get
>> + * wrong in the caller.
>> + */
>> + lock_super(sb);
>> ext4_commit_super(sb, 1);
>> + unlock_super(sb);
>> if (test_opt(sb, ERRORS_PANIC))
>> panic("EXT4-fs (device %s): panic forced after error\n",
>> sb->s_id);
>> @@ -531,7 +541,9 @@ __acquires(bitlock)
>> if (test_opt(sb, ERRORS_CONT)) {
>> EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
>> es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
>> + lock_super(sb);
>> ext4_commit_super(sb, 0);
>> + unlock_super(sb);
>> return;
>> }
>> ext4_unlock_group(sb, grp);
>> @@ -659,9 +671,12 @@ static void ext4_put_super(struct super_block *sb)
>> if (sbi->s_journal) {
>> err = jbd2_journal_destroy(sbi->s_journal);
>> sbi->s_journal = NULL;
>> - if (err < 0)
>> + if (err < 0) {
>> + unlock_super(sb);
>> ext4_abort(sb, __func__,
>> "Couldn't clean up the journal");
>> + lock_super(sb);
>> + }
>> }
>>
>> ext4_release_system_zone(sb);
>> @@ -758,6 +773,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
>> ei->i_da_metadata_calc_len = 0;
>> ei->i_delalloc_reserved_flag = 0;
>> spin_lock_init(&(ei->i_block_reservation_lock));
>> + spin_lock_init(&(ei->raw_inode_lock));
>> #ifdef CONFIG_QUOTA
>> ei->i_reserved_quota = 0;
>> #endif
>> @@ -1161,6 +1177,7 @@ enum {
>> Opt_inode_readahead_blks, Opt_journal_ioprio,
>> Opt_dioread_nolock, Opt_dioread_lock,
>> Opt_discard, Opt_nodiscard,
>> + Opt_noinode_sanity, Opt_ignore_crc,
>> };
>>
>> static const match_table_t tokens = {
>> @@ -1231,6 +1248,8 @@ static const match_table_t tokens = {
>> {Opt_dioread_lock, "dioread_lock"},
>> {Opt_discard, "discard"},
>> {Opt_nodiscard, "nodiscard"},
>> + {Opt_noinode_sanity, "noinode_sanity"},
>> + {Opt_ignore_crc, "ignore_crc"},
>> {Opt_err, NULL},
>> };
>>
>> @@ -1408,6 +1427,12 @@ static int parse_options(char *options, struct super_block *sb,
>> case Opt_orlov:
>> clear_opt(sbi->s_mount_opt, OLDALLOC);
>> break;
>> + case Opt_noinode_sanity:
>> + clear_opt(sbi->s_mount_opt, INODE_SANITY);
>> + break;
>> + case Opt_ignore_crc:
>> + set_opt(sbi->s_mount_opt, IGNORE_CRC);
>> + break;
>> #ifdef CONFIG_EXT4_FS_XATTR
>> case Opt_user_xattr:
>> set_opt(sbi->s_mount_opt, XATTR_USER);
>> @@ -1946,6 +1971,7 @@ static int ext4_check_descriptors(struct super_block *sb)
>> }
>>
>> ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
>> + ext4_super_update_crc(sb, sbi->s_es);
>> sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
>> return 1;
>> }
>> @@ -2431,6 +2457,50 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
>> return 1;
>> }
>>
>> +/* could be removed for SBs once e2fsutils are fixed to always compute
>> + the CRC when the feature is turned on. */
>> +#define ZERO_MAGIC 1
>> +
>> +/*
>> + * Manage CRC32 for the superblock.
>> + */
>> +static int ext4_super_check_crc(struct super_block *sb,
>> + struct ext4_super_block *es)
>> +{
>> + u32 crc, check;
>> +
>> + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
>> + return 0;
>> + crc = le32_to_cpu(es->s_crc);
>> + if (crc == 0 || test_opt(sb, IGNORE_CRC)) {
>> + ext4_msg(sb, KERN_INFO, "super block checksum ignored");
>> + return 0;
>> + }
>> + es->s_crc = 0;
>> + check = crc32c(~0U, es, sizeof(struct ext4_super_block));
>> + if (check == 0)
>> + check = ZERO_MAGIC;
>> + if (check != crc)
>> + return -1;
>> + /* Remove me */
>> + ext4_msg(sb, KERN_INFO, "super block checksum passed");
>> + return 0;
>> +}
>> +
>> +static void ext4_super_update_crc(struct super_block *sb,
>> + struct ext4_super_block *es)
>> +{
>> + u32 crc;
>> +
>> + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SBI_CRC))
>> + return;
>> + es->s_crc = 0;
>> + crc = crc32c(~0U, es, sizeof(struct ext4_super_block));
>> + if (crc == 0)
>> + crc = ZERO_MAGIC;
>> + es->s_crc = cpu_to_le32(crc);
>> +}
>> +
>> static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>> __releases(kernel_lock)
>> __acquires(kernel_lock)
>> @@ -2554,6 +2624,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>> sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
>>
>> set_opt(sbi->s_mount_opt, BARRIER);
>> + set_opt(sbi->s_mount_opt, INODE_SANITY);
>>
>> /*
>> * enable delayed allocation by default
>> @@ -2585,6 +2656,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>> if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
>> goto failed_mount;
>>
>> + if (ext4_super_check_crc(sb, es) < 0) {
>> + ext4_msg(sb, KERN_ERR, "Invalid checksum in super block");
>> + goto failed_mount;
>> + }
>> +
>> blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
>>
>> if (blocksize < EXT4_MIN_BLOCK_SIZE ||
>> @@ -2675,6 +2751,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>>
>> for (i = 0; i < 4; i++)
>> sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
>> + memcpy(sbi->s_uuid, es->s_uuid, 16);
>> sbi->s_def_hash_version = es->s_def_hash_version;
>> i = le32_to_cpu(es->s_flags);
>> if (i & EXT2_FLAGS_UNSIGNED_HASH)
>> @@ -3393,6 +3470,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
>> es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
>> &EXT4_SB(sb)->s_freeinodes_counter));
>> sb->s_dirt = 0;
>> + /* can be removed if this is properly journaled, see
>> + * http://bugzilla.kernel.org/show_bug.cgi?id=14354 */
>> + ext4_super_update_crc(sb, es);
>> BUFFER_TRACE(sbh, "marking dirty");
>> mark_buffer_dirty(sbh);
>> if (sync) {
>> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
>> index 0433800..fbba95a 100644
>> --- a/fs/ext4/xattr.c
>> +++ b/fs/ext4/xattr.c
>> @@ -1171,6 +1171,7 @@ retry:
>>
>> free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
>> if (free >= new_extra_isize) {
>> + spin_lock(&EXT4_I(inode)->raw_inode_lock);
>> entry = IFIRST(header);
>> ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
>> - new_extra_isize, (void *)raw_inode +
>> @@ -1179,6 +1180,7 @@ retry:
>> inode->i_sb->s_blocksize);
>> EXT4_I(inode)->i_extra_isize = new_extra_isize;
>> error = 0;
>> + spin_unlock(&EXT4_I(inode)->raw_inode_lock);
>> goto cleanup;
>> }
>>
>> @@ -1301,6 +1303,7 @@ retry:
>> if (error)
>> goto cleanup;
>>
>> + spin_lock(&EXT4_I(inode)->raw_inode_lock);
>> entry = IFIRST(header);
>> if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
>> shift_bytes = new_extra_isize;
>> @@ -1312,6 +1315,7 @@ retry:
>> EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
>> (void *)header, total_ino - entry_size,
>> inode->i_sb->s_blocksize);
>> + spin_unlock(&EXT4_I(inode)->raw_inode_lock);
>>
>> extra_isize += shift_bytes;
>> new_extra_isize -= shift_bytes;
>>
>>
>> --
>> [email protected] -- Speaking for myself only

Cheers, Andreas