We temporarily set the checksum fields in the buffers of some types of
metadata to '0' in order to verify the checksum values. Because this is
done without locking the buffer, the checksums of metadata blocks that
are being committed or written back to storage can be damaged.
In our tests, several metadata blocks were found with damaged metadata
checksum values during the recovery process. When we are only verifying
a checksum value, we must avoid modifying the checksum fields directly.
Signed-off-by: Daeho Jeong <[email protected]>
Signed-off-by: Youngjin Gil <[email protected]>
---
fs/ext4/inode.c | 38 ++++++++++++++++++++++----------------
fs/ext4/namei.c | 9 ++++-----
fs/ext4/super.c | 18 +++++++++---------
fs/ext4/xattr.c | 13 +++++++------
4 files changed, 42 insertions(+), 36 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 971892d..5ca71aa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -51,25 +51,31 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- __u16 csum_lo;
- __u16 csum_hi = 0;
__u32 csum;
+ __u16 dummy_csum = 0;
+ int offset = offsetof(struct ext4_inode, i_checksum_lo);
+ unsigned int csum_size = sizeof(dummy_csum);
- csum_lo = le16_to_cpu(raw->i_checksum_lo);
- raw->i_checksum_lo = 0;
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
- EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum_hi = le16_to_cpu(raw->i_checksum_hi);
- raw->i_checksum_hi = 0;
- }
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
+ csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+ offset += csum_size;
+ csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ EXT4_GOOD_OLD_INODE_SIZE - offset);
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
- EXT4_INODE_SIZE(inode->i_sb));
-
- raw->i_checksum_lo = cpu_to_le16(csum_lo);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
- EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
- raw->i_checksum_hi = cpu_to_le16(csum_hi);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ offset = offsetof(struct ext4_inode, i_checksum_hi);
+ csum = ext4_chksum(sbi, csum, (__u8 *)raw +
+ EXT4_GOOD_OLD_INODE_SIZE,
+ offset - EXT4_GOOD_OLD_INODE_SIZE);
+ if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+ csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+ csum_size);
+ offset += csum_size;
+ csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ EXT4_INODE_SIZE(inode->i_sb) -
+ offset);
+ }
+ }
return csum;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ec811bb..4a918f8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -420,15 +420,14 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
- __le32 save_csum;
int size;
+ __u32 dummy_csum = 0;
+ int offset = offsetof(struct dx_tail, dt_checksum);
size = count_offset + (count * sizeof(struct dx_entry));
- save_csum = t->dt_checksum;
- t->dt_checksum = 0;
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
- csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
- t->dt_checksum = save_csum;
+ csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
+ csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
return cpu_to_le32(csum);
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index de02a9e..b6cb89a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2058,23 +2058,25 @@ failed:
static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
- int offset;
+ int offset = offsetof(struct ext4_group_desc, bg_checksum);
__u16 crc = 0;
__le32 le_group = cpu_to_le32(block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (ext4_has_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
- __le16 save_csum;
__u32 csum32;
+ __u16 dummy_csum = 0;
- save_csum = gdp->bg_checksum;
- gdp->bg_checksum = 0;
csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
- sbi->s_desc_size);
- gdp->bg_checksum = save_csum;
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
+ sizeof(dummy_csum));
+ offset += sizeof(dummy_csum);
+ if (offset < sbi->s_desc_size)
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
+ sbi->s_desc_size - offset);
crc = csum32 & 0xFFFF;
goto out;
@@ -2084,8 +2086,6 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
if (!ext4_has_feature_gdt_csum(sb))
return 0;
- offset = offsetof(struct ext4_group_desc, bg_checksum);
-
crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
crc = crc16(crc, (__u8 *)gdp, offset);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 0441e05..988b379 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -121,17 +121,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
- __le32 save_csum;
__le64 dsk_block_nr = cpu_to_le64(block_nr);
+ __u32 dummy_csum = 0;
+ int offset = offsetof(struct ext4_xattr_header, h_checksum);
- save_csum = hdr->h_checksum;
- hdr->h_checksum = 0;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
- EXT4_BLOCK_SIZE(inode->i_sb));
+ csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
+ csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ offset += sizeof(dummy_csum);
+ csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
+ EXT4_BLOCK_SIZE(inode->i_sb) - offset);
- hdr->h_checksum = save_csum;
return cpu_to_le32(csum);
}
--
1.7.9.5
On Wed, Jun 15, 2016 at 03:12:53PM +0900, Daeho Jeong wrote:
> We temporarily change checksum fields in buffers of some types of
> metadata into '0' for verifying the checksum values. By doing this
> without locking the buffer, some metadata's checksums, which are
> being committed or written back to the storage, could be damaged.
> In our test, several metadata blocks were found with damaged metadata
> checksum value during recovery process. When we only verify the
> checksum value, we have to avoid modifying checksum fields directly.
/me wonders how it is we end up writing a block to disk while the
checksum is being calculated?
Is something verifying the block at the same time the journal is
writing the same block out via replay? Or even just a regular commit?
If that's the case, then yes, you're right, we can't touch a single
bit on a metadata block without a transaction protecting it. Oops.
Or is something prepping the block to get logged at the same time the
journal is writing it? I'm confused, I wouldn't have thought jbd2
would allow further changes to a block after a transaction finishes,
at least not without making a private copy of that block...
...but as far as the patch goes, it looks good enough to fix the
corruption-on-replay bug.
Reviewed-by: Darrick J. Wong <[email protected]>
--D
>
> Signed-off-by: Daeho Jeong <[email protected]>
> Signed-off-by: Youngjin Gil <[email protected]>
> ---
> fs/ext4/inode.c | 38 ++++++++++++++++++++++----------------
> fs/ext4/namei.c | 9 ++++-----
> fs/ext4/super.c | 18 +++++++++---------
> fs/ext4/xattr.c | 13 +++++++------
> 4 files changed, 42 insertions(+), 36 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 971892d..5ca71aa 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -51,25 +51,31 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
> struct ext4_inode_info *ei)
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> - __u16 csum_lo;
> - __u16 csum_hi = 0;
> __u32 csum;
> + __u16 dummy_csum = 0;
> + int offset = offsetof(struct ext4_inode, i_checksum_lo);
> + unsigned int csum_size = sizeof(dummy_csum);
>
> - csum_lo = le16_to_cpu(raw->i_checksum_lo);
> - raw->i_checksum_lo = 0;
> - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
> - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
> - csum_hi = le16_to_cpu(raw->i_checksum_hi);
> - raw->i_checksum_hi = 0;
> - }
> + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
> + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
> + offset += csum_size;
> + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
> + EXT4_GOOD_OLD_INODE_SIZE - offset);
>
> - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
> - EXT4_INODE_SIZE(inode->i_sb));
> -
> - raw->i_checksum_lo = cpu_to_le16(csum_lo);
> - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
> - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
> - raw->i_checksum_hi = cpu_to_le16(csum_hi);
> + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
> + offset = offsetof(struct ext4_inode, i_checksum_hi);
> + csum = ext4_chksum(sbi, csum, (__u8 *)raw +
> + EXT4_GOOD_OLD_INODE_SIZE,
> + offset - EXT4_GOOD_OLD_INODE_SIZE);
> + if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
> + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
> + csum_size);
> + offset += csum_size;
> + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
> + EXT4_INODE_SIZE(inode->i_sb) -
> + offset);
> + }
> + }
>
> return csum;
> }
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index ec811bb..4a918f8 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -420,15 +420,14 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> struct ext4_inode_info *ei = EXT4_I(inode);
> __u32 csum;
> - __le32 save_csum;
> int size;
> + __u32 dummy_csum = 0;
> + int offset = offsetof(struct dx_tail, dt_checksum);
>
> size = count_offset + (count * sizeof(struct dx_entry));
> - save_csum = t->dt_checksum;
> - t->dt_checksum = 0;
> csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
> - csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
> - t->dt_checksum = save_csum;
> + csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
> + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
>
> return cpu_to_le32(csum);
> }
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index de02a9e..b6cb89a 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -2058,23 +2058,25 @@ failed:
> static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
> struct ext4_group_desc *gdp)
> {
> - int offset;
> + int offset = offsetof(struct ext4_group_desc, bg_checksum);
> __u16 crc = 0;
> __le32 le_group = cpu_to_le32(block_group);
> struct ext4_sb_info *sbi = EXT4_SB(sb);
>
> if (ext4_has_metadata_csum(sbi->s_sb)) {
> /* Use new metadata_csum algorithm */
> - __le16 save_csum;
> __u32 csum32;
> + __u16 dummy_csum = 0;
>
> - save_csum = gdp->bg_checksum;
> - gdp->bg_checksum = 0;
> csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
> sizeof(le_group));
> - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
> - sbi->s_desc_size);
> - gdp->bg_checksum = save_csum;
> + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
> + csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
> + sizeof(dummy_csum));
> + offset += sizeof(dummy_csum);
> + if (offset < sbi->s_desc_size)
> + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
> + sbi->s_desc_size - offset);
>
> crc = csum32 & 0xFFFF;
> goto out;
> @@ -2084,8 +2086,6 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
> if (!ext4_has_feature_gdt_csum(sb))
> return 0;
>
> - offset = offsetof(struct ext4_group_desc, bg_checksum);
> -
> crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
> crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
> crc = crc16(crc, (__u8 *)gdp, offset);
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 0441e05..988b379 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -121,17 +121,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> __u32 csum;
> - __le32 save_csum;
> __le64 dsk_block_nr = cpu_to_le64(block_nr);
> + __u32 dummy_csum = 0;
> + int offset = offsetof(struct ext4_xattr_header, h_checksum);
>
> - save_csum = hdr->h_checksum;
> - hdr->h_checksum = 0;
> csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
> sizeof(dsk_block_nr));
> - csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
> - EXT4_BLOCK_SIZE(inode->i_sb));
> + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
> + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
> + offset += sizeof(dummy_csum);
> + csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
> + EXT4_BLOCK_SIZE(inode->i_sb) - offset);
>
> - hdr->h_checksum = save_csum;
> return cpu_to_le32(csum);
> }
>
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Darrick,
> Is something verifying the block at the same time the journal is
> writing the same block out via replay? Or even just a regular commit?
> If that's the case, then yes, you're right, we can't touch a single
> bit on a metadata block without a transaction protecting it. Oops.
As far as I know, if you want to modify a metadata block, you have to
get write access to that block in advance. This notifies the jbd2
journaling module of your intention, so that jbd2 can control all
modifications to the metadata block and, if necessary, make a copy of
the original metadata block before it is changed.
However, if we touch a metadata block without jbd2's approval, jbd2 doesn't
know what we are doing and cannot do anything for us. So we can end up
modifying the content of the metadata block even while it is being written back.
On Wed, Jun 15, 2016 at 03:12:53PM +0900, Daeho Jeong wrote:
> We temporarily change checksum fields in buffers of some types of
> metadata into '0' for verifying the checksum values. By doing this
> without locking the buffer, some metadata's checksums, which are
> being committed or written back to the storage, could be damaged.
> In our test, several metadata blocks were found with damaged metadata
> checksum value during recovery process. When we only verify the
> checksum value, we have to avoid modifying checksum fields directly.
>
> Signed-off-by: Daeho Jeong <[email protected]>
> Signed-off-by: Youngjin Gil <[email protected]>
Applied, thanks!
- Ted