2006-10-23 16:44:16

by Andreas Dilger

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On Oct 23, 2006 16:45 +0200, Andre Noll wrote:
> stress tests on a 6.3T ext3 filesystem which runs on top of software
> raid 6 revealed the following:
>
> [663594.224641] init_special_inode: bogus i_mode (4412)
> [663596.355652] init_special_inode: bogus i_mode (5123)
> [663596.355832] init_special_inode: bogus i_mode (71562)

This would appear to be inode table corruption.

> [663763.319514] EXT3-fs error (device md0): ext3_new_block: Allocating block in system zone - blocks from 517570560, length 1

This is bitmap corruption.

> [663763.339423] EXT3-fs error (device md0): ext3_free_blocks: Freeing blocks in system zones - Block = 517570560, count = 1

Hmm, this would appear to be a buglet in error handling. If the block just
allocated above is in the system zone it should be marked in-use in the
bitmap but otherwise ignored. We definitely should NOT be freeing it on
error.

Yikes! It seems a patch I submitted to 2.4 that fixed the behaviour
of ext3_new_block() so that if we detect this block shouldn't be
allocated it is skipped instead of corrupting the filesystem if it
is running with errors=continue...

It looks like ext3_free_blocks() needs a similar fix - i.e. report an
error and don't actually free those blocks.

> This system is currently not in production use, so I can use it for tests ATM.

I would suggest that you give this a try on RAID5, just for starters.
I'm not aware of any problems in the existing ext3 code for anything
below 8TB.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.


2006-10-23 20:02:48

by Andreas Dilger

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On Oct 23, 2006 10:44 -0600, Andreas Dilger wrote:
> Hmm, this would appear to be a buglet in error handling. If the block just
> allocated above is in the system zone it should be marked in-use in the
> bitmap but otherwise ignored. We definitely should NOT be freeing it on
> error.
>
> Yikes! It seems a patch I submitted to 2.4 that fixed the behaviour
> of ext3_new_block() so that if we detect this block shouldn't be
> allocated it is skipped instead of corrupting the filesystem if it
> is running with errors=continue...
>
> It looks like ext3_free_blocks() needs a similar fix - i.e. report an
> error and don't actually free those blocks.

I found a URL for the 2.4 version of this patch, if some kind soul would
update it for 2.6 it might save someone's data in the future. It looks
at the time I wasn't working on 2.5 kernels and nobody else took it on.

http://lkml.org/lkml/2003/4/7/252

[patch is pasted, may not be free of whitespace munging, but then it will
need to be applied to 2.6 by hand anyways]
======================= ext2-2.4.18-badalloc.diff ===========================
--- linux-2.4.18.orig/fs/ext3/balloc.c Wed Feb 27 10:31:59 2002
+++ linux-2.4.18-aed/fs/ext3/balloc.c Mon Mar 18 17:15:46 2002
@@ -276,7 +273,8 @@ void ext3_free_blocks
}
lock_super (sb);
es = sb->u.ext3_sb.s_es;
- if (block < le32_to_cpu(es->s_first_data_block) ||
+ if (block < le32_to_cpu(es->s_first_data_block) ||
+ block + count < block ||
(block + count) > le32_to_cpu(es->s_blocks_count)) {
ext3_error (sb, "ext3_free_blocks",
"Freeing blocks not in datazone - "
@@ -309,17 +307,6 @@ void ext3_free_blocks
if (!gdp)
goto error_return;

- if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
- in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
- in_range (block, le32_to_cpu(gdp->bg_inode_table),
- sb->u.ext3_sb.s_itb_per_group) ||
- in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
- sb->u.ext3_sb.s_itb_per_group))
- ext3_error (sb, "ext3_free_blocks",
- "Freeing blocks in system zones - "
- "Block = %lu, count = %lu",
- block, count);
-
/*
* We are about to start releasing blocks in the bitmap,
* so we need undo access.
@@ -345,14 +332,24 @@ void ext3_free_blocks
if (err)
goto error_return;

- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++, block++) {
+ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+ in_range(block, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group)) {
+ ext3_error(sb, __FUNCTION__,
+ "Freeing block in system zone - block =
%lu",
+ block);
+ continue;
+ }
+
/*
* An HJ special. This is expensive...
*/
#ifdef CONFIG_JBD_DEBUG
{
struct buffer_head *debug_bh;
- debug_bh = sb_get_hash_table(sb, block + i);
+ debug_bh = sb_get_hash_table(sb, block);
if (debug_bh) {
BUFFER_TRACE(debug_bh, "Deleted!");
if (!bh2jh(bitmap_bh)->b_committed_data)
@@ -365,9 +362,8 @@ void ext3_free_blocks
#endif
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
- ext3_error (sb, __FUNCTION__,
- "bit already cleared for block %lu",
- block + i);
+ ext3_error(sb, __FUNCTION__,
+ "bit already cleared for block %lu",
block);
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
dquot_freed_blocks++;
@@ -415,7 +411,6 @@ void ext3_free_blocks
if (!err) err = ret;

if (overflow && !err) {
- block += count;
count = overflow;
goto do_more;
}
@@ -575,6 +574,7 @@ int ext3_new_block

ext3_debug ("goal=%lu.\n", goal);

+repeat:
/*
* First, test whether the goal block is free.
*/
@@ -684,10 +684,21 @@ int ext3_new_block
if (tmp == le32_to_cpu(gdp->bg_block_bitmap) ||
tmp == le32_to_cpu(gdp->bg_inode_bitmap) ||
in_range (tmp, le32_to_cpu(gdp->bg_inode_table),
- sb->u.ext3_sb.s_itb_per_group))
- ext3_error (sb, "ext3_new_block",
- "Allocating block in system zone - "
- "block = %u", tmp);
+ EXT3_SB(sb)->s_itb_per_group)) {
+ ext3_error(sb, __FUNCTION__,
+ "Allocating block in system zone - block = %u",
tmp);
+
+ /* Note: This will potentially use up one of the handle's
+ * buffer credits. Normally we have way too many credits,
+ * so that is OK. In _very_ rare cases it might not be OK.
+ * We will trigger an assertion if we run out of credits,
+ * and we will have to do a full fsck of the filesystem -
+ * better than randomly corrupting filesystem metadata.
+ */
+ ext3_set_bit(j, bh->b_data);
+ goto repeat;
+ }
+

/* The superblock lock should guard against anybody else beating
* us to this point! */

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-10-24 09:15:09

by Andre Noll

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On 14:02, Andreas Dilger wrote:

> I found a URL for the 2.4 version of this patch, if some kind soul would
> update it for 2.6 it might save someone's data in the future.

Something like the this? (only compile tested). And no, I do _not_ know,
what I'm doing ;)

Thanks
Andre


diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 063d994..da2bd51 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -359,17 +359,6 @@ do_more:
if (!desc)
goto error_return;

- if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
- in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
- in_range (block, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group) ||
- in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group))
- ext3_error (sb, "ext3_free_blocks",
- "Freeing blocks in system zones - "
- "Block = "E3FSBLK", count = %lu",
- block, count);
-
/*
* We are about to start releasing blocks in the bitmap,
* so we need undo access.
@@ -392,7 +381,17 @@ do_more:

jbd_lock_bh_state(bitmap_bh);

- for (i = 0, group_freed = 0; i < count; i++) {
+ for (i = 0, group_freed = 0; i < count; i++, block++) {
+ struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
+ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+ in_range(block, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group)) {
+ ext3_error(sb, __FUNCTION__,
+ "Freeing block in system zone - block = %lu",
+ block);
+ continue;
+ }
/*
* An HJ special. This is expensive...
*/
@@ -400,7 +399,7 @@ #ifdef CONFIG_JBD_DEBUG
jbd_unlock_bh_state(bitmap_bh);
{
struct buffer_head *debug_bh;
- debug_bh = sb_find_get_block(sb, block + i);
+ debug_bh = sb_find_get_block(sb, block);
if (debug_bh) {
BUFFER_TRACE(debug_bh, "Deleted!");
if (!bh2jh(bitmap_bh)->b_committed_data)
@@ -452,7 +451,7 @@ #endif
jbd_unlock_bh_state(bitmap_bh);
ext3_error(sb, __FUNCTION__,
"bit already cleared for block "E3FSBLK,
- block + i);
+ block);
jbd_lock_bh_state(bitmap_bh);
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
@@ -479,7 +478,6 @@ #endif
*pdquot_freed_blocks += group_freed;

if (overflow && !err) {
- block += count;
count = overflow;
goto do_more;
}
@@ -1260,7 +1258,7 @@ #endif
*errp = -ENOSPC;
goto out;
}
-
+repeat:
/*
* First, test whether the goal block is free.
*/
@@ -1372,12 +1370,24 @@ allocated:
in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
EXT3_SB(sb)->s_itb_per_group) ||
in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group))
- ext3_error(sb, "ext3_new_block",
+ EXT3_SB(sb)->s_itb_per_group)) {
+ int j;
+ ext3_error(sb, __FUNCTION__,
"Allocating block in system zone - "
"blocks from "E3FSBLK", length %lu",
ret_block, num);
-
+ /* Note: This will potentially use up one of the handle's
+ * buffer credits. Normally we have way too many credits,
+ * so that is OK. In _very_ rare cases it might not be OK.
+ * We will trigger an assertion if we run out of credits,
+ * and we will have to do a full fsck of the filesystem -
+ * better than randomly corrupting filesystem metadata.
+ */
+ j = find_next_usable_block(-1, gdp, EXT3_BLOCKS_PER_GROUP(sb));
+ if (j >= 0)
+ ext3_set_bit(j, gdp_bh->b_data);
+ goto repeat;
+ }
performed_allocation = 1;

#ifdef CONFIG_JBD_DEBUG
--
The only person who always got his work done by Friday was Robinson Crusoe


Attachments:
(No filename) (3.58 kB)
signature.asc (189.00 B)
Digital signature
Download all attachments

2006-10-24 20:27:19

by Andreas Dilger

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On Oct 24, 2006 11:14 +0200, Andre Noll wrote:
> On 14:02, Andreas Dilger wrote:
> Something like the this? (only compile tested). And no, I do _not_ know,
> what I'm doing ;)

Don't worry, everyone starts out not knowing what they are doing.
The ext3_free_blocks() part looks OK from a cursory review.

> @@ -1372,12 +1370,24 @@ allocated:
> in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
> EXT3_SB(sb)->s_itb_per_group) ||
> in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
> + EXT3_SB(sb)->s_itb_per_group)) {
> + int j;
> + ext3_error(sb, __FUNCTION__,
> "Allocating block in system zone - "
> "blocks from "E3FSBLK", length %lu",
> ret_block, num);
> -
> + /* Note: This will potentially use up one of the handle's
> + * buffer credits. Normally we have way too many credits,
> + * so that is OK. In _very_ rare cases it might not be OK.
> + * We will trigger an assertion if we run out of credits,
> + * and we will have to do a full fsck of the filesystem -
> + * better than randomly corrupting filesystem metadata.
> + */
> + j = find_next_usable_block(-1, gdp, EXT3_BLOCKS_PER_GROUP(sb));

I'm not sure why the "find_next_usable_block()" part is in here? At this
point we KNOW that ret_block is not a block we should be allocating, yet
it is marked free in the bitmap. So we should just mark the block(s) in-use
in the bitmap and look for a different block(s).

> + if (j >= 0)
> + ext3_set_bit(j, gdp_bh->b_data);

Note that we need to loop over "num" blocks and set any bits that should
be set, as is done in the ext3_free_blocks() code. This function has
changed since 2.4 in that it can now potentially allocate multiple blocks.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-10-25 09:44:48

by Andre Noll

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On 14:27, Andreas Dilger wrote:

> > + j = find_next_usable_block(-1, gdp, EXT3_BLOCKS_PER_GROUP(sb));
>
> I'm not sure why the "find_next_usable_block()" part is in here? At this
> point we KNOW that ret_block is not a block we should be allocating, yet
> it is marked free in the bitmap. So we should just mark the block(s) in-use
> in the bitmap and look for a different block(s).

Are you saying that ext3_set_bit() should simply be called with
"ret_block" as its first argument? If yes, that is what the revised
patch below does.

Thanks
Andre

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 063d994..3cca317 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -359,17 +359,6 @@ do_more:
if (!desc)
goto error_return;

- if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
- in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
- in_range (block, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group) ||
- in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group))
- ext3_error (sb, "ext3_free_blocks",
- "Freeing blocks in system zones - "
- "Block = "E3FSBLK", count = %lu",
- block, count);
-
/*
* We are about to start releasing blocks in the bitmap,
* so we need undo access.
@@ -392,7 +381,17 @@ do_more:

jbd_lock_bh_state(bitmap_bh);

- for (i = 0, group_freed = 0; i < count; i++) {
+ for (i = 0, group_freed = 0; i < count; i++, block++) {
+ struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
+ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+ in_range(block, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group)) {
+ ext3_error(sb, __FUNCTION__,
+ "Freeing block in system zone - block = %lu",
+ block);
+ continue;
+ }
/*
* An HJ special. This is expensive...
*/
@@ -400,7 +399,7 @@ #ifdef CONFIG_JBD_DEBUG
jbd_unlock_bh_state(bitmap_bh);
{
struct buffer_head *debug_bh;
- debug_bh = sb_find_get_block(sb, block + i);
+ debug_bh = sb_find_get_block(sb, block);
if (debug_bh) {
BUFFER_TRACE(debug_bh, "Deleted!");
if (!bh2jh(bitmap_bh)->b_committed_data)
@@ -452,7 +451,7 @@ #endif
jbd_unlock_bh_state(bitmap_bh);
ext3_error(sb, __FUNCTION__,
"bit already cleared for block "E3FSBLK,
- block + i);
+ block);
jbd_lock_bh_state(bitmap_bh);
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
@@ -479,7 +478,6 @@ #endif
*pdquot_freed_blocks += group_freed;

if (overflow && !err) {
- block += count;
count = overflow;
goto do_more;
}
@@ -1260,7 +1258,7 @@ #endif
*errp = -ENOSPC;
goto out;
}
-
+repeat:
/*
* First, test whether the goal block is free.
*/
@@ -1372,12 +1370,21 @@ allocated:
in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
EXT3_SB(sb)->s_itb_per_group) ||
in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group))
- ext3_error(sb, "ext3_new_block",
+ EXT3_SB(sb)->s_itb_per_group)) {
+ ext3_error(sb, __FUNCTION__,
"Allocating block in system zone - "
"blocks from "E3FSBLK", length %lu",
ret_block, num);
-
+ /* Note: This will potentially use up one of the handle's
+ * buffer credits. Normally we have way too many credits,
+ * so that is OK. In _very_ rare cases it might not be OK.
+ * We will trigger an assertion if we run out of credits,
+ * and we will have to do a full fsck of the filesystem -
+ * better than randomly corrupting filesystem metadata.
+ */
+ ext3_set_bit(ret_block, gdp_bh->b_data);
+ goto repeat;
+ }
performed_allocation = 1;

#ifdef CONFIG_JBD_DEBUG
--
The only person who always got his work done by Friday was Robinson Crusoe


Attachments:
(No filename) (3.77 kB)
signature.asc (189.00 B)
Digital signature
Download all attachments

2006-10-26 09:36:15

by Andreas Dilger

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On Oct 25, 2006 11:44 +0200, Andre Noll wrote:
> Are you saying that ext3_set_bit() should simply be called with
> "ret_block" as its first argument? If yes, that is what the revised
> patch below does.

You might need to call ext3_set_bit_atomic() (as claim_block() does,
not sure.

> @@ -1372,12 +1370,21 @@ allocated:
> in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
> EXT3_SB(sb)->s_itb_per_group) ||
> in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
> + EXT3_SB(sb)->s_itb_per_group)) {
> + ext3_error(sb, __FUNCTION__,
> "Allocating block in system zone - "
> "blocks from "E3FSBLK", length %lu",
> ret_block, num);
> + /* Note: This will potentially use up one of the handle's
> + * buffer credits. Normally we have way too many credits,
> + * so that is OK. In _very_ rare cases it might not be OK.
> + * We will trigger an assertion if we run out of credits,
> + * and we will have to do a full fsck of the filesystem -
> + * better than randomly corrupting filesystem metadata.
> + */
> + ext3_set_bit(ret_block, gdp_bh->b_data);
> + goto repeat;
> + }

The other issue is that you need to potentially set "num" bits in the
bitmap here, if those all overlap metadata. In fact, it might just
make more sense at this stage to walk all of the bits in the bitmaps,
the inode table and the backup superblock and group descriptor to see
if they need fixing also.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-10-26 16:03:16

by Andre Noll

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On 03:36, Andreas Dilger wrote:

> On Oct 25, 2006 11:44 +0200, Andre Noll wrote:
> > Are you saying that ext3_set_bit() should simply be called with
> > "ret_block" as its first argument? If yes, that is what the revised
> > patch below does.
>
> You might need to call ext3_set_bit_atomic() (as claim_block() does,
> not sure.

I _think_ it doesn't matter much which one is used as on most archs
ext3_set_bit_atomic() is only a wrapper for test_and_set_bit() just like
ext3_set_bit() is. The only exceptions seem to be those archs that use
the generic bitops and m68knommu.

> The other issue is that you need to potentially set "num" bits in the
> bitmap here, if those all overlap metadata. In fact, it might just
> make more sense at this stage to walk all of the bits in the bitmaps,
> the inode table and the backup superblock and group descriptor to see
> if they need fixing also.

I tried to implement this, but I could not find out how to check at this
point whether a given bit (in the block bitmap, say) needs fixing.

Thanks
Andre
--
The only person who always got his work done by Friday was Robinson Crusoe


Attachments:
(No filename) (1.10 kB)
signature.asc (189.00 B)
Digital signature
Download all attachments

2006-10-26 18:01:33

by Andreas Dilger

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On Oct 26, 2006 18:02 +0200, Andre Noll wrote:
> On 03:36, Andreas Dilger wrote:
> > The other issue is that you need to potentially set "num" bits in the
> > bitmap here, if those all overlap metadata. In fact, it might just
> > make more sense at this stage to walk all of the bits in the bitmaps,
> > the inode table and the backup superblock and group descriptor to see
> > if they need fixing also.
>
> I tried to implement this, but I could not find out how to check at this
> point whether a given bit (in the block bitmap, say) needs fixing.

Well, since we know at least one bit needs fixing and results in the block
being written to disk then setting the bits for all of the other metadata
blocks in this group has no extra IO cost (only a tiny amount of CPU).
Re-setting the bits if they are already set is not harmful.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-10-27 15:34:52

by Andre Noll

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On 12:01, Andreas Dilger wrote:
> On Oct 26, 2006 18:02 +0200, Andre Noll wrote:
> > On 03:36, Andreas Dilger wrote:
> > > The other issue is that you need to potentially set "num" bits in the
> > > bitmap here, if those all overlap metadata. In fact, it might just
> > > make more sense at this stage to walk all of the bits in the bitmaps,
> > > the inode table and the backup superblock and group descriptor to see
> > > if they need fixing also.
> >
> > I tried to implement this, but I could not find out how to check at this
> > point whether a given bit (in the block bitmap, say) needs fixing.
>
> Well, since we know at least one bit needs fixing and results in the block
> being written to disk then setting the bits for all of the other metadata
> blocks in this group has no extra IO cost (only a tiny amount of CPU).
> Re-setting the bits if they are already set is not harmful.

I.e, something like

int i;
ext3_fsblk_t bit;
unsigned long gdblocks = EXT3_SB(sb)->s_gdb_count;

for (i = 0, bit = 1; i < gdblocks; i++, bit++)
ext3_set_bit(bit, gdp_bh->b_data);

Is that correct?

Andre
--
The only person who always got his work done by Friday was Robinson Crusoe


Attachments:
(No filename) (1.20 kB)
signature.asc (189.00 B)
Digital signature
Download all attachments

2006-10-28 14:25:01

by Andreas Dilger

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On Oct 27, 2006 17:34 +0200, Andre Noll wrote:
> On 12:01, Andreas Dilger wrote:
> > Well, since we know at least one bit needs fixing and results in the block
> > being written to disk then setting the bits for all of the other metadata
> > blocks in this group has no extra IO cost (only a tiny amount of CPU).
> > Re-setting the bits if they are already set is not harmful.
>
> I.e, something like
>
> int i;
> ext3_fsblk_t bit;
> unsigned long gdblocks = EXT3_SB(sb)->s_gdb_count;
>
> for (i = 0, bit = 1; i < gdblocks; i++, bit++)
> ext3_set_bit(bit, gdp_bh->b_data);
>
> Is that correct?

Well, it needs to also handle backup superblock, bitmaps, inode table:

if (ext3_bg_has_super())
ext3_set_bit(0, gdp_bh->b_data);
gdblocks = ext3_bg_num_gdb(sb, group);
for (i = 0, bit = 1; i < gdblocks; i++, bit++)
/* actually a bit more complex for INCOMPAT_META_BG fs */
ext3_set_bit(i, gdp_bh->b_data);
ext3_set_bit(gdp->bg_inode_bitmap % EXT3_BLOCKS_PER_GROUP(sb), ...);
ext3_set_bit(gdp->bg_block_bitmap % EXT3_BLOCKS_PER_GROUP(sb), ...);
for (i = 0, bit = gdp->bg_inode_table % EXT3_BLOCKS_PER_GROUP(sb);
i < EXT3_SB(sb)->s_itb_per_group; i++, bit++)
ext3_set_bit(i, gdp_bh->b_data);

(or something close to this).

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-10-30 09:56:20

by Andre Noll

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On 22:24, Andreas Dilger wrote:
> Well, it needs to also handle backup superblock, bitmaps, inode table:
>
> if (ext3_bg_has_super())
> ext3_set_bit(0, gdp_bh->b_data);
> gdblocks = ext3_bg_num_gdb(sb, group);
> for (i = 0, bit = 1; i < gdblocks; i++, bit++)
> /* actually a bit more complex for INCOMPAT_META_BG fs */
> ext3_set_bit(i, gdp_bh->b_data);
> ext3_set_bit(gdp->bg_inode_bitmap % EXT3_BLOCKS_PER_GROUP(sb), ...);
> ext3_set_bit(gdp->bg_block_bitmap % EXT3_BLOCKS_PER_GROUP(sb), ...);
> for (i = 0, bit = gdp->bg_inode_table % EXT3_BLOCKS_PER_GROUP(sb);
> i < EXT3_SB(sb)->s_itb_per_group; i++, bit++)
> ext3_set_bit(i, gdp_bh->b_data);
>
> (or something close to this).

OK. So here's a new version of the patch. I moved the check whether a
block needs fixing into an inline function, block_needs_fix(), and
introduced a new function fix_group() that sets the bits in
gdp_bh->b_data. Note that the patch does not address the
EXT3_FEATURE_INCOMPAT_META_BG case yet. I'll have to look at this in
more detail.

BTW: Currently e2fsck is running on the file system which suffered from
the bug this patch tries to fix. It looks like the file system is
completely busted as e2fsck is spitting out zillions of

"Too many illegal blocks"
"Inode xxxxx has compression flag set on filesystem without compression support."
"Inode xxxxx has INDEX_FL flag set but is not a directory"

Thanks
Andre


diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 063d994..cc29e5b 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -359,17 +359,6 @@ do_more:
if (!desc)
goto error_return;

- if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
- in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
- in_range (block, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group) ||
- in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group))
- ext3_error (sb, "ext3_free_blocks",
- "Freeing blocks in system zones - "
- "Block = "E3FSBLK", count = %lu",
- block, count);
-
/*
* We are about to start releasing blocks in the bitmap,
* so we need undo access.
@@ -392,7 +381,17 @@ do_more:

jbd_lock_bh_state(bitmap_bh);

- for (i = 0, group_freed = 0; i < count; i++) {
+ for (i = 0, group_freed = 0; i < count; i++, block++) {
+ struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
+ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+ in_range(block, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group)) {
+ ext3_error(sb, __FUNCTION__,
+ "Freeing block in system zone - block = %lu",
+ block);
+ continue;
+ }
/*
* An HJ special. This is expensive...
*/
@@ -400,7 +399,7 @@ do_more:
jbd_unlock_bh_state(bitmap_bh);
{
struct buffer_head *debug_bh;
- debug_bh = sb_find_get_block(sb, block + i);
+ debug_bh = sb_find_get_block(sb, block);
if (debug_bh) {
BUFFER_TRACE(debug_bh, "Deleted!");
if (!bh2jh(bitmap_bh)->b_committed_data)
@@ -452,7 +451,7 @@ do_more:
jbd_unlock_bh_state(bitmap_bh);
ext3_error(sb, __FUNCTION__,
"bit already cleared for block "E3FSBLK,
- block + i);
+ block);
jbd_lock_bh_state(bitmap_bh);
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
@@ -479,7 +478,6 @@ do_more:
*pdquot_freed_blocks += group_freed;

if (overflow && !err) {
- block += count;
count = overflow;
goto do_more;
}
@@ -1191,6 +1189,57 @@ int ext3_should_retry_alloc(struct super
return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
}

+static inline int block_needs_fix(ext3_fsblk_t block,
+ unsigned long num,
+ struct ext3_group_desc *gdp,
+ struct super_block *sb)
+{
+ if (in_range(le32_to_cpu(gdp->bg_block_bitmap), block, num))
+ return 1;
+ if (in_range(le32_to_cpu(gdp->bg_inode_bitmap), block, num))
+ return 1;
+ if (in_range(block, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group))
+ return 1;
+ if (in_range(block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group))
+ return 1;
+ return 0;
+}
+
+/*
+ *
+ * set the bits for all of the metadata blocks in the group
+ *
+ * Note: This will potentially use up some of the handle's buffer credits.
+ * Normally we have way too many credits, so that is OK. In _very_ rare cases it
+ * might not be OK. We will trigger an assertion if we run out of credits, and we
+ * will have to do a full fsck of the filesystem - better than randomly corrupting
+ * filesystem metadata.
+ */
+static void fix_group(int group, struct super_block *sb)
+{
+ int i;
+ ext3_fsblk_t bit;
+ unsigned long gdblocks;
+ struct buffer_head *gdp_bh;
+ struct ext3_group_desc *gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+
+ if (ext3_bg_has_super(sb, group))
+ ext3_set_bit(0, gdp_bh->b_data);
+ gdblocks = ext3_bg_num_gdb(sb, group);
+ for (i = 0, bit = 1; i < gdblocks; i++, bit++)
+ /* actually a bit more complex for INCOMPAT_META_BG fs */
+ ext3_set_bit(i, gdp_bh->b_data);
+ ext3_set_bit(gdp->bg_inode_bitmap % EXT3_BLOCKS_PER_GROUP(sb),
+ gdp_bh->b_data);
+ ext3_set_bit(gdp->bg_block_bitmap % EXT3_BLOCKS_PER_GROUP(sb),
+ gdp_bh->b_data);
+ for (i = 0, bit = gdp->bg_inode_table % EXT3_BLOCKS_PER_GROUP(sb);
+ i < EXT3_SB(sb)->s_itb_per_group; i++, bit++)
+ ext3_set_bit(i, gdp_bh->b_data);
+}
+
/*
* ext3_new_block uses a goal block to assist allocation. If the goal is
* free, or there is a free block within 32 blocks of the goal, that block
@@ -1260,7 +1309,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
*errp = -ENOSPC;
goto out;
}
-
+repeat:
/*
* First, test whether the goal block is free.
*/
@@ -1367,17 +1416,10 @@ allocated:

ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);

- if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
- in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
- in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group) ||
- in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group))
- ext3_error(sb, "ext3_new_block",
- "Allocating block in system zone - "
- "blocks from "E3FSBLK", length %lu",
- ret_block, num);
-
+ if (block_needs_fix(ret_block, num, gdp, sb)) {
+ fix_group(group_no, sb);
+ goto repeat;
+ }
performed_allocation = 1;

#ifdef CONFIG_JBD_DEBUG
--
The only person who always got his work done by Friday was Robinson Crusoe


Attachments:
(No filename) (6.45 kB)
signature.asc (189.00 B)
Digital signature
Download all attachments

2006-11-15 00:04:00

by Andreas Dilger

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On Oct 30, 2006 10:55 +0100, Andre Noll wrote:
> Note that the patch does not address the EXT3_FEATURE_INCOMPAT_META_BG
> case yet. I'll have to look at this in more detail.

See more below... Basically, the patch looks good enough to submit for
inclusion. Can you CC it to Andrew on the next iteration.

> +static inline int block_needs_fix(ext3_fsblk_t block,
> + unsigned long num,
> + struct ext3_group_desc *gdp,
> + struct super_block *sb)
> +{
> + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), block, num))
> + return 1;
> + if (in_range(le32_to_cpu(gdp->bg_inode_bitmap), block, num))
> + return 1;
> + if (in_range(block, le32_to_cpu(gdp->bg_inode_table),
> + EXT3_SB(sb)->s_itb_per_group))
> + return 1;
> + if (in_range(block + num - 1, le32_to_cpu(gdp->bg_inode_table),
> + EXT3_SB(sb)->s_itb_per_group))
> + return 1;

This _should_ probably also include a check for the backup group descriptors
and superblock, but unfortunately the helpers ext3_bg_has_super() and
ext3_bg_num_gdb() are fairly CPU intensive and not something you want to
check for each block that is allocated/freed. In the future (mballoc) when
we have per-group data structs we can just store a flag per group whether
it has a backup or not instead of recomputing the powers of 3, 5, 7 each time.
Maybe a comment for now to indicate that needs to be fixed in the future?


Also a minor nit - this is actually checking "num" blocks, and it might be
useful in the future so it would be clearer to call it something like
ext3_blocks_are_metadata().

> +static void fix_group(int group, struct super_block *sb)
> +{
> + int i;
> + ext3_fsblk_t bit;
> + unsigned long gdblocks;
> + struct buffer_head *gdp_bh;
> + struct ext3_group_desc *gdp = ext3_get_group_desc(sb, group, &gdp_bh);
> +
> + if (ext3_bg_has_super(sb, group))
> + ext3_set_bit(0, gdp_bh->b_data);
> + gdblocks = ext3_bg_num_gdb(sb, group);
> + for (i = 0, bit = 1; i < gdblocks; i++, bit++)
> + /* actually a bit more complex for INCOMPAT_META_BG fs */
> + ext3_set_bit(i, gdp_bh->b_data);

Actually, in newer kernels ext3_bg_num_gdb() includes INCOMPAT_META_BG
support, so that may be enough.


Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-11-15 14:31:28

by Andre Noll

[permalink] [raw]
Subject: Re: ext3: bogus i_mode errors with 2.6.18.1

On 17:03, Andreas Dilger wrote:

> On Oct 30, 2006 10:55 +0100, Andre Noll wrote:
> > Note that the patch does not address the EXT3_FEATURE_INCOMPAT_META_BG
> > case yet. I'll have to look at this in more detail.
>
> See more below... Basically, the patch looks good enough to submit for
> inclusion. Can you CC it to Andrew on the next iteration.

Andrew added to CC list. Andrew: Please consider the patch below for
inclusion in the mm tree.

> This _should_ probably also include a check for the backup group descriptors
> and superblock, but unfortunately the helpers ext3_bg_has_super() and
> ext3_bg_num_gdb() are fairly CPU intensive and not something you want to
> check for each block that is allocated/freed. In the future (mballoc) when
> we have per-group data structs we can just store a flag per group whether
> it has a backup or not instead of recomputing the powers of 3, 5, 7 each time.
> Maybe a comment for now to indicate that needs to be fixed in the future?

Done, see below.

> Also a minor nit - this is actually checking "num" blocks, and it might be
> useful in the future so it would be clearer to call it something like
> ext3_blocks_are_metadata().

Agreed, ext3_blocks_are_metadata() is more appropriate.

> > + gdblocks = ext3_bg_num_gdb(sb, group);
> > + for (i = 0, bit = 1; i < gdblocks; i++, bit++)
> > + /* actually a bit more complex for INCOMPAT_META_BG fs */
> > + ext3_set_bit(i, gdp_bh->b_data);
>
> Actually, in newer kernels ext3_bg_num_gdb() includes INCOMPAT_META_BG
> support, so that may be enough.

OK, comment removed.

Thanks again for your assistance and your patience, Andreas.
Andre
---
From: Andre Noll <[email protected]>

This is a port of an old linux-2.4 patch by Andreas Dilger for handling
on-disk corruption.

From the changelog of the 2.4-patch:

In case of on-disk corruption, the current ext2/3 code will
happily reallocate or mark free blocks in the system area
(bitmaps, inode table, etc) instead of ignoring the bad
alloc/free and leaving the system blocks alone. This will
cause further corruption of the filesystem as metadata
gets overwritten by file data and in turn causes more bad
alloc/free requests.

Unlike in the original version, the check whether a set of blocks are
metadata is moved to a new function, ext3_blocks_are_metadata(). There
is also another new function, fix_group(), which is called whenever
an on-disk corruption is detected and which performs the actual fixing.


Signed-Off-By: Andre Noll <[email protected]>
---
fs/ext3/balloc.c | 102 +++++++++++++++++++++++++++++++++++++++--------------
1 files changed, 75 insertions(+), 27 deletions(-)

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 063d994..763b7a0 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -359,17 +359,6 @@ do_more:
if (!desc)
goto error_return;

- if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
- in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
- in_range (block, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group) ||
- in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group))
- ext3_error (sb, "ext3_free_blocks",
- "Freeing blocks in system zones - "
- "Block = "E3FSBLK", count = %lu",
- block, count);
-
/*
* We are about to start releasing blocks in the bitmap,
* so we need undo access.
@@ -392,7 +381,17 @@ do_more:

jbd_lock_bh_state(bitmap_bh);

- for (i = 0, group_freed = 0; i < count; i++) {
+ for (i = 0, group_freed = 0; i < count; i++, block++) {
+ struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
+ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+ in_range(block, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group)) {
+ ext3_error(sb, __FUNCTION__,
+ "Freeing block in system zone - block = %lu",
+ block);
+ continue;
+ }
/*
* An HJ special. This is expensive...
*/
@@ -400,7 +399,7 @@ do_more:
jbd_unlock_bh_state(bitmap_bh);
{
struct buffer_head *debug_bh;
- debug_bh = sb_find_get_block(sb, block + i);
+ debug_bh = sb_find_get_block(sb, block);
if (debug_bh) {
BUFFER_TRACE(debug_bh, "Deleted!");
if (!bh2jh(bitmap_bh)->b_committed_data)
@@ -452,7 +451,7 @@ do_more:
jbd_unlock_bh_state(bitmap_bh);
ext3_error(sb, __FUNCTION__,
"bit already cleared for block "E3FSBLK,
- block + i);
+ block);
jbd_lock_bh_state(bitmap_bh);
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
@@ -479,7 +478,6 @@ do_more:
*pdquot_freed_blocks += group_freed;

if (overflow && !err) {
- block += count;
count = overflow;
goto do_more;
}
@@ -1192,6 +1190,63 @@ int ext3_should_retry_alloc(struct super
}

/*
+ * Check if given blocks are metadata blocks.
+ *
+ * We should also check the backup group descriptors and the superblock,
+ * but it is too expensive to do so for each allocated/freed block. So,
+ * let's defer that check until we have per-group data structs.
+ */
+static inline int ext3_blocks_are_metadata(ext3_fsblk_t block,
+ unsigned long num,
+ struct ext3_group_desc *gdp,
+ struct super_block *sb)
+{
+ if (in_range(le32_to_cpu(gdp->bg_block_bitmap), block, num))
+ return 1;
+ if (in_range(le32_to_cpu(gdp->bg_inode_bitmap), block, num))
+ return 1;
+ if (in_range(block, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group))
+ return 1;
+ if (in_range(block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group))
+ return 1;
+ return 0;
+}
+
+/*
+ *
+ * set the bits for all of the metadata blocks in the group
+ *
+ * Note: This will potentially use up some of the handle's buffer credits.
+ * Normally we have way too many credits, so that is OK. In _very_ rare cases it
+ * might not be OK. We will trigger an assertion if we run out of credits, and we
+ * will have to do a full fsck of the filesystem - better than randomly corrupting
+ * filesystem metadata.
+ */
+static void fix_group(int group, struct super_block *sb)
+{
+ int i;
+ ext3_fsblk_t bit;
+ unsigned long gdblocks;
+ struct buffer_head *gdp_bh;
+ struct ext3_group_desc *gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+
+ if (ext3_bg_has_super(sb, group))
+ ext3_set_bit(0, gdp_bh->b_data);
+ gdblocks = ext3_bg_num_gdb(sb, group);
+ for (i = 0, bit = 1; i < gdblocks; i++, bit++)
+ ext3_set_bit(i, gdp_bh->b_data);
+ ext3_set_bit(gdp->bg_inode_bitmap % EXT3_BLOCKS_PER_GROUP(sb),
+ gdp_bh->b_data);
+ ext3_set_bit(gdp->bg_block_bitmap % EXT3_BLOCKS_PER_GROUP(sb),
+ gdp_bh->b_data);
+ for (i = 0, bit = gdp->bg_inode_table % EXT3_BLOCKS_PER_GROUP(sb);
+ i < EXT3_SB(sb)->s_itb_per_group; i++, bit++)
+ ext3_set_bit(i, gdp_bh->b_data);
+}
+
+/*
* ext3_new_block uses a goal block to assist allocation. If the goal is
* free, or there is a free block within 32 blocks of the goal, that block
* is allocated. Otherwise a forward search is made for a free block; within
@@ -1260,7 +1315,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
*errp = -ENOSPC;
goto out;
}
-
+repeat:
/*
* First, test whether the goal block is free.
*/
@@ -1367,17 +1422,10 @@ allocated:

ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);

- if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
- in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
- in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group) ||
- in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group))
- ext3_error(sb, "ext3_new_block",
- "Allocating block in system zone - "
- "blocks from "E3FSBLK", length %lu",
- ret_block, num);
-
+ if (ext3_blocks_are_metadata(ret_block, num, gdp, sb)) {
+ fix_group(group_no, sb);
+ goto repeat;
+ }
performed_allocation = 1;

#ifdef CONFIG_JBD_DEBUG

--
The only person who always got his work done by Friday was Robinson Crusoe


Attachments:
(No filename) (7.90 kB)
signature.asc (189.00 B)
Digital signature
Download all attachments