2012-05-22 01:42:18

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH] e2fsck: fix 64-bit journal support


I just found what a somewhat disturbing bug which is still present in
1.42.3 --- namely, that the journal replay code in e2fsck was not
properly handling 64-bit file systems correctly, by dropping the high
bits of the block number from the jbd2 descriptor blocks.

I have not had a chance to fully test to make sure we don't have other
bugs hiding in the 64-bit code, but it's clear no one else has had a lot
of time to test it either -- at least not to the extent of doing power
fail testing of > 16TB file systems! :-(

- Ted

>From 3b693d0b03569795d04920a04a0a21e5f64ffedc Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <[email protected]>
Date: Mon, 21 May 2012 21:30:45 -0400
Subject: [PATCH] e2fsck: fix 64-bit journal support

64-bit journal support was broken; we weren't using the high bits from
the journal descriptor blocks! We were also using "unsigned long" for
the journal block numbers, which would be a problem on 32-bit systems.

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
e2fsck/jfs_user.h | 4 ++--
e2fsck/journal.c | 33 +++++++++++++++++----------------
e2fsck/recovery.c | 25 ++++++++++++-------------
3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/e2fsck/jfs_user.h b/e2fsck/jfs_user.h
index 9e33306..92f8ae2 100644
--- a/e2fsck/jfs_user.h
+++ b/e2fsck/jfs_user.h
@@ -18,7 +18,7 @@ struct buffer_head {
e2fsck_t b_ctx;
io_channel b_io;
int b_size;
- blk_t b_blocknr;
+ unsigned long long b_blocknr;
int b_dirty;
int b_uptodate;
int b_err;
@@ -121,7 +121,7 @@ _INLINE_ size_t journal_tag_bytes(journal_t *journal)
/*
* Kernel compatibility functions are defined in journal.c
*/
-int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys);
+int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys);
struct buffer_head *getblk(kdev_t ctx, blk64_t blocknr, int blocksize);
void sync_blockdev(kdev_t kdev);
void ll_rw_block(int rw, int dummy, struct buffer_head *bh[]);
diff --git a/e2fsck/journal.c b/e2fsck/journal.c
index 915b8bb..bada028 100644
--- a/e2fsck/journal.c
+++ b/e2fsck/journal.c
@@ -44,7 +44,7 @@ static int bh_count = 0;
* to use the recovery.c file virtually unchanged from the kernel, so we
* don't have to do much to keep kernel and user recovery in sync.
*/
-int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys)
+int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys)
{
#ifdef USE_INODE_IO
*phys = block;
@@ -80,8 +80,8 @@ struct buffer_head *getblk(kdev_t kdev, blk64_t blocknr, int blocksize)
if (journal_enable_debug >= 3)
bh_count++;
#endif
- jfs_debug(4, "getblk for block %lu (%d bytes)(total %d)\n",
- (unsigned long) blocknr, blocksize, bh_count);
+ jfs_debug(4, "getblk for block %llu (%d bytes)(total %d)\n",
+ (unsigned long long) blocknr, blocksize, bh_count);

bh->b_ctx = kdev->k_ctx;
if (kdev->k_dev == K_DEV_FS)
@@ -114,38 +114,39 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhp[])
for (; nr > 0; --nr) {
bh = *bhp++;
if (rw == READ && !bh->b_uptodate) {
- jfs_debug(3, "reading block %lu/%p\n",
- (unsigned long) bh->b_blocknr, (void *) bh);
+ jfs_debug(3, "reading block %llu/%p\n",
+ bh->b_blocknr, (void *) bh);
retval = io_channel_read_blk64(bh->b_io,
bh->b_blocknr,
1, bh->b_data);
if (retval) {
com_err(bh->b_ctx->device_name, retval,
- "while reading block %lu\n",
- (unsigned long) bh->b_blocknr);
+ "while reading block %llu\n",
+ bh->b_blocknr);
bh->b_err = retval;
continue;
}
bh->b_uptodate = 1;
} else if (rw == WRITE && bh->b_dirty) {
- jfs_debug(3, "writing block %lu/%p\n",
- (unsigned long) bh->b_blocknr, (void *) bh);
+ jfs_debug(3, "writing block %llu/%p\n",
+ bh->b_blocknr,
+ (void *) bh);
retval = io_channel_write_blk64(bh->b_io,
bh->b_blocknr,
1, bh->b_data);
if (retval) {
com_err(bh->b_ctx->device_name, retval,
- "while writing block %lu\n",
- (unsigned long) bh->b_blocknr);
+ "while writing block %llu\n",
+ bh->b_blocknr);
bh->b_err = retval;
continue;
}
bh->b_dirty = 0;
bh->b_uptodate = 1;
} else {
- jfs_debug(3, "no-op %s for block %lu\n",
+ jfs_debug(3, "no-op %s for block %llu\n",
rw == READ ? "read" : "write",
- (unsigned long) bh->b_blocknr);
+ bh->b_blocknr);
}
}
}
@@ -164,8 +165,8 @@ void brelse(struct buffer_head *bh)
{
if (bh->b_dirty)
ll_rw_block(WRITE, 1, &bh);
- jfs_debug(3, "freeing block %lu/%p (total %d)\n",
- (unsigned long) bh->b_blocknr, (void *) bh, --bh_count);
+ jfs_debug(3, "freeing block %llu/%p (total %d)\n",
+ bh->b_blocknr, (void *) bh, --bh_count);
ext2fs_free_mem(&bh);
}

@@ -237,7 +238,7 @@ static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
journal_t *journal = NULL;
errcode_t retval = 0;
io_manager io_ptr = 0;
- unsigned long start = 0;
+ unsigned long long start = 0;
int ext_journal = 0;
int tried_backup_jnl = 0;

diff --git a/e2fsck/recovery.c b/e2fsck/recovery.c
index b669941..e94ef4e 100644
--- a/e2fsck/recovery.c
+++ b/e2fsck/recovery.c
@@ -71,7 +71,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
- unsigned long blocknr;
+ unsigned long long blocknr;
struct buffer_head *bh;

struct buffer_head * bufs[MAXBUF];
@@ -133,7 +133,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
- unsigned long blocknr;
+ unsigned long long blocknr;
struct buffer_head *bh;

*bhp = NULL;
@@ -309,7 +309,6 @@ int journal_skip_recovery(journal_t *journal)
return err;
}

-#if 0
static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
@@ -317,17 +316,16 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
-#endif

/*
* calc_chksums calculates the checksums for the blocks described in the
* descriptor block.
*/
static int calc_chksums(journal_t *journal, struct buffer_head *bh,
- unsigned long *next_log_block, __u32 *crc32_sum)
+ unsigned long long *next_log_block, __u32 *crc32_sum)
{
int i, num_blks, err;
- unsigned long io_block;
+ unsigned long long io_block;
struct buffer_head *obh;

num_blks = count_tags(journal, bh);
@@ -340,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
err = jread(&obh, journal, io_block);
if (err) {
printk(KERN_ERR "JBD: IO error %d recovering block "
- "%lu in log\n", err, io_block);
+ "%llu in log\n", err, io_block);
return 1;
} else {
*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
@@ -355,7 +353,7 @@ static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
- unsigned long next_log_block;
+ unsigned long long next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
@@ -485,7 +483,7 @@ static int do_one_pass(journal_t *journal,
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes)
<= journal->j_blocksize) {
- unsigned long io_block;
+ unsigned long long io_block;

tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
@@ -499,13 +497,14 @@ static int do_one_pass(journal_t *journal,
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
- "block %ld in log\n",
+ "block %llu in log\n",
err, io_block);
} else {
- unsigned long blocknr;
+ unsigned long long blocknr;

J_ASSERT(obh != NULL);
- blocknr = be32_to_cpu(tag->t_blocknr);
+ blocknr = read_tag_block(tag_bytes,
+ tag);

/* If the block has been
* revoked, then we're all done
@@ -733,7 +732,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
record_len = 8;

while (offset < max) {
- unsigned long blocknr;
+ unsigned long long blocknr;
int err;

if (record_len == 4)
--
1.7.10.rc3



2012-05-22 15:25:02

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: fix 64-bit journal support

On 5/21/12 8:42 PM, Theodore Ts'o wrote:
> I just found what a somewhat disturbing bug which is still present in
> 1.42.3 --- namely, that the journal replay code in e2fsck was not
> properly handling 64-bit file systems correctly, by dropping the high
> bits of the block number from the jbd2 descriptor blocks.
>
> I have not had a chance to fully test to make sure we don't have other
> bugs hiding in the 64-bit code, but it's clear no one else has had a lot
> of time to test it either -- at least not to the extent of doing power
> fail testing of > 16TB file systems! :-(

Whoops. :(

How do you decide between using "unsigned long long" and "blk64_t" below?

If they'd all been blk_t's they'd have been easier to spot, if anyone went
looking, I suppose.

-Eric

> - Ted
>
> From 3b693d0b03569795d04920a04a0a21e5f64ffedc Mon Sep 17 00:00:00 2001
> From: Theodore Ts'o <[email protected]>
> Date: Mon, 21 May 2012 21:30:45 -0400
> Subject: [PATCH] e2fsck: fix 64-bit journal support
>
> 64-bit journal support was broken; we weren't using the high bits from
> the journal descriptor blocks! We were also using "unsigned long" for
> the journal block numbers, which would be a problem on 32-bit systems.
>
> Signed-off-by: "Theodore Ts'o" <[email protected]>
> ---
> e2fsck/jfs_user.h | 4 ++--
> e2fsck/journal.c | 33 +++++++++++++++++----------------
> e2fsck/recovery.c | 25 ++++++++++++-------------
> 3 files changed, 31 insertions(+), 31 deletions(-)
>
> diff --git a/e2fsck/jfs_user.h b/e2fsck/jfs_user.h
> index 9e33306..92f8ae2 100644
> --- a/e2fsck/jfs_user.h
> +++ b/e2fsck/jfs_user.h
> @@ -18,7 +18,7 @@ struct buffer_head {
> e2fsck_t b_ctx;
> io_channel b_io;
> int b_size;
> - blk_t b_blocknr;
> + unsigned long long b_blocknr;
> int b_dirty;
> int b_uptodate;
> int b_err;
> @@ -121,7 +121,7 @@ _INLINE_ size_t journal_tag_bytes(journal_t *journal)
> /*
> * Kernel compatibility functions are defined in journal.c
> */
> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys);
> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys);
> struct buffer_head *getblk(kdev_t ctx, blk64_t blocknr, int blocksize);
> void sync_blockdev(kdev_t kdev);
> void ll_rw_block(int rw, int dummy, struct buffer_head *bh[]);
> diff --git a/e2fsck/journal.c b/e2fsck/journal.c
> index 915b8bb..bada028 100644
> --- a/e2fsck/journal.c
> +++ b/e2fsck/journal.c
> @@ -44,7 +44,7 @@ static int bh_count = 0;
> * to use the recovery.c file virtually unchanged from the kernel, so we
> * don't have to do much to keep kernel and user recovery in sync.
> */
> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys)
> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys)
> {
> #ifdef USE_INODE_IO
> *phys = block;
> @@ -80,8 +80,8 @@ struct buffer_head *getblk(kdev_t kdev, blk64_t blocknr, int blocksize)
> if (journal_enable_debug >= 3)
> bh_count++;
> #endif
> - jfs_debug(4, "getblk for block %lu (%d bytes)(total %d)\n",
> - (unsigned long) blocknr, blocksize, bh_count);
> + jfs_debug(4, "getblk for block %llu (%d bytes)(total %d)\n",
> + (unsigned long long) blocknr, blocksize, bh_count);
>
> bh->b_ctx = kdev->k_ctx;
> if (kdev->k_dev == K_DEV_FS)
> @@ -114,38 +114,39 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhp[])
> for (; nr > 0; --nr) {
> bh = *bhp++;
> if (rw == READ && !bh->b_uptodate) {
> - jfs_debug(3, "reading block %lu/%p\n",
> - (unsigned long) bh->b_blocknr, (void *) bh);
> + jfs_debug(3, "reading block %llu/%p\n",
> + bh->b_blocknr, (void *) bh);
> retval = io_channel_read_blk64(bh->b_io,
> bh->b_blocknr,
> 1, bh->b_data);
> if (retval) {
> com_err(bh->b_ctx->device_name, retval,
> - "while reading block %lu\n",
> - (unsigned long) bh->b_blocknr);
> + "while reading block %llu\n",
> + bh->b_blocknr);
> bh->b_err = retval;
> continue;
> }
> bh->b_uptodate = 1;
> } else if (rw == WRITE && bh->b_dirty) {
> - jfs_debug(3, "writing block %lu/%p\n",
> - (unsigned long) bh->b_blocknr, (void *) bh);
> + jfs_debug(3, "writing block %llu/%p\n",
> + bh->b_blocknr,
> + (void *) bh);
> retval = io_channel_write_blk64(bh->b_io,
> bh->b_blocknr,
> 1, bh->b_data);
> if (retval) {
> com_err(bh->b_ctx->device_name, retval,
> - "while writing block %lu\n",
> - (unsigned long) bh->b_blocknr);
> + "while writing block %llu\n",
> + bh->b_blocknr);
> bh->b_err = retval;
> continue;
> }
> bh->b_dirty = 0;
> bh->b_uptodate = 1;
> } else {
> - jfs_debug(3, "no-op %s for block %lu\n",
> + jfs_debug(3, "no-op %s for block %llu\n",
> rw == READ ? "read" : "write",
> - (unsigned long) bh->b_blocknr);
> + bh->b_blocknr);
> }
> }
> }
> @@ -164,8 +165,8 @@ void brelse(struct buffer_head *bh)
> {
> if (bh->b_dirty)
> ll_rw_block(WRITE, 1, &bh);
> - jfs_debug(3, "freeing block %lu/%p (total %d)\n",
> - (unsigned long) bh->b_blocknr, (void *) bh, --bh_count);
> + jfs_debug(3, "freeing block %llu/%p (total %d)\n",
> + bh->b_blocknr, (void *) bh, --bh_count);
> ext2fs_free_mem(&bh);
> }
>
> @@ -237,7 +238,7 @@ static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
> journal_t *journal = NULL;
> errcode_t retval = 0;
> io_manager io_ptr = 0;
> - unsigned long start = 0;
> + unsigned long long start = 0;
> int ext_journal = 0;
> int tried_backup_jnl = 0;
>
> diff --git a/e2fsck/recovery.c b/e2fsck/recovery.c
> index b669941..e94ef4e 100644
> --- a/e2fsck/recovery.c
> +++ b/e2fsck/recovery.c
> @@ -71,7 +71,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
> {
> int err;
> unsigned int max, nbufs, next;
> - unsigned long blocknr;
> + unsigned long long blocknr;
> struct buffer_head *bh;
>
> struct buffer_head * bufs[MAXBUF];
> @@ -133,7 +133,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
> unsigned int offset)
> {
> int err;
> - unsigned long blocknr;
> + unsigned long long blocknr;
> struct buffer_head *bh;
>
> *bhp = NULL;
> @@ -309,7 +309,6 @@ int journal_skip_recovery(journal_t *journal)
> return err;
> }
>
> -#if 0
> static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
> {
> unsigned long long block = be32_to_cpu(tag->t_blocknr);
> @@ -317,17 +316,16 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
> block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32;
> return block;
> }
> -#endif
>
> /*
> * calc_chksums calculates the checksums for the blocks described in the
> * descriptor block.
> */
> static int calc_chksums(journal_t *journal, struct buffer_head *bh,
> - unsigned long *next_log_block, __u32 *crc32_sum)
> + unsigned long long *next_log_block, __u32 *crc32_sum)
> {
> int i, num_blks, err;
> - unsigned long io_block;
> + unsigned long long io_block;
> struct buffer_head *obh;
>
> num_blks = count_tags(journal, bh);
> @@ -340,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
> err = jread(&obh, journal, io_block);
> if (err) {
> printk(KERN_ERR "JBD: IO error %d recovering block "
> - "%lu in log\n", err, io_block);
> + "%llu in log\n", err, io_block);
> return 1;
> } else {
> *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
> @@ -355,7 +353,7 @@ static int do_one_pass(journal_t *journal,
> struct recovery_info *info, enum passtype pass)
> {
> unsigned int first_commit_ID, next_commit_ID;
> - unsigned long next_log_block;
> + unsigned long long next_log_block;
> int err, success = 0;
> journal_superblock_t * sb;
> journal_header_t * tmp;
> @@ -485,7 +483,7 @@ static int do_one_pass(journal_t *journal,
> tagp = &bh->b_data[sizeof(journal_header_t)];
> while ((tagp - bh->b_data + tag_bytes)
> <= journal->j_blocksize) {
> - unsigned long io_block;
> + unsigned long long io_block;
>
> tag = (journal_block_tag_t *) tagp;
> flags = be32_to_cpu(tag->t_flags);
> @@ -499,13 +497,14 @@ static int do_one_pass(journal_t *journal,
> success = err;
> printk (KERN_ERR
> "JBD: IO error %d recovering "
> - "block %ld in log\n",
> + "block %llu in log\n",
> err, io_block);
> } else {
> - unsigned long blocknr;
> + unsigned long long blocknr;
>
> J_ASSERT(obh != NULL);
> - blocknr = be32_to_cpu(tag->t_blocknr);
> + blocknr = read_tag_block(tag_bytes,
> + tag);
>
> /* If the block has been
> * revoked, then we're all done
> @@ -733,7 +732,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
> record_len = 8;
>
> while (offset < max) {
> - unsigned long blocknr;
> + unsigned long long blocknr;
> int err;
>
> if (record_len == 4)


2012-05-22 17:01:12

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: fix 64-bit journal support

On Tue, May 22, 2012 at 10:24:59AM -0500, Eric Sandeen wrote:
>
> How do you decide between using "unsigned long long" and "blk64_t" below?
>

Consistency more than anything else. At least at one point
e2fsck/recovery.c was supposed to be identical to the kernel's
fs/jbd2/recovery.c, and so that meant we tried to use avoid using both
kernel-specific idioms that would be hard to replicate in userspace
(but we do quite a bit of that; see e2fsck/jfs_user.h) as well as
userspace-specific idioms/typedefs that don't work well in the kernel
context. Over time the two have drifted apart, and a good future
project would be to try to bring them closer together so that bugs in
one are more likely to be correctly fixed in the other.

The other headache is with %llu in printf strings. One of the
problems with using blk64_t is that we don't know whether it's an
unsigned int or an unsigned long or an unsigned long long, and that
causes all sorts of warnings with respect to printf format strings.
We *can* fix it via a cast to an unsigned long long and using %llu,
but it makes the code quite ugly.

So it's a bit simpler to use %llu and just use unsigned long long,
since we don't support any platform where long long is 128 bits, and
on pretty much all Linux systems, long long is pretty consistently 64
bits. I wouldn't do this on anything that was an on-disk
representation, of course, but in this case I figured it was easier
just to use an "unsigned long long" in the struct buffer_head in
jfs_user.h, and then I made everything consistent with that.

- Ted

2012-05-22 17:09:26

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: fix 64-bit journal support

On 2012-05-22, at 9:24 AM, Eric Sandeen wrote:
> On 5/21/12 8:42 PM, Theodore Ts'o wrote:
>> I just found what a somewhat disturbing bug which is still present in
>> 1.42.3 --- namely, that the journal replay code in e2fsck was not
>> properly handling 64-bit file systems correctly, by dropping the high
>> bits of the block number from the jbd2 descriptor blocks.

Hmm, we did have a bug report that would be explained by this, but it wasn't easily reproduced (dependent on metadata blocks beyond 16TB in
hindsight) and only on a test filesystem, so I hadn't had a chance to
dig into it yet.

We did do a 128TB full-filesystem data write + verification + e2fsck,
but I guess that didn't validate journal recovery. Most filesystems
only have a fraction of data beyond 16TB, and the allocators prefer
to locate inodes/blocks at the beginning of the filesystem, so it
hasn't shown up in real usage very much.

Good catch.

>> I have not had a chance to fully test to make sure we don't have other
>> bugs hiding in the 64-bit code, but it's clear no one else has had a
>> lot of time to test it either -- at least not to the extent of doing
>> power fail testing of > 16TB file systems! :-(
>
> Whoops. :(
>
> How do you decide between using "unsigned long long" and "blk64_t" below?
>
> If they'd all been blk_t's they'd have been easier to spot, if anyone went looking, I suppose.

Makes sense. Better to be consistent with these things, so that the
compiler and users (grep, etc) can find type mismatches and such.

>> - Ted
>>
>> From 3b693d0b03569795d04920a04a0a21e5f64ffedc Mon Sep 17 00:00:00 2001
>> From: Theodore Ts'o <[email protected]>
>> Date: Mon, 21 May 2012 21:30:45 -0400
>> Subject: [PATCH] e2fsck: fix 64-bit journal support
>>
>> 64-bit journal support was broken; we weren't using the high bits from
>> the journal descriptor blocks! We were also using "unsigned long" for
>> the journal block numbers, which would be a problem on 32-bit systems.
>>
>> Signed-off-by: "Theodore Ts'o" <[email protected]>
>> ---
>> e2fsck/jfs_user.h | 4 ++--
>> e2fsck/journal.c | 33 +++++++++++++++++----------------
>> e2fsck/recovery.c | 25 ++++++++++++-------------
>> 3 files changed, 31 insertions(+), 31 deletions(-)
>>
>> diff --git a/e2fsck/jfs_user.h b/e2fsck/jfs_user.h
>> index 9e33306..92f8ae2 100644
>> --- a/e2fsck/jfs_user.h
>> +++ b/e2fsck/jfs_user.h
>> @@ -18,7 +18,7 @@ struct buffer_head {
>> e2fsck_t b_ctx;
>> io_channel b_io;
>> int b_size;
>> - blk_t b_blocknr;
>> + unsigned long long b_blocknr;
>> int b_dirty;
>> int b_uptodate;
>> int b_err;
>> @@ -121,7 +121,7 @@ _INLINE_ size_t journal_tag_bytes(journal_t *journal)
>> /*
>> * Kernel compatibility functions are defined in journal.c
>> */
>> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys);
>> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys);
>> struct buffer_head *getblk(kdev_t ctx, blk64_t blocknr, int blocksize);
>> void sync_blockdev(kdev_t kdev);
>> void ll_rw_block(int rw, int dummy, struct buffer_head *bh[]);
>> diff --git a/e2fsck/journal.c b/e2fsck/journal.c
>> index 915b8bb..bada028 100644
>> --- a/e2fsck/journal.c
>> +++ b/e2fsck/journal.c
>> @@ -44,7 +44,7 @@ static int bh_count = 0;
>> * to use the recovery.c file virtually unchanged from the kernel, so we
>> * don't have to do much to keep kernel and user recovery in sync.
>> */
>> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys)
>> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys)
>> {
>> #ifdef USE_INODE_IO
>> *phys = block;
>> @@ -80,8 +80,8 @@ struct buffer_head *getblk(kdev_t kdev, blk64_t blocknr, int blocksize)
>> if (journal_enable_debug >= 3)
>> bh_count++;
>> #endif
>> - jfs_debug(4, "getblk for block %lu (%d bytes)(total %d)\n",
>> - (unsigned long) blocknr, blocksize, bh_count);
>> + jfs_debug(4, "getblk for block %llu (%d bytes)(total %d)\n",
>> + (unsigned long long) blocknr, blocksize, bh_count);
>>
>> bh->b_ctx = kdev->k_ctx;
>> if (kdev->k_dev == K_DEV_FS)
>> @@ -114,38 +114,39 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhp[])
>> for (; nr > 0; --nr) {
>> bh = *bhp++;
>> if (rw == READ && !bh->b_uptodate) {
>> - jfs_debug(3, "reading block %lu/%p\n",
>> - (unsigned long) bh->b_blocknr, (void *) bh);
>> + jfs_debug(3, "reading block %llu/%p\n",
>> + bh->b_blocknr, (void *) bh);
>> retval = io_channel_read_blk64(bh->b_io,
>> bh->b_blocknr,
>> 1, bh->b_data);
>> if (retval) {
>> com_err(bh->b_ctx->device_name, retval,
>> - "while reading block %lu\n",
>> - (unsigned long) bh->b_blocknr);
>> + "while reading block %llu\n",
>> + bh->b_blocknr);
>> bh->b_err = retval;
>> continue;
>> }
>> bh->b_uptodate = 1;
>> } else if (rw == WRITE && bh->b_dirty) {
>> - jfs_debug(3, "writing block %lu/%p\n",
>> - (unsigned long) bh->b_blocknr, (void *) bh);
>> + jfs_debug(3, "writing block %llu/%p\n",
>> + bh->b_blocknr,
>> + (void *) bh);
>> retval = io_channel_write_blk64(bh->b_io,
>> bh->b_blocknr,
>> 1, bh->b_data);
>> if (retval) {
>> com_err(bh->b_ctx->device_name, retval,
>> - "while writing block %lu\n",
>> - (unsigned long) bh->b_blocknr);
>> + "while writing block %llu\n",
>> + bh->b_blocknr);
>> bh->b_err = retval;
>> continue;
>> }
>> bh->b_dirty = 0;
>> bh->b_uptodate = 1;
>> } else {
>> - jfs_debug(3, "no-op %s for block %lu\n",
>> + jfs_debug(3, "no-op %s for block %llu\n",
>> rw == READ ? "read" : "write",
>> - (unsigned long) bh->b_blocknr);
>> + bh->b_blocknr);
>> }
>> }
>> }
>> @@ -164,8 +165,8 @@ void brelse(struct buffer_head *bh)
>> {
>> if (bh->b_dirty)
>> ll_rw_block(WRITE, 1, &bh);
>> - jfs_debug(3, "freeing block %lu/%p (total %d)\n",
>> - (unsigned long) bh->b_blocknr, (void *) bh, --bh_count);
>> + jfs_debug(3, "freeing block %llu/%p (total %d)\n",
>> + bh->b_blocknr, (void *) bh, --bh_count);
>> ext2fs_free_mem(&bh);
>> }
>>
>> @@ -237,7 +238,7 @@ static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
>> journal_t *journal = NULL;
>> errcode_t retval = 0;
>> io_manager io_ptr = 0;
>> - unsigned long start = 0;
>> + unsigned long long start = 0;
>> int ext_journal = 0;
>> int tried_backup_jnl = 0;
>>
>> diff --git a/e2fsck/recovery.c b/e2fsck/recovery.c
>> index b669941..e94ef4e 100644
>> --- a/e2fsck/recovery.c
>> +++ b/e2fsck/recovery.c
>> @@ -71,7 +71,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
>> {
>> int err;
>> unsigned int max, nbufs, next;
>> - unsigned long blocknr;
>> + unsigned long long blocknr;
>> struct buffer_head *bh;
>>
>> struct buffer_head * bufs[MAXBUF];
>> @@ -133,7 +133,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
>> unsigned int offset)
>> {
>> int err;
>> - unsigned long blocknr;
>> + unsigned long long blocknr;
>> struct buffer_head *bh;
>>
>> *bhp = NULL;
>> @@ -309,7 +309,6 @@ int journal_skip_recovery(journal_t *journal)
>> return err;
>> }
>>
>> -#if 0
>> static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
>> {
>> unsigned long long block = be32_to_cpu(tag->t_blocknr);
>> @@ -317,17 +316,16 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
>> block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32;
>> return block;
>> }
>> -#endif
>>
>> /*
>> * calc_chksums calculates the checksums for the blocks described in the
>> * descriptor block.
>> */
>> static int calc_chksums(journal_t *journal, struct buffer_head *bh,
>> - unsigned long *next_log_block, __u32 *crc32_sum)
>> + unsigned long long *next_log_block, __u32 *crc32_sum)
>> {
>> int i, num_blks, err;
>> - unsigned long io_block;
>> + unsigned long long io_block;
>> struct buffer_head *obh;
>>
>> num_blks = count_tags(journal, bh);
>> @@ -340,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
>> err = jread(&obh, journal, io_block);
>> if (err) {
>> printk(KERN_ERR "JBD: IO error %d recovering block "
>> - "%lu in log\n", err, io_block);
>> + "%llu in log\n", err, io_block);
>> return 1;
>> } else {
>> *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
>> @@ -355,7 +353,7 @@ static int do_one_pass(journal_t *journal,
>> struct recovery_info *info, enum passtype pass)
>> {
>> unsigned int first_commit_ID, next_commit_ID;
>> - unsigned long next_log_block;
>> + unsigned long long next_log_block;
>> int err, success = 0;
>> journal_superblock_t * sb;
>> journal_header_t * tmp;
>> @@ -485,7 +483,7 @@ static int do_one_pass(journal_t *journal,
>> tagp = &bh->b_data[sizeof(journal_header_t)];
>> while ((tagp - bh->b_data + tag_bytes)
>> <= journal->j_blocksize) {
>> - unsigned long io_block;
>> + unsigned long long io_block;
>>
>> tag = (journal_block_tag_t *) tagp;
>> flags = be32_to_cpu(tag->t_flags);
>> @@ -499,13 +497,14 @@ static int do_one_pass(journal_t *journal,
>> success = err;
>> printk (KERN_ERR
>> "JBD: IO error %d recovering "
>> - "block %ld in log\n",
>> + "block %llu in log\n",
>> err, io_block);
>> } else {
>> - unsigned long blocknr;
>> + unsigned long long blocknr;
>>
>> J_ASSERT(obh != NULL);
>> - blocknr = be32_to_cpu(tag->t_blocknr);
>> + blocknr = read_tag_block(tag_bytes,
>> + tag);
>>
>> /* If the block has been
>> * revoked, then we're all done
>> @@ -733,7 +732,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
>> record_len = 8;
>>
>> while (offset < max) {
>> - unsigned long blocknr;
>> + unsigned long long blocknr;
>> int err;
>>
>> if (record_len == 4)
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html


Cheers, Andreas






2012-05-24 01:00:40

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsck: fix 64-bit journal support

Ted, were you going to push this patch? I didn't see it in the maint branch where the other post-1.42.3 patches were.

Cheers, Andreas

On 2012-05-21, at 19:42, "Theodore Ts'o" <[email protected]> wrote:

>
> I just found what a somewhat disturbing bug which is still present in
> 1.42.3 --- namely, that the journal replay code in e2fsck was not
> properly handling 64-bit file systems correctly, by dropping the high
> bits of the block number from the jbd2 descriptor blocks.
>
> I have not had a chance to fully test to make sure we don't have other
> bugs hiding in the 64-bit code, but it's clear no one else has had a lot
> of time to test it either -- at least not to the extent of doing power
> fail testing of > 16TB file systems! :-(
>
> - Ted
>
> From 3b693d0b03569795d04920a04a0a21e5f64ffedc Mon Sep 17 00:00:00 2001
> From: Theodore Ts'o <[email protected]>
> Date: Mon, 21 May 2012 21:30:45 -0400
> Subject: [PATCH] e2fsck: fix 64-bit journal support
>
> 64-bit journal support was broken; we weren't using the high bits from
> the journal descriptor blocks! We were also using "unsigned long" for
> the journal block numbers, which would be a problem on 32-bit systems.
>
> Signed-off-by: "Theodore Ts'o" <[email protected]>
> ---
> e2fsck/jfs_user.h | 4 ++--
> e2fsck/journal.c | 33 +++++++++++++++++----------------
> e2fsck/recovery.c | 25 ++++++++++++-------------
> 3 files changed, 31 insertions(+), 31 deletions(-)
>
> diff --git a/e2fsck/jfs_user.h b/e2fsck/jfs_user.h
> index 9e33306..92f8ae2 100644
> --- a/e2fsck/jfs_user.h
> +++ b/e2fsck/jfs_user.h
> @@ -18,7 +18,7 @@ struct buffer_head {
> e2fsck_t b_ctx;
> io_channel b_io;
> int b_size;
> - blk_t b_blocknr;
> + unsigned long long b_blocknr;
> int b_dirty;
> int b_uptodate;
> int b_err;
> @@ -121,7 +121,7 @@ _INLINE_ size_t journal_tag_bytes(journal_t *journal)
> /*
> * Kernel compatibility functions are defined in journal.c
> */
> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys);
> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys);
> struct buffer_head *getblk(kdev_t ctx, blk64_t blocknr, int blocksize);
> void sync_blockdev(kdev_t kdev);
> void ll_rw_block(int rw, int dummy, struct buffer_head *bh[]);
> diff --git a/e2fsck/journal.c b/e2fsck/journal.c
> index 915b8bb..bada028 100644
> --- a/e2fsck/journal.c
> +++ b/e2fsck/journal.c
> @@ -44,7 +44,7 @@ static int bh_count = 0;
> * to use the recovery.c file virtually unchanged from the kernel, so we
> * don't have to do much to keep kernel and user recovery in sync.
> */
> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys)
> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys)
> {
> #ifdef USE_INODE_IO
> *phys = block;
> @@ -80,8 +80,8 @@ struct buffer_head *getblk(kdev_t kdev, blk64_t blocknr, int blocksize)
> if (journal_enable_debug >= 3)
> bh_count++;
> #endif
> - jfs_debug(4, "getblk for block %lu (%d bytes)(total %d)\n",
> - (unsigned long) blocknr, blocksize, bh_count);
> + jfs_debug(4, "getblk for block %llu (%d bytes)(total %d)\n",
> + (unsigned long long) blocknr, blocksize, bh_count);
>
> bh->b_ctx = kdev->k_ctx;
> if (kdev->k_dev == K_DEV_FS)
> @@ -114,38 +114,39 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhp[])
> for (; nr > 0; --nr) {
> bh = *bhp++;
> if (rw == READ && !bh->b_uptodate) {
> - jfs_debug(3, "reading block %lu/%p\n",
> - (unsigned long) bh->b_blocknr, (void *) bh);
> + jfs_debug(3, "reading block %llu/%p\n",
> + bh->b_blocknr, (void *) bh);
> retval = io_channel_read_blk64(bh->b_io,
> bh->b_blocknr,
> 1, bh->b_data);
> if (retval) {
> com_err(bh->b_ctx->device_name, retval,
> - "while reading block %lu\n",
> - (unsigned long) bh->b_blocknr);
> + "while reading block %llu\n",
> + bh->b_blocknr);
> bh->b_err = retval;
> continue;
> }
> bh->b_uptodate = 1;
> } else if (rw == WRITE && bh->b_dirty) {
> - jfs_debug(3, "writing block %lu/%p\n",
> - (unsigned long) bh->b_blocknr, (void *) bh);
> + jfs_debug(3, "writing block %llu/%p\n",
> + bh->b_blocknr,
> + (void *) bh);
> retval = io_channel_write_blk64(bh->b_io,
> bh->b_blocknr,
> 1, bh->b_data);
> if (retval) {
> com_err(bh->b_ctx->device_name, retval,
> - "while writing block %lu\n",
> - (unsigned long) bh->b_blocknr);
> + "while writing block %llu\n",
> + bh->b_blocknr);
> bh->b_err = retval;
> continue;
> }
> bh->b_dirty = 0;
> bh->b_uptodate = 1;
> } else {
> - jfs_debug(3, "no-op %s for block %lu\n",
> + jfs_debug(3, "no-op %s for block %llu\n",
> rw == READ ? "read" : "write",
> - (unsigned long) bh->b_blocknr);
> + bh->b_blocknr);
> }
> }
> }
> @@ -164,8 +165,8 @@ void brelse(struct buffer_head *bh)
> {
> if (bh->b_dirty)
> ll_rw_block(WRITE, 1, &bh);
> - jfs_debug(3, "freeing block %lu/%p (total %d)\n",
> - (unsigned long) bh->b_blocknr, (void *) bh, --bh_count);
> + jfs_debug(3, "freeing block %llu/%p (total %d)\n",
> + bh->b_blocknr, (void *) bh, --bh_count);
> ext2fs_free_mem(&bh);
> }
>
> @@ -237,7 +238,7 @@ static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
> journal_t *journal = NULL;
> errcode_t retval = 0;
> io_manager io_ptr = 0;
> - unsigned long start = 0;
> + unsigned long long start = 0;
> int ext_journal = 0;
> int tried_backup_jnl = 0;
>
> diff --git a/e2fsck/recovery.c b/e2fsck/recovery.c
> index b669941..e94ef4e 100644
> --- a/e2fsck/recovery.c
> +++ b/e2fsck/recovery.c
> @@ -71,7 +71,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
> {
> int err;
> unsigned int max, nbufs, next;
> - unsigned long blocknr;
> + unsigned long long blocknr;
> struct buffer_head *bh;
>
> struct buffer_head * bufs[MAXBUF];
> @@ -133,7 +133,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
> unsigned int offset)
> {
> int err;
> - unsigned long blocknr;
> + unsigned long long blocknr;
> struct buffer_head *bh;
>
> *bhp = NULL;
> @@ -309,7 +309,6 @@ int journal_skip_recovery(journal_t *journal)
> return err;
> }
>
> -#if 0
> static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
> {
> unsigned long long block = be32_to_cpu(tag->t_blocknr);
> @@ -317,17 +316,16 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
> block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32;
> return block;
> }
> -#endif
>
> /*
> * calc_chksums calculates the checksums for the blocks described in the
> * descriptor block.
> */
> static int calc_chksums(journal_t *journal, struct buffer_head *bh,
> - unsigned long *next_log_block, __u32 *crc32_sum)
> + unsigned long long *next_log_block, __u32 *crc32_sum)
> {
> int i, num_blks, err;
> - unsigned long io_block;
> + unsigned long long io_block;
> struct buffer_head *obh;
>
> num_blks = count_tags(journal, bh);
> @@ -340,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
> err = jread(&obh, journal, io_block);
> if (err) {
> printk(KERN_ERR "JBD: IO error %d recovering block "
> - "%lu in log\n", err, io_block);
> + "%llu in log\n", err, io_block);
> return 1;
> } else {
> *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
> @@ -355,7 +353,7 @@ static int do_one_pass(journal_t *journal,
> struct recovery_info *info, enum passtype pass)
> {
> unsigned int first_commit_ID, next_commit_ID;
> - unsigned long next_log_block;
> + unsigned long long next_log_block;
> int err, success = 0;
> journal_superblock_t * sb;
> journal_header_t * tmp;
> @@ -485,7 +483,7 @@ static int do_one_pass(journal_t *journal,
> tagp = &bh->b_data[sizeof(journal_header_t)];
> while ((tagp - bh->b_data + tag_bytes)
> <= journal->j_blocksize) {
> - unsigned long io_block;
> + unsigned long long io_block;
>
> tag = (journal_block_tag_t *) tagp;
> flags = be32_to_cpu(tag->t_flags);
> @@ -499,13 +497,14 @@ static int do_one_pass(journal_t *journal,
> success = err;
> printk (KERN_ERR
> "JBD: IO error %d recovering "
> - "block %ld in log\n",
> + "block %llu in log\n",
> err, io_block);
> } else {
> - unsigned long blocknr;
> + unsigned long long blocknr;
>
> J_ASSERT(obh != NULL);
> - blocknr = be32_to_cpu(tag->t_blocknr);
> + blocknr = read_tag_block(tag_bytes,
> + tag);
>
> /* If the block has been
> * revoked, then we're all done
> @@ -733,7 +732,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
> record_len = 8;
>
> while (offset < max) {
> - unsigned long blocknr;
> + unsigned long long blocknr;
> int err;
>
> if (record_len == 4)
> --
> 1.7.10.rc3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html