2008-01-08 19:33:40

by Josef Bacik

[permalink] [raw]
Subject: [PATCH] e2fsprogs: play with 8TB to 16TB fs's better

Hello,

Currently in order to mkfs a >8TB fs with 4k blocks you have to do -F, which is
kind of dumb since you can go to 16TB-4k. This patch removes that one check
that won't let you go above 8TB and fixes getsize so that if you have a 16TB fs
instead of complaining that the device is too big just -1 from the size and go
ahead on business as usual. Tested this with my box and it works fine (yay
having >16tb to play with). Thank you,

Josef


Index: e2fsprogs/lib/ext2fs/getsize.c
===================================================================
--- e2fsprogs.orig/lib/ext2fs/getsize.c
+++ e2fsprogs/lib/ext2fs/getsize.c
@@ -190,8 +190,13 @@ errcode_t ext2fs_get_device_size(const c
ioctl(fd, BLKGETSIZE64, &size64) >= 0) {
if ((sizeof(*retblocks) < sizeof(unsigned long long)) &&
((size64 / blocksize) > 0xFFFFFFFF)) {
- rc = EFBIG;
- goto out;
+ /* 16tb fs is fine, just adjust slightly */
+ if ((size64 / blocksize) == 0x100000000) {
+ size64--;
+ } else {
+ rc = EFBIG;
+ goto out;
+ }
}
*retblocks = size64 / blocksize;
goto out;
@@ -252,13 +257,19 @@ errcode_t ext2fs_get_device_size(const c
struct stat st;
if (fstat(fd, &st) == 0)
#endif
+ size64 = st.st_size;
if (S_ISREG(st.st_mode)) {
if ((sizeof(*retblocks) < sizeof(unsigned long long)) &&
- ((st.st_size / blocksize) > 0xFFFFFFFF)) {
- rc = EFBIG;
- goto out;
+ ((size64 / blocksize) > 0xFFFFFFFF)) {
+ /* 16tb fs is fine, just adjust slightly */
+ if ((size64 / blocksize) > 0x100000000) {
+ size64--;
+ } else {
+ rc = EFBIG;
+ goto out;
+ }
}
- *retblocks = st.st_size / blocksize;
+ *retblocks = size64 / blocksize;
goto out;
}
}
@@ -284,8 +295,13 @@ errcode_t ext2fs_get_device_size(const c
size64 = low + 1;
if ((sizeof(*retblocks) < sizeof(unsigned long long))
&& ((size64 / blocksize) > 0xFFFFFFFF)) {
- rc = EFBIG;
- goto out;
+ /* 16tb fs is fine, just adjust slightly */
+ if ((size64 / blocksize) > 0x100000000) {
+ size64--;
+ } else {
+ rc = EFBIG;
+ goto out;
+ }
}
*retblocks = size64 / blocksize;
out:
Index: e2fsprogs/misc/mke2fs.c
===================================================================
--- e2fsprogs.orig/misc/mke2fs.c
+++ e2fsprogs/misc/mke2fs.c
@@ -1455,13 +1455,6 @@ static void PRS(int argc, char *argv[])
}
}

- if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
- com_err(program_name, 0,
- _("Filesystem too large. No more than 2**31-1 blocks\n"
- "\t (8TB using a blocksize of 4k) are currently supported."));
- exit(1);
- }
-
if ((blocksize > 4096) &&
(fs_param.s_feature_compat & EXT3_FEATURE_COMPAT_HAS_JOURNAL))
fprintf(stderr, _("\nWarning: some 2.4 kernels do not support "


2008-01-08 23:02:18

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsprogs: play with 8TB to 16TB fs's better

On Jan 08, 2008 14:33 -0500, Josef Bacik wrote:
> @@ -190,8 +190,13 @@ errcode_t ext2fs_get_device_size(const c
> ioctl(fd, BLKGETSIZE64, &size64) >= 0) {
> if ((sizeof(*retblocks) < sizeof(unsigned long long)) &&
> ((size64 / blocksize) > 0xFFFFFFFF)) {
> - rc = EFBIG;
> - goto out;
> + /* 16tb fs is fine, just adjust slightly */
> + if ((size64 / blocksize) == 0x100000000) {
> + size64--;
> + } else {
> + rc = EFBIG;
> + goto out;
> + }

It might be cleaner to localize this check/fixup into a small helper function?

> +++ e2fsprogs/misc/mke2fs.c
> @@ -1455,13 +1455,6 @@ static void PRS(int argc, char *argv[])
> - if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
> - com_err(program_name, 0,
> - _("Filesystem too large. No more than 2**31-1 blocks\n"
> - "\t (8TB using a blocksize of 4k) are currently supported."));
> - exit(1);
> - }
> -
> if ((blocksize > 4096) &&
> (fs_param.s_feature_compat & EXT3_FEATURE_COMPAT_HAS_JOURNAL))
> fprintf(stderr, _("\nWarning: some 2.4 kernels do not support "

It is also worthwhile to report at least a warning for filesystems larger
than 0x7fffffff blocks that older kernels (2.6.18 and older, IIRC) don't
necessarily work correctly with such large filesystems.

Doing something like having mke2fs zero out block 1, flush it from cache
with ioctl(BLKFLSBUF), then write some data at 8TB+1 to verify it doesn't
clobber block 1 might also be prudent. I've seen some RAID arrays do this
in the past, and when we pass 0xffffffff blocks we should do the same so
it may as well be a simple helper function.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-01-09 21:05:19

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH] e2fsprogs: play with 8TB to 16TB fs's better

On Tue, Jan 08, 2008 at 04:02:16PM -0700, Andreas Dilger wrote:
> On Jan 08, 2008 14:33 -0500, Josef Bacik wrote:
> > @@ -190,8 +190,13 @@ errcode_t ext2fs_get_device_size(const c
> > ioctl(fd, BLKGETSIZE64, &size64) >= 0) {
> > if ((sizeof(*retblocks) < sizeof(unsigned long long)) &&
> > ((size64 / blocksize) > 0xFFFFFFFF)) {
> > - rc = EFBIG;
> > - goto out;
> > + /* 16tb fs is fine, just adjust slightly */
> > + if ((size64 / blocksize) == 0x100000000) {
> > + size64--;
> > + } else {
> > + rc = EFBIG;
> > + goto out;
> > + }
>
> It might be cleaner to localize this check/fixup into a small helper function?
>
> > +++ e2fsprogs/misc/mke2fs.c
> > @@ -1455,13 +1455,6 @@ static void PRS(int argc, char *argv[])
> > - if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
> > - com_err(program_name, 0,
> > - _("Filesystem too large. No more than 2**31-1 blocks\n"
> > - "\t (8TB using a blocksize of 4k) are currently supported."));
> > - exit(1);
> > - }
> > -
> > if ((blocksize > 4096) &&
> > (fs_param.s_feature_compat & EXT3_FEATURE_COMPAT_HAS_JOURNAL))
> > fprintf(stderr, _("\nWarning: some 2.4 kernels do not support "
>
> It is also worthwhile to report at least a warning for filesystems larger
> than 0x7fffffff blocks that older kernels (2.6.18 and older, IIRC) don't
> necessarily work correctly with such large filesystems.
>
> Doing something like having mke2fs zero out block 1, flush it from cache
> with ioctl(BLKFLSBUF), then write some data at 8TB+1 to verify it doesn't
> clobber block 1 might also be prudent. I've seen some RAID arrays do this
> in the past, and when we pass 0xffffffff blocks we should do the same so
> it may as well be a simple helper function.
>

Ok I've reworked this with your comments in mind. Tested it with 6TB, 8TB,
10TB, 16TB and 17TB to make sure everything was kosher. Let me know how this
works. Thank you,

Josef


Index: e2fsprogs/lib/ext2fs/getsize.c
===================================================================
--- e2fsprogs.orig/lib/ext2fs/getsize.c
+++ e2fsprogs/lib/ext2fs/getsize.c
@@ -135,6 +135,21 @@ static int valid_offset (int fd, ext2_lo
return 1;
}

+static int valid_size(unsigned long long *size64, int blocksize)
+{
+ /* see if we are above 16tb */
+ if ((*size64 / blocksize) > 0xFFFFFFFF) {
+ /* if we are just at 16tb adjust the size slightly */
+ if ((*size64 / blocksize) == 0x100000000) {
+ (*size64)--;
+ return 1;
+ } else
+ return 0;
+ }
+
+ return 1;
+}
+
/*
* Returns the number of blocks in a partition
*/
@@ -189,7 +204,7 @@ errcode_t ext2fs_get_device_size(const c
if (valid_blkgetsize64 &&
ioctl(fd, BLKGETSIZE64, &size64) >= 0) {
if ((sizeof(*retblocks) < sizeof(unsigned long long)) &&
- ((size64 / blocksize) > 0xFFFFFFFF)) {
+ !valid_size(&size64, blocksize)) {
rc = EFBIG;
goto out;
}
@@ -252,13 +267,14 @@ errcode_t ext2fs_get_device_size(const c
struct stat st;
if (fstat(fd, &st) == 0)
#endif
+ size64 = st.st_size;
if (S_ISREG(st.st_mode)) {
if ((sizeof(*retblocks) < sizeof(unsigned long long)) &&
- ((st.st_size / blocksize) > 0xFFFFFFFF)) {
+ !valid_size(&size64, blocksize)) {
rc = EFBIG;
goto out;
}
- *retblocks = st.st_size / blocksize;
+ *retblocks = size64 / blocksize;
goto out;
}
}
@@ -283,7 +299,7 @@ errcode_t ext2fs_get_device_size(const c
valid_offset (fd, 0);
size64 = low + 1;
if ((sizeof(*retblocks) < sizeof(unsigned long long))
- && ((size64 / blocksize) > 0xFFFFFFFF)) {
+ && !valid_size(&size64, blocksize)) {
rc = EFBIG;
goto out;
}
Index: e2fsprogs/misc/mke2fs.c
===================================================================
--- e2fsprogs.orig/misc/mke2fs.c
+++ e2fsprogs/misc/mke2fs.c
@@ -916,6 +916,90 @@ static void edit_feature(const char *str
}
}

+static int check_for_wrap(const char *file, int blocksize)
+{
+ int fd, tmp, total = 0;
+ char buffer[blocksize];
+
+#ifdef HAVE_OPEN64
+ fd = open64(file, O_RDWR);
+#else
+ fd = open(file, O_RDWR);
+#endif
+
+ if (fd < 0) {
+ fprintf(stderr, "Error opening disk %s\n", file);
+ exit(1);
+ }
+
+ memset(buffer, 0, blocksize);
+ ext2fs_llseek(fd, 1*blocksize, SEEK_SET);
+
+ while (total < blocksize) {
+ tmp = write(fd, buffer+total, blocksize-total);
+ if (tmp < 0) {
+ fprintf(stderr, "Error writing to disk %s\n", file);
+ close(fd);
+ exit(1);
+ }
+
+ total += tmp;
+ }
+
+ if (ext2fs_sync_device(fd, 1)) {
+ fprintf(stderr, "Error flushing cache to disk %s\n", file);
+ close(fd);
+ exit(1);
+ }
+
+ memset(buffer, 0xa, blocksize);
+ ext2fs_llseek(fd, ((1UL << 31)+1)*blocksize, SEEK_SET);
+ total = 0;
+
+ while (total < blocksize) {
+ tmp = write(fd, buffer+total, blocksize-total);
+ if (tmp < 0) {
+ fprintf(stderr, "Error writing to disk %s\n", file);
+ close(fd);
+ exit(1);
+ }
+
+ total += tmp;
+ }
+
+ if (ext2fs_sync_device(fd, 1)) {
+ fprintf(stderr, "Error flushing cache to disk %s\n", file);
+ close(fd);
+ exit(1);
+ }
+
+ memset(buffer, 0xa, blocksize);
+ ext2fs_llseek(fd, 1*blocksize, SEEK_SET);
+ total = 0;
+
+ while (total < blocksize) {
+ tmp = read(fd, buffer+total, blocksize-total);
+ if (tmp < 0) {
+ fprintf(stderr, "Error reading from disk %s\n", file);
+ close(fd);
+ exit(1);
+ }
+
+ total += tmp;
+ }
+
+ for (tmp = 0; tmp < blocksize; tmp++) {
+ if (buffer[tmp] != 0x0) {
+ close(fd);
+ return -1;
+ }
+ }
+
+ close(fd);
+
+ return 0;
+}
+
extern const char *mke2fs_default_profile;
static const char *default_files[] = { "<default>", 0 };

@@ -1455,11 +1539,22 @@ static void PRS(int argc, char *argv[])
}
}

- if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
- com_err(program_name, 0,
- _("Filesystem too large. No more than 2**31-1 blocks\n"
- "\t (8TB using a blocksize of 4k) are currently supported."));
- exit(1);
+ if (fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
+ if (!noaction) {
+ retval = check_for_wrap(device_name,
+ EXT2_BLOCK_SIZE(&fs_param));
+ if (retval) {
+ com_err(program_name, retval, "Write wrapped, "
+ "filesystem is too large for the disk "
+ "to handle\n");
+ exit(1);
+ }
+ }
+
+ fprintf(stderr, "\nWarning: older 2.6 kernels (2.6.18 and "
+ "older) may have problems with such a \n\tlarge "
+ "filesystem. If you have problems try a newer "
+ "kernel\n");
}

if ((blocksize > 4096) &&

2008-01-10 03:59:37

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsprogs: play with 8TB to 16TB fs's better

On Jan 09, 2008 16:04 -0500, Josef Bacik wrote:
> +static int valid_size(unsigned long long *size64, int blocksize)
> +{
> + /* see if we are above 16tb */
> + if ((*size64 / blocksize) > 0xFFFFFFFF) {
> + /* if we are just at 16tb adjust the size slightly */

I'd change these comments to read "2^32 blocks" instead of "16TB" because
this is a different size with 1kB or 16kB blocks...

> +static int check_for_wrap(const char *file, int blocksize)
> +{
> + int fd, tmp, total = 0;
> + char buffer[blocksize];

I don't think this is ANSI-C to declare the stack variable size based
on a function parameter. This instead should malloc() a buffer instead.
It probably makes sense to make it a unique non-zero pattern, just to
catch the case where the block is not read from disk, or is read from some
other spot on disk that IS zero.

> +#ifdef HAVE_OPEN64
> + fd = open64(file, O_RDWR);
> +#else
> + fd = open(file, O_RDWR);
> +#endif

It would be desirable to use the ext2fs_open() fs handle, and
io_channel_write_block() to do the write at block 2^31+1 to verify the libext2fs
code. It appears the io_channel_{read,write}_block() code would at least work
with 64-bit offsets on 64-bit architectures, because it takes an unsigned long
as the block number...

The other issue is that none of the unix IO calls will be portable to the
other IO managers, and will circumvent the undo IO manager, so it is better
to use the normal io_channel_{read,write}_block().

> + if (ext2fs_sync_device(fd, 1)) {
> + fprintf(stderr, "Error flushing cache to disk %s\n", file);
> + close(fd);
> + exit(1);
> + }

I'm not sure we need a sync+flush before the second write?

> + memset(buffer, 0xa, blocksize);

It would be better to set this to some more unique pattern (e.g. current
time in first 8 bytes) to ensure there isn't some hold-over data from a
previous run or something.

> + memset(buffer, 0xa, blocksize);

I don't think we need this second memset, since the buffer should still be
the existing non-zero data.

> + for (tmp = 0; tmp < blocksize; tmp++) {
> + if (buffer[tmp] != 0x0) {
> + close(fd);
> + return -1;
> + }

Would probably be easier to just allocate a second buffer, copy it from
the original buffer at the beginning of the test, and then memcmp() it
against the block read from disk.
> + com_err(program_name, retval, "Write wrapped, "
> + "filesystem is too large for the disk "
> + "to handle\n");
> +
> + fprintf(stderr, "\nWarning: older 2.6 kernels (2.6.18 and "
> + "older) may have problems with such a \n\tlarge "
> + "filesystem. If you have problems try a newer "
> + "kernel\n");

Why are some messages com_err() and others fprintf()? The content of the
message is good though.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-01-10 21:54:05

by Josef Bacik

[permalink] [raw]
Subject: Re: [PATCH] e2fsprogs: play with 8TB to 16TB fs's better

Hello,

Here's take 3. I've moved the blocks >= 1 << 31 check after the instantiation
of the ext2_filsys so I can use the io manager stuff directly. Also made a few
tweaks and such based on Andreas' comments. Let me know if this works. Thank
you,

Josef


Index: e2fsprogs/lib/ext2fs/getsize.c
===================================================================
--- e2fsprogs.orig/lib/ext2fs/getsize.c
+++ e2fsprogs/lib/ext2fs/getsize.c
@@ -135,6 +135,21 @@ static int valid_offset (int fd, ext2_lo
return 1;
}

+static int valid_size(unsigned long long *size64, int blocksize)
+{
+ /* see if we are above 16tb */
+ if ((*size64 / blocksize) > 0xFFFFFFFF) {
+ /* if we are just at 2^32 blocks adjust the size slightly */
+ if ((*size64 / blocksize) == 0x100000000) {
+ (*size64)--;
+ return 1;
+ } else
+ return 0;
+ }
+
+ return 1;
+}
+
/*
* Returns the number of blocks in a partition
*/
@@ -189,7 +204,7 @@ errcode_t ext2fs_get_device_size(const c
if (valid_blkgetsize64 &&
ioctl(fd, BLKGETSIZE64, &size64) >= 0) {
if ((sizeof(*retblocks) < sizeof(unsigned long long)) &&
- ((size64 / blocksize) > 0xFFFFFFFF)) {
+ !valid_size(&size64, blocksize)) {
rc = EFBIG;
goto out;
}
@@ -252,13 +267,14 @@ errcode_t ext2fs_get_device_size(const c
struct stat st;
if (fstat(fd, &st) == 0)
#endif
+ size64 = st.st_size;
if (S_ISREG(st.st_mode)) {
if ((sizeof(*retblocks) < sizeof(unsigned long long)) &&
- ((st.st_size / blocksize) > 0xFFFFFFFF)) {
+ !valid_size(&size64, blocksize)) {
rc = EFBIG;
goto out;
}
- *retblocks = st.st_size / blocksize;
+ *retblocks = size64 / blocksize;
goto out;
}
}
@@ -283,7 +299,7 @@ errcode_t ext2fs_get_device_size(const c
valid_offset (fd, 0);
size64 = low + 1;
if ((sizeof(*retblocks) < sizeof(unsigned long long))
- && ((size64 / blocksize) > 0xFFFFFFFF)) {
+ && !valid_size(&size64, blocksize)) {
rc = EFBIG;
goto out;
}
Index: e2fsprogs/misc/mke2fs.c
===================================================================
--- e2fsprogs.orig/misc/mke2fs.c
+++ e2fsprogs/misc/mke2fs.c
@@ -916,6 +916,54 @@ static void edit_feature(const char *str
}
}

+static errcode_t check_for_wrap(ext2_filsys fs)
+{
+ char *buf, *orig;
+ errcode_t retval;
+
+ buf = malloc(fs->blocksize);
+ if (!buf) {
+ com_err(program_name, retval, "trying to allocate buffer\n");
+ exit(1);
+ }
+
+ orig = malloc(fs->blocksize);
+ if (!orig) {
+ com_err(program_name, retval, "trying to allocate buffer\n");
+ free(buf);
+ exit(1);
+ }
+
+ memset(buf, 0, fs->blocksize);
+ memset(orig, 0, fs->blocksize);
+
+ retval = io_channel_write_blk(fs->io, 1, 1, buf);
+ if (retval)
+ goto out;
+
+ memset(buf, 0xdead, fs->blocksize);
+ retval = io_channel_write_blk(fs->io, (1UL << 31)+1, 1, buf);
+ if (retval)
+ goto out;
+
+ retval = io_channel_flush(fs->io);
+ if (retval)
+ goto out;
+
+ retval = io_channel_read_blk(fs->io, 1, 1, buf);
+ if (retval)
+ goto out;
+
+ if (memcmp(buf, orig, fs->blocksize))
+ retval = -1;
+
+out:
+ free(buf);
+ free(orig);
+
+ return retval;
+}
+
extern const char *mke2fs_default_profile;
static const char *default_files[] = { "<default>", 0 };

@@ -1455,13 +1503,6 @@ static void PRS(int argc, char *argv[])
}
}

- if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
- com_err(program_name, 0,
- _("Filesystem too large. No more than 2**31-1 blocks\n"
- "\t (8TB using a blocksize of 4k) are currently supported."));
- exit(1);
- }
-
if ((blocksize > 4096) &&
(fs_param.s_feature_compat & EXT3_FEATURE_COMPAT_HAS_JOURNAL))
fprintf(stderr, _("\nWarning: some 2.4 kernels do not support "
@@ -1572,6 +1613,23 @@ int main (int argc, char *argv[])
exit(1);
}

+ if (fs->super->s_blocks_count >= ((unsigned) 1 << 31)) {
+ if (!noaction) {
+ retval = check_for_wrap(fs);
+ if (retval) {
+ com_err(program_name, retval, "Write wrapped,"
+ "filesystem is too large for the disk"
+ "to handle %d\n", (signed)retval);
+ exit(1);
+ }
+ }
+
+ com_err(program_name, 0, "\nWarning: older 2.6 kernels "
+ "(2.6.18 and older) may have problems with such a \n\t"
+ "large filesystem. If you have problems try a newer "
+ "kernel\n");
+ }
+
/*
* Wipe out the old on-disk superblock
*/