Subject: created ext4 disk image differs depending on the underlying filesystem

Hi,

I originally observed this issue when creating ext4 disk images on a 9p
filesystem, which differed from the images I created on a tmpfs. I observed
that the difference also exists when the underlying filesystem is fat32, so I'm
using this as an example here. For what it's worth, the ext4 filesystem images
created on a tmpfs are identical to those created on an ext4 fs. To demonstrate
the issue, please see the script at the end of this mail (it requires sudo to
mount and unmount the fat32 disk image). As you can see from the printed
hashes, the disk images produced outside the fat32 disk are always identical,
as expected. The diff between the reproducible images and those stored on fat32
is also very short, but I don't know what data is stored at those offsets:

@@ -85,7 +85,7 @@
00000540: 0000 0000 0000 0000 0000 0000 0000 1000 ................
00000550: 0000 0000 0000 0000 0000 0000 2000 2000 ............ . .
00000560: 0200 0000 0000 0000 0000 0000 0000 0000 ................
-00000570: 0000 0000 0401 0000 8c04 0000 0000 0000 ................
+00000570: 0000 0000 0401 0000 4900 0000 0000 0000 ........I.......
00000580: 0000 0000 0000 0000 0000 0000 0000 0000 ................
00000590: 0000 0000 0000 0000 0000 0000 0000 0000 ................
000005a0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
@@ -125,9 +125,9 @@
000007c0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
000007d0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
000007e0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
-000007f0: 0000 0000 0000 0000 0000 0000 264c 0251 ............&L.Q
+000007f0: 0000 0000 0000 0000 0000 0000 64ca bba5 ............d...
00000800: 1200 0000 2200 0000 3200 0000 9d03 7300 ...."...2.....s.
-00000810: 0200 0000 0000 0000 babb 8a41 7300 2004 ...........As. .
+00000810: 0200 0400 0000 0000 babb 8a41 7300 69f5 ...........As.i.
00000820: 0000 0000 0000 0000 0000 0000 0000 0000 ................
00000830: 0000 0000 0000 0000 bc7a 6e31 0000 0000 .........zn1....
00000840: 0000 0000 0000 0000 0000 0000 0000 0000 ................

Any idea what is going on? Is there a better way to diff two ext4 disk images
than diffing the xxd output? If I try diffing the dumpe2fs output I get these
differences:

@@ -32,7 +32,7 @@
Maximum mount count: -1
Last checked: Fri May 3 16:14:49 2024
Check interval: 0 (<none>)
-Lifetime writes: 1164 kB
+Lifetime writes: 73 kB
Reserved blocks uid: 0 (user root)
Reserved blocks gid: 0 (group root)
First inode: 11
@@ -44,7 +44,7 @@
Directory Hash Seed: 0b7f9cfd-0113-486c-a453-4f5483bd486b
Journal backup: inode blocks
Checksum type: crc32c
-Checksum: 0x51024c26
+Checksum: 0xa5bbca64
Checksum seed: 0xf81d767d
Orphan file inode: 12
Journal features: (none)
@@ -56,7 +56,7 @@
Journal start: 0


-Group 0: (Blocks 1-2047) csum 0x0420
+Group 0: (Blocks 1-2047) csum 0xf569 [ITABLE_ZEROED]
Primary superblock at 1, Group descriptors at 2-2
Reserved GDT blocks at 3-17
Block bitmap at 18 (+17), csum 0x7abcbbba

Why would these bits differ depending on the filesystem on which the disk image
is stored? Is there a way to equalize this information so that the disk image
looks the same independent of the underlying filesystem?

Thanks!

cheers, josch

#!/bin/sh
set -eu
mkfs() {
	imgpath="$1"
	rm -f "$imgpath"
	dd if=/dev/zero of="$imgpath" bs=1024 count=2048 2>/dev/null
	echo H4sIAAAAAAAAA+3OQQrCMBCF4Vl7ihwho9PkPKVEtJgU2rjo7a240JXSRSnC/20ew5vFy/P5ekulzUk24xfB7JkaG/+ZL3oUtaCnYE2IUZZbTcX57Sa93afajs5JP0zd5cvfr/5P5bkbSk2lHvZeAgAAAAAAAAAAAAAAAABY4wEWZDwwACgAAA== \
		| base64 -d \
		| env LC_ALL=C.UTF-8 SOURCE_DATE_EPOCH=1714745689 /sbin/mke2fs -d - \
			-q -F -o Linux -T ext4 -O metadata_csum,64bit \
			-U 0b7f9cfd-0113-486c-a453-4f5483bd486b \
			-E hash_seed=0b7f9cfd-0113-486c-a453-4f5483bd486b \
			-b 1024 "$imgpath"
	md5sum "$imgpath"
}

mkfs "/dev/shm/disk.ext4"
mkfs disk.ext4

rm -f fat32.img
mkdir -p mnt
dd if=/dev/zero of=fat32.img bs=1024 count=65536 2>/dev/null
/sbin/mkfs.vfat -F 32 fat32.img
sudo mount -o rw,umask=0000 fat32.img mnt
mkfs mnt/disk.ext4
bash -c 'diff -u <(xxd mnt/disk.ext4) <(xxd disk.ext4) || true'
bash -c 'diff -u <(/sbin/dumpe2fs mnt/disk.ext4) <(/sbin/dumpe2fs disk.ext4) || true'
sudo umount mnt
mkfs disk.ext4
mkfs "/dev/shm/disk.ext4"
rm "/dev/shm/disk.ext4" disk.ext4 fat32.img
rmdir mnt



Subject: Re: created ext4 disk image differs depending on the underlying filesystem

Quoting Johannes Schauer Marin Rodrigues (2024-05-04 16:32:50)
> I originally observed this issue when creating ext4 disk images on a 9p
> filesystem, which differed from the images I created on a tmpfs. I observed
> that the difference also exists when the underlying filesystem is fat32, so
> I'm using this as an example here. For what it's worth, the ext4 filesystem
> images created on a tmpfs are identical to those created on an ext4 fs. To
> demonstrate the issue, please see the script at the end of this mail (it
> requires sudo to mount and unmount the fat32 disk image). As you can see from
> the printed hashes, the disk images produced outside the fat32 disk are
> always identical, as expected. The diff between the reproducible images and
> those stored on fat32 is also very short, but I don't know what data is
> stored at those offsets:
>
> @@ -85,7 +85,7 @@
> 00000540: 0000 0000 0000 0000 0000 0000 0000 1000 ................
> 00000550: 0000 0000 0000 0000 0000 0000 2000 2000 ............ . .
> 00000560: 0200 0000 0000 0000 0000 0000 0000 0000 ................
> -00000570: 0000 0000 0401 0000 8c04 0000 0000 0000 ................
> +00000570: 0000 0000 0401 0000 4900 0000 0000 0000 ........I.......
> 00000580: 0000 0000 0000 0000 0000 0000 0000 0000 ................
> 00000590: 0000 0000 0000 0000 0000 0000 0000 0000 ................
> 000005a0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
> @@ -125,9 +125,9 @@
> 000007c0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
> 000007d0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
> 000007e0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
> -000007f0: 0000 0000 0000 0000 0000 0000 264c 0251 ............&L.Q
> +000007f0: 0000 0000 0000 0000 0000 0000 64ca bba5 ............d...
> 00000800: 1200 0000 2200 0000 3200 0000 9d03 7300 ...."...2.....s.
> -00000810: 0200 0000 0000 0000 babb 8a41 7300 2004 ...........As. .
> +00000810: 0200 0400 0000 0000 babb 8a41 7300 69f5 ...........As.i.
> 00000820: 0000 0000 0000 0000 0000 0000 0000 0000 ................
> 00000830: 0000 0000 0000 0000 bc7a 6e31 0000 0000 .........zn1....
> 00000840: 0000 0000 0000 0000 0000 0000 0000 0000 ................
>
> Any idea what is going on? Is there a better way to diff two ext4 disk images
> than diffing the xxd output? If I try diffing the dumpe2fs output I get these
> differences:
>
> @@ -32,7 +32,7 @@
> Maximum mount count: -1
> Last checked: Fri May 3 16:14:49 2024
> Check interval: 0 (<none>)
> -Lifetime writes: 1164 kB
> +Lifetime writes: 73 kB
> Reserved blocks uid: 0 (user root)
> Reserved blocks gid: 0 (group root)
> First inode: 11
> @@ -44,7 +44,7 @@
> Directory Hash Seed: 0b7f9cfd-0113-486c-a453-4f5483bd486b
> Journal backup: inode blocks
> Checksum type: crc32c
> -Checksum: 0x51024c26
> +Checksum: 0xa5bbca64
> Checksum seed: 0xf81d767d
> Orphan file inode: 12
> Journal features: (none)
> @@ -56,7 +56,7 @@
> Journal start: 0
>
>
> -Group 0: (Blocks 1-2047) csum 0x0420
> +Group 0: (Blocks 1-2047) csum 0xf569 [ITABLE_ZEROED]
> Primary superblock at 1, Group descriptors at 2-2
> Reserved GDT blocks at 3-17
> Block bitmap at 18 (+17), csum 0x7abcbbba
>
> Why would these bits differ depending on the filesystem on which the disk image
> is stored? Is there a way to equalize this information so that the disk image
> looks the same independent of the underlying filesystem?

The diff becomes a bit smaller when using
-E lazy_itable_init=0,assume_storage_prezeroed=0,nodiscard

@@ -85,7 +85,7 @@
00000540: 0000 0000 0000 0000 0000 0000 0000 1000 ................
00000550: 0000 0000 0000 0000 0000 0000 2000 2000 ............ . .
00000560: 0200 0000 0000 0000 0000 0000 0000 0000 ................
-00000570: 0000 0000 0401 0000 ac04 0000 0000 0000 ................
+00000570: 0000 0000 0401 0000 4900 0000 0000 0000 ........I.......
00000580: 0000 0000 0000 0000 0000 0000 0000 0000 ................
00000590: 0000 0000 0000 0000 0000 0000 0000 0000 ................
000005a0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
@@ -125,7 +125,7 @@
000007c0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
000007d0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
000007e0: 0000 0000 0000 0000 0000 0000 0000 0000 ................
-000007f0: 0000 0000 0000 0000 0000 0000 fb8d a90f ................
+000007f0: 0000 0000 0000 0000 0000 0000 64ca bba5 ............d...
00000800: 1200 0000 2200 0000 3200 0000 9d03 7300 ...."...2.....s.
00000810: 0200 0400 0000 0000 babb 8a41 7300 69f5 ...........As.i.
00000820: 0000 0000 0000 0000 0000 0000 0000 0000 ................

@@ -32,7 +32,7 @@
Maximum mount count: -1
Last checked: Fri May 3 16:14:49 2024
Check interval: 0 (<none>)
-Lifetime writes: 1196 kB
+Lifetime writes: 73 kB
Reserved blocks uid: 0 (user root)
Reserved blocks gid: 0 (group root)
First inode: 11
@@ -44,7 +44,7 @@
Directory Hash Seed: 0b7f9cfd-0113-486c-a453-4f5483bd486b
Journal backup: inode blocks
Checksum type: crc32c
-Checksum: 0x0fa98dfb
+Checksum: 0xa5bbca64
Checksum seed: 0xf81d767d
Orphan file inode: 12
Journal features: (none)


The "Lifetime writes" being much higher on fat32 suggests that despite
"nodiscard", less zeroes were written out when ext4 or tmpfs are the underlying
FS?

Thanks!

cheers, josch



Date: 2024-05-05 00:10:52
From: Theodore Ts'o
Subject: Re: created ext4 disk image differs depending on the underlying filesystem

On Sat, May 04, 2024 at 07:53:29PM +0200, Johannes Schauer Marin Rodrigues wrote:
> >
> > Any idea what is going on?

The fundamental issue has to do with how ext2fs_zero_blocks() in
lib/ext2fs/mkjournal.c is implemented.

> The "Lifetime writes" being much higher on fat32 suggests that despite
> "nodiscard", less zeroes were written out when ext4 or tmpfs are the underlying
> FS?

Yes, that's exactly right.

The ext2fs_zero_blocks() function will attempt to call the io
channel's zeroout function --- for Unix systems, that's
lib/ext2fs/unix_io.c's __unix_zeroout() function. This will attempt
to use fallocate's FALLOC_FL_ZERO_RANGE or FALLOC_FL_PUNCH_HOLE to
zero a range of blocks. Now, exactly how ZERO_RANGE and PUNCH_HOLE are
implemented depends on whether the "storage device" being accessed via
unix_io is a block device or a file, and if it is a file, whether the
underlying file system supports ZERO_RANGE or PUNCH_HOLE.

Depending on how the underlying file system supports ZERO_RANGE and/or
PUNCH_HOLE, it may simply manipulate metadata blocks (e.g., ext4's
extent tree) so that the relevant file offsets will return zero --- or
if the file system doesn't support uninitialized extent ranges, and/or
doesn't support sparse files, the file system MAY write all zeros, or
the file system MAY simply return an EOPNOTSUPP error, or the file
system MAY issue a SCSI WRITE SAME or moral equivalent for UFS, NVMe,
etc., if the block device supports it (and this might turn into an
SSD-level discard, so long as it is a reliable discard). And of
course, if unix_io is accessing a block device, depending on the
capabilities of the storage device and its connection bus, this might
also turn into a SCSI WRITE SAME, or some other zeroout command.

Now, the zeroout command doesn't actually increment the lifetime
writes counter. Whether or not it should is an interesting
philosophical question, since it might actually result in writes to
the device, or it might simply involve metadata updates, either
on the underlying file (if the file system supports it), or
potentially in the metadata for the SSD's Flash Translation Layer. At
the userspace level, we simply don't know how FALLOC_FL_ZERO_RANGE and
FALLOC_FL_PUNCH_HOLE will be implemented.

In the case of FAT32, the file system doesn't support sparse files,
and it also doesn't support uninitialized extents. So
FALLOC_FL_ZERO_RANGE and FALLOC_FL_PUNCH_HOLE will fail on a fat32
file system. As a result, ext2fs_zero_blocks() will fall back to
explicitly writing zeros using io_channel_write_blk64(), and this
*does* increment the lifetime writes counter.
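
For illustration, the overall pattern is roughly the following (a
simplified sketch, not the actual e2fsprogs code; zero_range_or_write()
is a made-up name):

#define _GNU_SOURCE
#include <fcntl.h>	/* fallocate(), FALLOC_FL_* */
#include <unistd.h>	/* pwrite() */

static int zero_range_or_write(int fd, off_t offset, off_t len)
{
	/* First, try to zero the range without writing any data blocks. */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len) == 0)
		return 0;
	/* A punched hole (keeping the file size) also reads back as zeros. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      offset, len) == 0)
		return 0;
	/* Fallback, e.g. on fat32: write the zeros explicitly.  This is
	 * the path that bumps the "Lifetime writes" counter. */
	static const char zeroes[4096];
	while (len > 0) {
		size_t n = len > (off_t) sizeof(zeroes) ?
			sizeof(zeroes) : (size_t) len;
		ssize_t ret = pwrite(fd, zeroes, n, offset);
		if (ret < 0)
			return -1;
		offset += ret;
		len -= ret;
	}
	return 0;
}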

If you enhance the script by adding 'ls -ls "$imgpath"' and 'filefrag
-v "$imgpath" || /bin/true', you can see that the disk space consumed
by the image file varies, and it varies even more if you use the
original version of the script that doesn't disable lazy_itable_init,
discard, et al.

Unfortunately tmpfs and fat don't support filefrag -v, but you could
see the difference if you wrote a debugging program which uses lseek's
SEEK_HOLE and SEEK_DATA to see which parts of the file are sparse
(although it won't show which parts of the file are marked
uninitialized, assuming the file system supports it).
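
Such a program only takes a few lines; a minimal sketch (it prints the
data extents, and everything in between is a hole):

#define _GNU_SOURCE	/* SEEK_DATA, SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "Usage: %s file\n", argv[0]);
		exit(1);
	}
	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror(argv[1]);
		exit(1);
	}
	off_t end = lseek(fd, 0, SEEK_END);
	off_t pos = 0;
	while (pos < end) {
		/* find the next data extent; failure (ENXIO) means only
		 * a hole remains until the end of the file */
		off_t data = lseek(fd, pos, SEEK_DATA);
		if (data < 0)
			break;
		/* find where this data extent ends */
		off_t hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			hole = end;
		printf("data: %lld..%lld\n", (long long) data, (long long) hole);
		pos = hole;
	}
	close(fd);
	return 0;
}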


If your goal is to create completely reproducible image files, one
question is whether keeping the checksums identical is enough, or
whether you also care about the underlying file being stored more
efficiently using sparse files or extents marked uninitialized.

Depending on how much you care about reproducibility versus file
storage efficiency, I could imagine adding some kind of option which
disables the zeroout function and forces e2fsprogs to always write
zeros, even if that increases the write wearout rate of the underlying
flash storage and increases the size of the image file. Or I could
imagine some kind of extended option which hacks mke2fs to zero out
the lifetime writes counter.

Cheers,

- Ted

Subject: Re: created ext4 disk image differs depending on the underlying filesystem

Hi Ted,

Thank you very much for your (as usual) very detailed and comprehensive reply!
:D

Quoting Theodore Ts'o (2024-05-05 02:10:20)
> If your goal is to create completely reproducible image files, one question
> is whether keeping the checksums identical is enough, or whether you also
> care about the underlying file being stored more efficiently using sparse
> files or extents marked uninitialized.
>
> Depending on how much you care about reproducibility versus file
> storage efficiency, I could imagine adding some kind of option which
> disables the zeroout function and forces e2fsprogs to always write
> zeros, even if that increases the write wearout rate of the underlying
> flash storage and increases the size of the image file. Or I could
> imagine some kind of extended option which hacks mke2fs to zero out
> the lifetime writes counter.

The good news is that the fix in my situation is very simple: create the
filesystem on a tmpfs first and then copy it onto the 9p fs afterwards. Tada,
the created images will be reproducible. I think there are multiple ways
forward that I'd be happy with:

1. Leave everything as it is. It's just one more copy operation on my end. I
   can just document that if your underlying filesystem is stupid, you might
   not get the same identical image as somebody with a more intelligent
   filesystem does.

2. Allow resetting fs->super->s_kbytes_written to zero. This patch worked for
   me:

--- a/lib/ext2fs/closefs.c
+++ b/lib/ext2fs/closefs.c
@@ -504,6 +504,7 @@ errcode_t ext2fs_close2(ext2_filsys fs, int flags)
 			(fs->blocksize / 1024);
 		if ((fs->flags & EXT2_FLAG_DIRTY) == 0)
 			fs->flags |= EXT2_FLAG_SUPER_ONLY | EXT2_FLAG_DIRTY;
+		fs->super->s_kbytes_written = 0;
 	}
 	if (fs->flags & EXT2_FLAG_DIRTY) {
 		retval = ext2fs_flush2(fs, flags);


   If my goal is to create disk images, one could argue that what the end user
   is interested in is the filesystem writes that *they* performed, and that
   the disk image they receive should therefore have the counter start at zero.

3. Somehow do magic with the zeroout function. If anybody has too much
   free time... ;)

As an end-user I am very interested in keeping the functionality of mke2fs
which keeps track of which parts are actually sparse and which ones are not.
This functionality can be used with tools like "bmaptool" (a more clever dd) to
only copy those parts of the image to the flash drive which are actually
supposed to contain data.

Would you be happy about a patch for (2.)? If yes, I can send something over
once I find some time. :)

Thanks!

cheers, josch



Date: 2024-05-11 21:11:34
From: Theodore Ts'o
Subject: Re: created ext4 disk image differs depending on the underlying filesystem

On Sat, May 11, 2024 at 07:34:42AM +0200, Johannes Schauer Marin Rodrigues wrote:
> 2. Allow resetting fs->super->s_kbytes_written to zero. This patch worked for
> me:
>
> Would you be happy about a patch for (2.)? If yes, I can send something over
> once I find some time. :)
>

I'm currently going back and forth about whether we should just (a)
unconditionally set s_kbytes_written to zero before we write out the
superblock, or (b) whether we add a new extended option, or (c) do
a hack where if SOURCE_DATE_EPOCH is set, use that as implied "set
s_kbytes_written to zero since the user probably cares about a
reproducible file system".

(c) is a bit hacky, but it's the most convenient for users, and it
avoids adding Yet Another extended option.

Related to this is the design question of whether SOURCE_DATE_EPOCH
should imply using a fixed value for s_uuid and s_hash_seed. Again,
it's a little weird to overload SOURCE_DATE_EPOCH to set the uuid
and hash_seed to some fixed value, which might be a time-based UUID
with the ethernet address set to all zeroes, or some other fixed
value. But it's a pretty good proxy for what the user wants, and if
this is the default, the user can always override it via an
extended option if they really want something different.

If it weren't for the fact that I'm considering having SOURCE_DATE_EPOCH
provide default values for s_uuid and s_hash_seed, I'd be tempted to just
unconditionally set s_kbytes_written to zero.

I'm curious what your opinions might be on this, as someone who might
want to use this feature.

> As an end-user I am very interested in keeping the functionality of mke2fs
> which keeps track of which parts are actually sparse and which ones are not.
> This functionality can be used with tools like "bmaptool" (a more clever dd) to
> only copy those parts of the image to the flash drive which are actually
> supposed to contain data.

If the file system where the image is created supports either the
FIEMAP ioctl or lseek's SEEK_HOLE, then "bmaptool create" can figure
out which parts of the file are sparse, so we don't need to make any
changes to e2fsprogs. If the file system doesn't support FIEMAP or
SEEK_HOLE, one could imagine that bmaptool could figure out which
parts of the file could be sparse simply by looking for blocks that
are all zeroes. This is basically what "cp --sparse=always" does, and
what the attached make-sparse.c file does to determine where the holes could be.

Yes, I could imagine adding a new io_manager much like test_io and
undo_io which tracked which blocks had been written, and then would
write out a BMAP file. However, the vast majority of constructed file
systems are quite small, so simply reading all of the blocks to
determine which blocks were all zeroes à la cp --sparse=always isn't
going to involve all that much overhead. And I'd argue the right thing
to do would be to teach bmaptool how to do what cp --sparse=always
does, so that the same interface works regardless of whether bmaptool
is running on a modern file system that supports FIEMAP or SEEK_HOLE,
or some legacy file system like FAT16 or FAT32.

Cheers,

- Ted
/*
 * make-sparse.c --- make a sparse file from stdin
 *
 * Copyright 2004 by Theodore Ts'o.
 *
 * %Begin-Header%
 * This file may be redistributed under the terms of the GNU Public
 * License.
 * %End-Header%
 */

#define _LARGEFILE_SOURCE
#define _LARGEFILE64_SOURCE

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

int full_read(int fd, char *buf, size_t count)
{
	int got, total = 0;
	int pass = 0;

	while (count > 0) {
		got = read(fd, buf, count);
		if (got == -1) {
			if ((errno == EINTR) || (errno == EAGAIN))
				continue;
			return total ? total : -1;
		}
		if (got == 0) {
			if (pass++ >= 3)
				return total;
			continue;
		}
		pass = 0;
		buf += got;
		total += got;
		count -= got;
	}
	return total;
}

int main(int argc, char **argv)
{
	int fd, got, i;
	int zflag = 0;
	char buf[1024];

	if (argc != 2) {
		fprintf(stderr, "Usage: make-sparse out-file\n");
		exit(1);
	}
	fd = open(argv[1], O_WRONLY|O_CREAT|O_TRUNC|O_LARGEFILE, 0777);
	if (fd < 0) {
		perror(argv[1]);
		exit(1);
	}
	while (1) {
		got = full_read(0, buf, sizeof(buf));
		if (got == 0)
			break;
		if (got == sizeof(buf)) {
			for (i=0; i < sizeof(buf); i++)
				if (buf[i])
					break;
			if (i == sizeof(buf)) {
				lseek(fd, sizeof(buf), SEEK_CUR);
				zflag = 1;
				continue;
			}
		}
		zflag = 0;
		write(fd, buf, got);
	}
	if (zflag) {
		lseek(fd, -1, SEEK_CUR);
		buf[0] = 0;
		write(fd, buf, 1);
	}
	return 0;
}


Subject: Re: created ext4 disk image differs depending on the underlying filesystem

Hi Ted,

Quoting Theodore Ts'o (2024-05-11 23:11:11)
> On Sat, May 11, 2024 at 07:34:42AM +0200, Johannes Schauer Marin Rodrigues wrote:
> > 2. Allow resetting fs->super->s_kbytes_written to zero. This patch worked for
> > me:
> >
> > Would you be happy about a patch for (2.)? If yes, I can send something over
> > once I find some time. :)
> >
>
> I'm currently going back and forth about whether we should just (a)
> unconditionally set s_kbytes_written to zero before we write out the
> superblock, or (b) whether we add a new extended option, or (c) do
> a hack where if SOURCE_DATE_EPOCH is set, use that as implied "set
> s_kbytes_written to zero since the user probably cares about a
> reproducible file system".
>
> (c) is a bit hacky, but it's the most convenient for users, and it
> avoids adding Yet Another extended option.

When changing defaults in my own software, I try to think about whether, and
if so how, users will be able to change that default to get back to how it
was before, because xkcd #1172 is very real. Your options (a) and (c) do not
give users a way to tell mke2fs that yes, they *do* want their filesystem to
record how much was written to it during its creation. Even somebody who uses
SOURCE_DATE_EPOCH to get reproducible output might have this need. Now you
can argue "but users who want this will be very, very rare" and I will not
disagree, but then I think the need to set s_kbytes_written to zero because I
want reproducible images when creating an ext4 filesystem image on top of a
9p fs is also very, very rare. And then I look at option (b), which is not
nice, but isn't it okay to have a cumbersome option for people in very niche
situations?

> Related to this is the design question of whether SOURCE_DATE_EPOCH should
> imply using a fixed value for s_uuid and s_hash_seed. Again, it's a little
> weird to overload SOURCE_DATE_EPOCH to set the uuid and hash_seed to some
> fixed value, which might be a time-based UUID with the ethernet address set
> to all zeroes, or some other fixed value. But it's a pretty good proxy for
> what the user wants, and if this is the default, the user can always
> override it via an extended option if they really want something different.

Beware that generating a fitting uuid can quickly become not so fun anymore if
you want to follow the relevant RFC instead of "just making something up". I
had a talk with the reproducible builds people about this issue as I was
looking for prior art on how to turn a SOURCE_DATE_EPOCH into a predictable
uuid, and I was told that the proper way would be to first generate a version 5
uuid using a DNS name you control and then use that uuid as the namespace for
another uuid derived from SOURCE_DATE_EPOCH. It was then discussed whether the
reproducible builds team should formalize and document a method to turn a
SOURCE_DATE_EPOCH into a uuid, but it seems to be tricky to do right if one
wants to follow the relevant RFCs to the letter.
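
For what it's worth, the mechanics of that two-step derivation are simple
with libuuid; the hard part is the formalization. A sketch of the scheme
described above ("example.org" stands in for a DNS name you control, and
you would link with -luuid):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <uuid/uuid.h>

int main(void)
{
	const char *sde = getenv("SOURCE_DATE_EPOCH");
	if (sde == NULL) {
		fprintf(stderr, "SOURCE_DATE_EPOCH is not set\n");
		return 1;
	}
	/* the well-known RFC 4122 DNS namespace uuid */
	uuid_t ns_dns, ns_priv, out;
	uuid_parse("6ba7b810-9dad-11d1-80b4-00c04fd430c8", ns_dns);
	/* step 1: version 5 uuid from a DNS name you control */
	uuid_generate_sha1(ns_priv, ns_dns, "example.org", strlen("example.org"));
	/* step 2: use that uuid as the namespace for SOURCE_DATE_EPOCH */
	uuid_generate_sha1(out, ns_priv, sde, strlen(sde));
	char buf[37];
	uuid_unparse_lower(out, buf);
	puts(buf);
	return 0;
}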

> If it weren't for the fact that I'm considering having SOURCE_DATE_EPOCH
> provide default values for s_uuid and s_hash_seed, I'd be tempted to just
> unconditionally set s_kbytes_written to zero.
>
> I'm curious what your opinions might be on this, as someone who might
> want to use this feature.

Not only "might want to use this" but "actively using it":

https://tracker.debian.org/news/1529763/accepted-mmdebstrap-150-1-source-into-unstable/

As the mmdebstrap upstream author, I would have no problem with mke2fs setting
s_uuid and s_hash_seed to some reproducible value by default. As you said, any
user who doesn't like this can always run mke2fs manually and because
mmdebstrap writes tarballs to stdout, adding custom mke2fs options is really
easy:

mmdebstrap | mke2fs -d - -U $(uuidgen) -E hash_seed=$(uuidgen) ...

That being said, I'm not aware of anybody else requiring bit-by-bit
reproducible ext4 images. I never got bit-by-bit reproducible output to work
when using an unpacked filesystem directory as the source for mke2fs. But I'm
at MiniDebConf Berlin this week and just yesterday I met somebody from the
Debian Cloud team who said that they are interested in this functionality. I
shall make them aware of this thread today and maybe they have some further
input.

> > As an end-user I am very interested in keeping the functionality of mke2fs
> > which keeps track of which parts are actually sparse and which ones are
> > not. This functionality can be used with tools like "bmaptool" (a more
> > clever dd) to only copy those parts of the image to the flash drive which
> > are actually supposed to contain data.
> If the file system where the image is created supports either the FIEMAP
> ioctl or lseek's SEEK_HOLE, then "bmaptool create" can figure out which
> parts of the file are sparse, so we don't need to make any changes to
> e2fsprogs. If the file system doesn't support FIEMAP or SEEK_HOLE, one could
> imagine that bmaptool could figure out which parts of the file could be
> sparse simply by looking for blocks that are all zeroes. This is basically
> what "cp --sparse=always" does, and what the attached make-sparse.c file
> does to determine where the holes could be.
>
> Yes, I could imagine adding a new io_manager much like test_io and
> undo_io which tracked which blocks had been written, and then would
> write out a BMAP file. However, the vast majority of constructed file
> systems are quite small, so simply reading all of the blocks to
> determine which blocks were all zeroes à la cp --sparse=always isn't
> going to involve all that much overhead. And I'd argue the right thing
> to do would be to teach bmaptool how to do what cp --sparse=always
> does, so that the same interface works regardless of whether bmaptool
> is running on a modern file system that supports FIEMAP or SEEK_HOLE,
> or some legacy file system like FAT16 or FAT32.

Thank you for your make-sparse.c. I was wondering whether it is really as
simple as finding all 1024-byte blocks that are all zeros and then skipping
them with lseek, creating a "hole". I imagined there might be issues of the
following sort: when my filesystem is part of a disk image together with a
partition table, then depending on the offset at which the filesystem is
stored on that image, or on the expectations that ext4 or other filesystems
on that image have, there could be situations in which it really *is*
necessary to write 1024 consecutive zeroes to my flash drive. At the time I
was pondering this, I used fallocate --dig-holes to turn a disk image I had
into a sparse one, and this made fsck report problems in the filesystem that
it was not able to fix. I didn't investigate this further.

My need to "dig holes" did not come from the underlying filesystem being a dumb
one like fat or 9p but because it turns out that copying an ext4 image onto a
disk image with an offset while preserving its sparse-ness is not something dd
(or any similar utility I found) was able to do. So at the end of this mail is
the program I am now using to copy the output of mke2fs (which is sparse) onto
my disk image while preserving all the holes created by mke2fs exactly as
mke2fs decided they should be placed. After flashing this to a SD-Card which I
filled with random bytes before, the resulting system booted fine and fsck did
not report any issues. Assuming that the underlying filesystem is smart, I
imagine that simply preserving the holes is the safer option than digging new
ones.

Thanks!

cheers, josch




#define _GNU_SOURCE
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>


int main(int argc, char *argv[]) {
	if (argc != 3 && argc != 4) {
		fprintf(stderr, "Usage: %s infile outfile [offset]\n", argv[0]);
		exit(EXIT_FAILURE);
	}
	int infd = open(argv[1], O_RDONLY);
	if (infd == -1) {
		perror("open");
		exit(EXIT_FAILURE);
	}
	off64_t inlength = lseek64(infd, 0, SEEK_END);
	if (inlength == -1) {
		perror("lseek64");
		exit(EXIT_FAILURE);
	}
	/* O_CREAT requires a mode argument */
	int outfd = open(argv[2], O_CREAT | O_WRONLY, 0666);
	if (outfd == -1) {
		perror("open");
		exit(EXIT_FAILURE);
	}
	off64_t outlength = lseek64(outfd, 0, SEEK_END);
	if (outlength == -1) {
		perror("lseek64");
		exit(EXIT_FAILURE);
	}
	long long offset = 0;
	if (argc == 4) {
		errno = 0; /* strtoll only sets errno on failure */
		offset = strtoll(argv[3], NULL, 10);
		if (errno != 0) {
			perror("strtoll");
			exit(EXIT_FAILURE);
		}
	}
	off64_t curr = 0;
	while (true) {
		/* find the start of the next data extent; failure means
		 * there is no more data until the end of the file */
		off64_t data = lseek64(infd, curr, SEEK_DATA);
		if (data == -1) {
			break;
		}
		/* find where this data extent ends */
		off64_t hole = lseek64(infd, data, SEEK_HOLE);
		if (hole == -1) {
			hole = inlength;
		}
		/* copy only the data extent, shifted by the requested offset */
		off64_t off_out = data + offset;
		ssize_t ret = copy_file_range(infd, &data, outfd, &off_out, hole - data, 0);
		if (ret == -1) {
			perror("copy_file_range");
			exit(EXIT_FAILURE);
		}
		curr = hole;
	}
	/* grow the output file if needed so a trailing hole is preserved */
	if (outlength < inlength + offset) {
		int ret = ftruncate(outfd, inlength + offset);
		if (ret == -1) {
			perror("ftruncate");
			exit(EXIT_FAILURE);
		}
	}
	close(infd);
	close(outfd);
	exit(EXIT_SUCCESS);
}

