Hi,
This patch gives userspace the ability to prevent the underlying file of a
loop device from becoming sparse. Currently, if a loop device is 'trimmed'
(BLKDISCARD), the underlying file always becomes sparse and its disk
space is freed. This behaviour is good for thin provisioning but not good
for preallocated disk images. This patch introduces the LO_FLAGS_NO_DEALLOC
flag, which writes zeroes to the underlying file instead of punching
holes when BLKDISCARD is requested, thus reducing file fragmentation of
preallocated disk images and improving performance.
I will also submit patches to util-linux to provide userspace support if
this patch is merged.
Changes in V1->V2:
Renamed NODEALLOC to NO_DEALLOC, to avoid confusion between NO_DEALLOC
and NODE_ALLOC. Suggested by Jens Axboe.
Best Regards,
Zhang Boyang
Previously, for file-backed loop devices, REQ_OP_DISCARD and
REQ_OP_WRITE_ZEROES (without REQ_NOUNMAP) are implemented using
fallocate(FALLOC_FL_PUNCH_HOLE), which will cause the underlying file to
be sparse and disk space freed. Users have no way to prevent this
from happening.
This patch introduces LO_FLAGS_NO_DEALLOC. With this flag set,
REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES are forced to use
fallocate(FALLOC_FL_ZERO_RANGE). The disk space of underlying file is
kept allocated. This is useful if users, for example, want to use a
preallocated file as the backing file.
Signed-off-by: Zhang Boyang <[email protected]>
---
drivers/block/loop.c | 17 +++++++++++++++--
include/uapi/linux/loop.h | 15 +++++++++++----
2 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 084f9b8a0ba3..36bd9906a154 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -483,11 +483,15 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
* write zeroes the range. Otherwise, punch them out.
*/
return lo_fallocate(lo, rq, pos,
- (rq->cmd_flags & REQ_NOUNMAP) ?
+ ((rq->cmd_flags & REQ_NOUNMAP) ||
+ (lo->lo_flags & LO_FLAGS_NO_DEALLOC)) ?
FALLOC_FL_ZERO_RANGE :
FALLOC_FL_PUNCH_HOLE);
case REQ_OP_DISCARD:
- return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
+ return lo_fallocate(lo, rq, pos,
+ (lo->lo_flags & LO_FLAGS_NO_DEALLOC) ?
+ FALLOC_FL_ZERO_RANGE :
+ FALLOC_FL_PUNCH_HOLE);
case REQ_OP_WRITE:
if (cmd->use_aio)
return lo_rw_aio(lo, cmd, pos, WRITE);
@@ -719,12 +723,20 @@ static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
return sysfs_emit(buf, "%s\n", dio ? "1" : "0");
}
+static ssize_t loop_attr_no_dealloc_show(struct loop_device *lo, char *buf)
+{
+ int no_dealloc = (lo->lo_flags & LO_FLAGS_NO_DEALLOC);
+
+ return sysfs_emit(buf, "%s\n", no_dealloc ? "1" : "0");
+}
+
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
LOOP_ATTR_RO(partscan);
LOOP_ATTR_RO(dio);
+LOOP_ATTR_RO(no_dealloc);
static struct attribute *loop_attrs[] = {
&loop_attr_backing_file.attr,
@@ -733,6 +745,7 @@ static struct attribute *loop_attrs[] = {
&loop_attr_autoclear.attr,
&loop_attr_partscan.attr,
&loop_attr_dio.attr,
+ &loop_attr_no_dealloc.attr,
NULL,
};
diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h
index 6f63527dd2ed..91a0a8b1f298 100644
--- a/include/uapi/linux/loop.h
+++ b/include/uapi/linux/loop.h
@@ -18,17 +18,24 @@ enum {
LO_FLAGS_AUTOCLEAR = 4,
LO_FLAGS_PARTSCAN = 8,
LO_FLAGS_DIRECT_IO = 16,
+ LO_FLAGS_NO_DEALLOC = 32,
};
/* LO_FLAGS that can be set using LOOP_SET_STATUS(64) */
-#define LOOP_SET_STATUS_SETTABLE_FLAGS (LO_FLAGS_AUTOCLEAR | LO_FLAGS_PARTSCAN)
+#define LOOP_SET_STATUS_SETTABLE_FLAGS (LO_FLAGS_AUTOCLEAR \
+ | LO_FLAGS_PARTSCAN \
+ | LO_FLAGS_NO_DEALLOC)
/* LO_FLAGS that can be cleared using LOOP_SET_STATUS(64) */
-#define LOOP_SET_STATUS_CLEARABLE_FLAGS (LO_FLAGS_AUTOCLEAR)
+#define LOOP_SET_STATUS_CLEARABLE_FLAGS (LO_FLAGS_AUTOCLEAR \
+ | LO_FLAGS_NO_DEALLOC)
/* LO_FLAGS that can be set using LOOP_CONFIGURE */
-#define LOOP_CONFIGURE_SETTABLE_FLAGS (LO_FLAGS_READ_ONLY | LO_FLAGS_AUTOCLEAR \
- | LO_FLAGS_PARTSCAN | LO_FLAGS_DIRECT_IO)
+#define LOOP_CONFIGURE_SETTABLE_FLAGS (LO_FLAGS_READ_ONLY \
+ | LO_FLAGS_AUTOCLEAR \
+ | LO_FLAGS_PARTSCAN \
+ | LO_FLAGS_DIRECT_IO \
+ | LO_FLAGS_NO_DEALLOC)
#include <asm/posix_types.h> /* for __kernel_old_dev_t */
#include <linux/types.h> /* for __u64 */
--
2.30.2
On Sat, Aug 06, 2022 at 11:30:22PM +0800, Zhang Boyang wrote:
> Previously, for file-backed loop devices, REQ_OP_DISCARD and
> REQ_OP_WRITE_ZEROES (without REQ_NOUNMAP) are implemented using
> fallocate(FALLOC_FL_PUNCH_HOLE), which will cause the underlying file to
> be sparse and disk space freed. The users have no choice to prevent this
> this from happening.
>
> This patch introduces LO_FLAGS_NO_DEALLOC. With this flag set,
> REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES are forced to use
> fallocate(FALLOC_FL_ZERO_RANGE). The disk space of underlying file is
> kept allocated. This is useful if users, for example, want to use a
> preallocated file as the backing file.
Considering that discard isn't required to do anything, why not
echo 0 | sudo tee /sys/block/loopX/queue/discard_max_bytes ?
--D
> Signed-off-by: Zhang Boyang <[email protected]>
> ---
> drivers/block/loop.c | 17 +++++++++++++++--
> include/uapi/linux/loop.h | 15 +++++++++++----
> 2 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 084f9b8a0ba3..36bd9906a154 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -483,11 +483,15 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
> * write zeroes the range. Otherwise, punch them out.
> */
> return lo_fallocate(lo, rq, pos,
> - (rq->cmd_flags & REQ_NOUNMAP) ?
> + ((rq->cmd_flags & REQ_NOUNMAP) ||
> + (lo->lo_flags & LO_FLAGS_NO_DEALLOC)) ?
> FALLOC_FL_ZERO_RANGE :
> FALLOC_FL_PUNCH_HOLE);
> case REQ_OP_DISCARD:
> - return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
> + return lo_fallocate(lo, rq, pos,
> + (lo->lo_flags & LO_FLAGS_NO_DEALLOC) ?
> + FALLOC_FL_ZERO_RANGE :
> + FALLOC_FL_PUNCH_HOLE);
> case REQ_OP_WRITE:
> if (cmd->use_aio)
> return lo_rw_aio(lo, cmd, pos, WRITE);
> @@ -719,12 +723,20 @@ static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
> return sysfs_emit(buf, "%s\n", dio ? "1" : "0");
> }
>
> +static ssize_t loop_attr_no_dealloc_show(struct loop_device *lo, char *buf)
> +{
> + int no_dealloc = (lo->lo_flags & LO_FLAGS_NO_DEALLOC);
> +
> + return sysfs_emit(buf, "%s\n", no_dealloc ? "1" : "0");
> +}
> +
> LOOP_ATTR_RO(backing_file);
> LOOP_ATTR_RO(offset);
> LOOP_ATTR_RO(sizelimit);
> LOOP_ATTR_RO(autoclear);
> LOOP_ATTR_RO(partscan);
> LOOP_ATTR_RO(dio);
> +LOOP_ATTR_RO(no_dealloc);
>
> static struct attribute *loop_attrs[] = {
> &loop_attr_backing_file.attr,
> @@ -733,6 +745,7 @@ static struct attribute *loop_attrs[] = {
> &loop_attr_autoclear.attr,
> &loop_attr_partscan.attr,
> &loop_attr_dio.attr,
> + &loop_attr_no_dealloc.attr,
> NULL,
> };
>
> diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h
> index 6f63527dd2ed..91a0a8b1f298 100644
> --- a/include/uapi/linux/loop.h
> +++ b/include/uapi/linux/loop.h
> @@ -18,17 +18,24 @@ enum {
> LO_FLAGS_AUTOCLEAR = 4,
> LO_FLAGS_PARTSCAN = 8,
> LO_FLAGS_DIRECT_IO = 16,
> + LO_FLAGS_NO_DEALLOC = 32,
> };
>
> /* LO_FLAGS that can be set using LOOP_SET_STATUS(64) */
> -#define LOOP_SET_STATUS_SETTABLE_FLAGS (LO_FLAGS_AUTOCLEAR | LO_FLAGS_PARTSCAN)
> +#define LOOP_SET_STATUS_SETTABLE_FLAGS (LO_FLAGS_AUTOCLEAR \
> + | LO_FLAGS_PARTSCAN \
> + | LO_FLAGS_NO_DEALLOC)
>
> /* LO_FLAGS that can be cleared using LOOP_SET_STATUS(64) */
> -#define LOOP_SET_STATUS_CLEARABLE_FLAGS (LO_FLAGS_AUTOCLEAR)
> +#define LOOP_SET_STATUS_CLEARABLE_FLAGS (LO_FLAGS_AUTOCLEAR \
> + | LO_FLAGS_NO_DEALLOC)
>
> /* LO_FLAGS that can be set using LOOP_CONFIGURE */
> -#define LOOP_CONFIGURE_SETTABLE_FLAGS (LO_FLAGS_READ_ONLY | LO_FLAGS_AUTOCLEAR \
> - | LO_FLAGS_PARTSCAN | LO_FLAGS_DIRECT_IO)
> +#define LOOP_CONFIGURE_SETTABLE_FLAGS (LO_FLAGS_READ_ONLY \
> + | LO_FLAGS_AUTOCLEAR \
> + | LO_FLAGS_PARTSCAN \
> + | LO_FLAGS_DIRECT_IO \
> + | LO_FLAGS_NO_DEALLOC)
>
> #include <asm/posix_types.h> /* for __kernel_old_dev_t */
> #include <linux/types.h> /* for __u64 */
> --
> 2.30.2
>
Hi,
On 2022/8/10 06:19, Darrick J. Wong wrote:
>
> Considering that discard isn't required to do anything, why not
> echo 0 | sudo tee /sys/block/loopX/queue/discard_max_bytes ?
>
Thanks for reviewing! This will disable discard completely (>=5.19),
so the filesystem holding the backing file has no knowledge of what can
be freed. In contrast, my patch converts REQ_OP_DISCARD to
FALLOC_FL_ZERO_RANGE, so the discarded parts of the backing file are
flagged as zeroed. This makes it possible for the filesystem holding the
backing file to discard (trim) the zeroed range, improving write
performance on, for example, SSDs.
However, it seems only XFS with realtime discard enabled can trim on
FALLOC_FL_ZERO_RANGE. All other filesystems (and XFS without realtime
discard) can't trim extents flagged as zeroed. Batch discard such as
FITRIM (used by the `fstrim' tool) also can't help here, because no
filesystem tracks `allocated but flagged zeroed' extents at the
filesystem level. I will probably write another patch to add to FITRIM
the ability to trim zeroed extents in a single file. (Currently, FITRIM
works at the filesystem level, not the file level.)
Best Regards,
Zhang Boyang
On 2022/8/6 23:30, Zhang Boyang wrote:
> I will also submit patches to util-linux to provide userspace support if
> this patch is merged.
For anyone who wants to test the LO_FLAGS_NO_DEALLOC feature, the
userspace support patch for util-linux is available at
https://github.com/zhangboyang/util-linux/tree/losetup-nodealloc-202207
Below is an example typescript. In the example, a 10G test file is
created and used as the backing file of /dev/loop0. Then mkfs.ext4
discards most blocks in /dev/loop0. As you can see in the output of `du'
and `filefrag', the extents are neither freed nor relocated, thus file
fragmentation is avoided. (The filesystem holding the backing file is
ext4. Things are different on XFS or Btrfs, which tend to allocate new
extents elsewhere when doing FALLOC_FL_ZERO_RANGE.)
Best Regards,
Zhang Boyang
root@debian:~# fallocate -l 10G /data/test
root@debian:~# du -sh /data/test
11G /data/test
root@debian:~# filefrag -v /data/test
Filesystem type is: ef53
File size of /data/test is 10737418240 (2621440 blocks of 4096 bytes)
ext: logical_offset: physical_offset: length: expected:
flags:
0: 0.. 32767: 37715968.. 37748735: 32768:
unwritten
1: 32768.. 524287: 37781504.. 38273023: 491520: 37748736:
unwritten
2: 524288.. 1015807: 38305792.. 38797311: 491520: 38273024:
unwritten
3: 1015808.. 1507327: 38830080.. 39321599: 491520: 38797312:
unwritten
4: 1507328.. 1998847: 39354368.. 39845887: 491520: 39321600:
unwritten
5: 1998848.. 2490367: 39878656.. 40370175: 491520: 39845888:
unwritten
6: 2490368.. 2621439: 40402944.. 40534015: 131072: 40370176:
last,unwritten,eof
/data/test: 7 extents found
root@debian:~# ./losetup -f --no-dealloc /data/test
root@debian:~# ./losetup
NAME SIZELIMIT OFFSET AUTOCLEAR RO BACK-FILE DIO NO-DEALLOC LOG-SEC
/dev/loop0 0 0 0 0 /data/test 0 1 512
root@debian:~# mkfs.ext4 /dev/loop0
mke2fs 1.46.2 (28-Feb-2021)
Discarding device blocks: done
Creating filesystem with 2621440 4k blocks and 655360 inodes
Filesystem UUID: 4abd9157-6e40-4842-bb33-e6807fc5c231
Superblock backups stored on blocks:
32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632
Allocating group tables: done
Writing inode tables: done
Creating journal (16384 blocks): done
Writing superblocks and filesystem accounting information: done
root@debian:~# ./losetup -d /dev/loop0
root@debian:~# du -sh /data/test
11G /data/test
root@debian:~# filefrag -v /data/test
Filesystem type is: ef53
File size of /data/test is 10737418240 (2621440 blocks of 4096 bytes)
ext: logical_offset: physical_offset: length: expected:
flags:
0: 0.. 1027: 37715968.. 37716995: 1028:
1: 1028.. 1042: 37716996.. 37717010: 15:
unwritten
2: 1043.. 1043: 37717011.. 37717011: 1:
3: 1044.. 1058: 37717012.. 37717026: 15:
unwritten
4: 1059.. 1059: 37717027.. 37717027: 1:
5: 1060.. 9250: 37717028.. 37725218: 8191:
unwritten
6: 9251.. 9256: 37725219.. 37725224: 6:
7: 9257.. 32767: 37725225.. 37748735: 23511:
unwritten
8: 32768.. 32770: 37781504.. 37781506: 3: 37748736:
9: 32771.. 98303: 37781507.. 37847039: 65533:
unwritten
10: 98304.. 98306: 37847040.. 37847042: 3:
11: 98307.. 163839: 37847043.. 37912575: 65533:
unwritten
12: 163840.. 163842: 37912576.. 37912578: 3:
13: 163843.. 229375: 37912579.. 37978111: 65533:
unwritten
14: 229376.. 229378: 37978112.. 37978114: 3:
15: 229379.. 294911: 37978115.. 38043647: 65533:
unwritten
16: 294912.. 294914: 38043648.. 38043650: 3:
17: 294915.. 524287: 38043651.. 38273023: 229373:
unwritten
18: 524288.. 524288: 38305792.. 38305792: 1: 38273024:
19: 524289.. 819199: 38305793.. 38600703: 294911:
unwritten
20: 819200.. 819202: 38600704.. 38600706: 3:
21: 819203.. 884735: 38600707.. 38666239: 65533:
unwritten
22: 884736.. 884738: 38666240.. 38666242: 3:
23: 884739.. 1015807: 38666243.. 38797311: 131069:
unwritten
24: 1015808.. 1048575: 38830080.. 38862847: 32768: 38797312:
unwritten
25: 1048576.. 1048577: 38862848.. 38862849: 2:
26: 1048578.. 1081343: 38862850.. 38895615: 32766:
unwritten
27: 1081344.. 1081344: 38895616.. 38895616: 1:
28: 1081345.. 1507327: 38895617.. 39321599: 425983:
unwritten
29: 1507328.. 1572863: 39354368.. 39419903: 65536: 39321600:
unwritten
30: 1572864.. 1572864: 39419904.. 39419904: 1:
31: 1572865.. 1605631: 39419905.. 39452671: 32767:
unwritten
32: 1605632.. 1605634: 39452672.. 39452674: 3:
33: 1605635.. 1998847: 39452675.. 39845887: 393213:
unwritten
34: 1998848.. 2097151: 39878656.. 39976959: 98304: 39845888:
unwritten
35: 2097152.. 2097152: 39976960.. 39976960: 1:
36: 2097153.. 2097166: 39976961.. 39976974: 14:
unwritten
37: 2097167.. 2097167: 39976975.. 39976975: 1:
38: 2097168.. 2490367: 39976976.. 40370175: 393200:
unwritten
39: 2490368.. 2621439: 40402944.. 40534015: 131072: 40370176:
last,unwritten,eof
/data/test: 7 extents found