From: Allison Henderson <[email protected]>
Add support for fallocate2 ioctl, which is xfs' own version of fallocate.
Struct xfs_fallocate2 is passed in the ioctl, and xfs_fallocate2.alignment
allows the user to specify required extent alignment. This is key for
atomic write support, as we expect extents to be aligned on
atomic_write_unit_max boundaries.
The alignment flag is not sticky, so further extent mutation will not
obey this original alignment request. In addition, extent lengths should
always be a multiple of atomic_write_unit_max, which they are not yet. So
this really just works for scenarios when we were lucky enough to get a
single extent.
The following is sample usage and c code:
mkfs.xfs -f /dev/sda
mount /dev/sda mnt
xfs_fallocate2 mnt/test_file1.img 0 20971520 262144
filefrag -v mnt/test_file1.img
xfs_fallocate2.c
struct xfs_fallocate2 {
int64_t offset; /* bytes */
int64_t length; /* bytes */
uint64_t flags;
uint32_t alignment; /* bytes */
uint32_t padding[9];
};
int main(int argc, char **argv) {
char *file;
int fd, ret;
struct xfs_fallocate2 fa = {};
if (argc != 5) {
printf("expected 5 arguments\n");
exit(0);
}
argv++;
file = *argv;
argv++;
fa.offset = atoi(*argv);
argv++;
fa.length = atoi(*argv);
argv++;
fa.alignment = atoi(*argv);
argv++;
if (fa.alignment)
fa.flags = XFS_FALLOC2_ALIGNED;
fd = open(file, O_RDWR | O_CREAT, 0600);
if (fd < 0)
exit(0);
ret = ioctl(fd, XFS_IOC_FALLOCATE2, &fa);
close(fd);
return ret;
}
Signed-off-by: Allison Henderson <[email protected]>
Signed-off-by: Catherine Hoang <[email protected]>
Signed-off-by: John Garry <[email protected]>
---
fs/xfs/Makefile | 1 +
fs/xfs/libxfs/xfs_attr_remote.c | 2 +-
fs/xfs/libxfs/xfs_bmap.c | 9 ++-
fs/xfs/libxfs/xfs_bmap.h | 4 +-
fs/xfs/libxfs/xfs_da_btree.c | 4 +-
fs/xfs/libxfs/xfs_fs.h | 1 +
fs/xfs/xfs_bmap_util.c | 7 ++-
fs/xfs/xfs_bmap_util.h | 2 +-
fs/xfs/xfs_dquot.c | 2 +-
fs/xfs/xfs_file.c | 19 +++++--
fs/xfs/xfs_fs_staging.c | 99 +++++++++++++++++++++++++++++++++
fs/xfs/xfs_fs_staging.h | 21 +++++++
fs/xfs/xfs_ioctl.c | 4 ++
fs/xfs/xfs_iomap.c | 4 +-
fs/xfs/xfs_reflink.c | 4 +-
fs/xfs/xfs_rtalloc.c | 2 +-
fs/xfs/xfs_symlink.c | 2 +-
security/security.c | 1 +
18 files changed, 168 insertions(+), 20 deletions(-)
create mode 100644 fs/xfs/xfs_fs_staging.c
create mode 100644 fs/xfs/xfs_fs_staging.h
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 92d88dc3c9f7..9b413544d358 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -93,6 +93,7 @@ xfs-y += xfs_aops.o \
xfs_sysfs.o \
xfs_trans.o \
xfs_xattr.o \
+ xfs_fs_staging.o \
kmem.o
# low-level transaction/log code
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..c5f190fef1b5 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -615,7 +615,7 @@ xfs_attr_rmtval_set_blk(
error = xfs_bmapi_write(args->trans, dp,
(xfs_fileoff_t)attr->xattri_lblkno,
attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
- map, &nmap);
+ map, &nmap, 0);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 34de6e6898c4..52a6e2b61228 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3275,7 +3275,9 @@ xfs_bmap_compute_alignments(
struct xfs_alloc_arg *args)
{
struct xfs_mount *mp = args->mp;
- xfs_extlen_t align = 0; /* minimum allocation alignment */
+
+ /* minimum allocation alignment */
+ xfs_extlen_t align = args->alignment;
int stripe_align = 0;
/* stripe alignment for allocation is determined by mount parameters */
@@ -3652,6 +3654,7 @@ xfs_bmap_btalloc(
.datatype = ap->datatype,
.alignment = 1,
.minalignslop = 0,
+ .alignment = ap->align,
};
xfs_fileoff_t orig_offset;
xfs_extlen_t orig_length;
@@ -4279,12 +4282,14 @@ xfs_bmapi_write(
uint32_t flags, /* XFS_BMAPI_... */
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
- int *nmap) /* i/o: mval size/count */
+ int *nmap,
+ xfs_extlen_t align) /* i/o: mval size/count */
{
struct xfs_bmalloca bma = {
.tp = tp,
.ip = ip,
.total = total,
+ .align = align,
};
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index dd08361ca5a6..0573dfc5fa6b 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -26,6 +26,7 @@ struct xfs_bmalloca {
xfs_fileoff_t offset; /* offset in file filling in */
xfs_extlen_t length; /* i/o length asked/allocated */
xfs_fsblock_t blkno; /* starting block of new extent */
+ xfs_extlen_t align;
struct xfs_btree_cur *cur; /* btree cursor */
struct xfs_iext_cursor icur; /* incore extent cursor */
@@ -189,7 +190,8 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
int *nmap, uint32_t flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
- xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
+ xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap,
+ xfs_extlen_t align);
int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
xfs_extnum_t nexts);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e576560b46e9..e6581254092f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2174,7 +2174,7 @@ xfs_da_grow_inode_int(
nmap = 1;
error = xfs_bmapi_write(tp, dp, *bno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
- args->total, &map, &nmap);
+ args->total, &map, &nmap, 0);
if (error)
return error;
@@ -2196,7 +2196,7 @@ xfs_da_grow_inode_int(
nmap = min(XFS_BMAP_MAX_NMAP, c);
error = xfs_bmapi_write(tp, dp, b, c,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
- args->total, &mapp[mapi], &nmap);
+ args->total, &mapp[mapi], &nmap, 0);
if (error)
goto out_free_map;
if (nmap < 1)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 1cfd5bc6520a..829316ca01ea 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -831,6 +831,7 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
+#define XFS_IOC_FALLOCATE2 _IOR ('X', 129, struct xfs_fallocate2)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a09dd2606479..a0c55af6f051 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -776,10 +776,12 @@ int
xfs_alloc_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ xfs_off_t align)
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
+ xfs_filblks_t align_fsb;
xfs_filblks_t allocated_fsb;
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
@@ -811,6 +813,7 @@ xfs_alloc_file_space(
nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+ align_fsb = XFS_B_TO_FSB(mp, align);
allocatesize_fsb = endoffset_fsb - startoffset_fsb;
/*
@@ -872,7 +875,7 @@ xfs_alloc_file_space(
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
- &nimaps);
+ &nimaps, align_fsb);
if (error)
goto error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 6888078f5c31..476f610ad617 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, xfs_off_t align);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8fb90da89787..475e1a56d1b0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -328,7 +328,7 @@ xfs_dquot_disk_alloc(
/* Create the block mapping. */
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
- &nmaps);
+ &nmaps, 0);
if (error)
goto err_cancel;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 705250f9f90a..9b1db42a8d33 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -883,12 +883,13 @@ static inline bool xfs_file_sync_writes(struct file *filp)
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
-STATIC long
-xfs_file_fallocate(
+long
+_xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ loff_t alignment)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1035,7 +1036,7 @@ xfs_file_fallocate(
}
if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len);
+ error = xfs_alloc_file_space(ip, offset, len, alignment);
if (error)
goto out_unlock;
}
@@ -1073,6 +1074,16 @@ xfs_file_fallocate(
return error;
}
+STATIC long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ return _xfs_file_fallocate(file, mode, offset, len, 0);
+}
+
STATIC int
xfs_file_fadvise(
struct file *file,
diff --git a/fs/xfs/xfs_fs_staging.c b/fs/xfs/xfs_fs_staging.c
new file mode 100644
index 000000000000..1d635c0a9f49
--- /dev/null
+++ b/fs/xfs/xfs_fs_staging.c
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs_staging.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+
+#include "linux/security.h"
+#include "linux/fsnotify.h"
+
+extern long _xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ loff_t alignment);
+
+int xfs_fallocate2( struct file *filp,
+ void __user *arg)
+{
+ struct inode *inode = file_inode(filp);
+ //struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_fallocate2 fallocate2;
+ int ret;
+
+ if (copy_from_user(&fallocate2, arg, sizeof(fallocate2)))
+ return -EFAULT;
+
+ if (fallocate2.flags & XFS_FALLOC2_ALIGNED) {
+ if (!fallocate2.alignment || !is_power_of_2(fallocate2.alignment))
+ return -EINVAL;
+
+ if (fallocate2.offset % fallocate2.alignment)
+ return -EINVAL;
+
+ if (fallocate2.length % fallocate2.alignment)
+ return -EINVAL;
+ } else if (fallocate2.alignment) {
+ return -EINVAL;
+ }
+
+ /* These are all just copied from vfs_fallocate() */
+ if (fallocate2.offset < 0 || fallocate2.length <= 0)
+ return -EINVAL;
+
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ /*
+ * We cannot allow any fallocate operation on an active swapfile
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ /*
+ * Revalidate the write permissions, in case security policy has
+ * changed since the files were opened.
+ */
+ ret = security_file_permission(filp, MAY_WRITE);
+ if (ret)
+ return ret;
+
+ if (S_ISFIFO(inode->i_mode))
+ return -ESPIPE;
+
+ if (S_ISDIR(inode->i_mode))
+ return -EISDIR;
+
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -ENODEV;
+
+ /* Check for wrap through zero too */
+ if (((fallocate2.offset + fallocate2.length) > inode->i_sb->s_maxbytes) ||
+ ((fallocate2.offset + fallocate2.length) < 0))
+ return -EFBIG;
+
+ if (!filp->f_op->fallocate)
+ return -EOPNOTSUPP;
+
+ file_start_write(filp);
+ ret = _xfs_file_fallocate(filp, 0, fallocate2.offset, fallocate2.length, fallocate2.alignment);
+
+ if (ret == 0)
+ fsnotify_modify(filp);
+
+ file_end_write(filp);
+
+ return ret;
+}
diff --git a/fs/xfs/xfs_fs_staging.h b/fs/xfs/xfs_fs_staging.h
new file mode 100644
index 000000000000..a82e61063dba
--- /dev/null
+++ b/fs/xfs/xfs_fs_staging.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ */
+#ifndef __XFS_FS_STAGING_H__
+#define __XFS_FS_STAGING_H__
+
+struct xfs_fallocate2 {
+ s64 offset; /* bytes */
+ s64 length; /* bytes */
+ u64 flags;
+ u32 alignment; /* bytes */
+ u32 padding[8];
+};
+
+#define XFS_FALLOC2_ALIGNED (1U << 0)
+
+int xfs_fallocate2( struct file *filp,
+ void __user *arg);
+
+#endif /* __XFS_FS_STAGING_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 55bb01173cde..6e60fce44068 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -4,6 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_fs_staging.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
@@ -2149,6 +2150,9 @@ xfs_file_ioctl(
return error;
}
+ case XFS_IOC_FALLOCATE2:
+ return xfs_fallocate2(filp, arg);
+
default:
return -ENOTTY;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 285885c308bd..a4389a0c4bf2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -306,7 +306,7 @@ xfs_iomap_write_direct(
*/
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
- imap, &nimaps);
+ imap, &nimaps, 0);
if (error)
goto out_trans_cancel;
@@ -614,7 +614,7 @@ xfs_iomap_write_unwritten(
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
XFS_BMAPI_CONVERT, resblks, &imap,
- &nimaps);
+ &nimaps, 0);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f5dc46ce9803..a2e5ba6cf7f3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -420,7 +420,7 @@ xfs_reflink_fill_cow_hole(
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
- &nimaps);
+ &nimaps, 0);
if (error)
goto out_trans_cancel;
@@ -490,7 +490,7 @@ xfs_reflink_fill_delalloc(
error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
cmap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
- cmap, &nimaps);
+ cmap, &nimaps, 0);
if (error)
goto out_trans_cancel;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 16534e9873f6..a57a8a4d8294 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -817,7 +817,7 @@ xfs_growfs_rt_alloc(
*/
nmap = 1;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
- XFS_BMAPI_METADATA, 0, &map, &nmap);
+ XFS_BMAPI_METADATA, 0, &map, &nmap, 0);
if (!error && nmap < 1)
error = -ENOSPC;
if (error)
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 85e433df6a3f..2a4524bf34a5 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -269,7 +269,7 @@ xfs_symlink(
nmaps = XFS_SYMLINK_MAPS;
error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
- XFS_BMAPI_METADATA, resblks, mval, &nmaps);
+ XFS_BMAPI_METADATA, resblks, mval, &nmaps, 0);
if (error)
goto out_trans_cancel;
diff --git a/security/security.c b/security/security.c
index cf6cc576736f..d53b1b6c2d59 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1593,6 +1593,7 @@ int security_file_permission(struct file *file, int mask)
return fsnotify_perm(file, mask);
}
+EXPORT_SYMBOL(security_file_permission);
int security_file_alloc(struct file *file)
{
--
2.31.1
On Wed, May 03, 2023 at 06:38:17PM +0000, John Garry wrote:
> From: Allison Henderson <[email protected]>
>
> Add support for fallocate2 ioctl, which is xfs' own version of fallocate.
> Struct xfs_fallocate2 is passed in the ioctl, and xfs_fallocate2.alignment
> allows the user to specify required extent alignment. This is key for
> atomic write support, as we expect extents to be aligned on
> atomic_write_unit_max boundaries.
This approach of adding filesystem specific ioctls for minor behavioural
modifiers to existing syscalls is not a sustainable development
model.
If we want fallocate() operations to apply filesystem atomic write
constraints to operations, then add a new modifier flag to
fallocate(), say FALLOC_FL_ATOMIC. The filesystem can then
look up it's atomic write alignment constraints and apply them to
the operation being performed appropriately.
> The alignment flag is not sticky, so further extent mutation will not
> obey this original alignment request.
IOWs, you want the specific allocation to behave exactly as if an
extent size hint of the given alignment had been set on that inode.
Which could be done with:
ioctl(FS_IOC_FSGETXATTR, &fsx)
old_extsize = fsx.fsx_extsize;
fsx.fsx_extsize = atomic_align_size;
ioctl(FS_IOC_FSSETXATTR, &fsx)
fallocate(....)
fsx.fsx_extsize = old_extsize;
ioctl(FS_IOC_FSSETXATTR, &fsx)
Yeah, messy, but if an application is going to use atomic writes,
then setting an extent size hint of the atomic write granularity the
application will use at file create time makes a whole lot of sense.
This will largely guarantee that any allocation will be aligned to
atomic IO constraints even when non atomic IO operations are
performed on that inode. Hence when the application needs to do an
atomic IO, it's not going to fail because previous allocation was
not correctly aligned.
All that we'd then need to do for atomic IO is ensure that we fail
the allocation early if we can't allocate fully sized and aligned
extents rather than falling back to unaligned extents when there are
no large enough contiguous free spaces for aligned extents to be
allocated. i.e. when RWF_ATOMIC or FALLOC_FL_ATOMIC are set by the
application...
> In addition, extent lengths should
> always be a multiple of atomic_write_unit_max,
Yup, that's what extent size hint based allocation does - it rounds
both down and up to hint alignment...
....
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 34de6e6898c4..52a6e2b61228 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -3275,7 +3275,9 @@ xfs_bmap_compute_alignments(
> struct xfs_alloc_arg *args)
> {
> struct xfs_mount *mp = args->mp;
> - xfs_extlen_t align = 0; /* minimum allocation alignment */
> +
> + /* minimum allocation alignment */
> + xfs_extlen_t align = args->alignment;
> int stripe_align = 0;
This doesn't do what you think it should. For one, it will get
overwritten by extent size hints that are set, hence the user will
not get the alignment they expected in that case.
Secondly, args->alignment is an internal alignment control for
stripe alignment used later in the allocator when doing file
extenstion allocations. Overloading it to pass a user alignment
here means that initial data allocations will have alignments set
without actually having set up the allocator parameters for aligned
allocation correctly.
This will lead to unexpected allocation failure as the filesystem
fills as the reservations needed for allocation to succeed won't
match what is actually required for allocation to succeed. It will
also cause problematic behaviour for fallback allocation algorithms
that expect only to be called with args->alignment = 1...
> /* stripe alignment for allocation is determined by mount parameters */
> @@ -3652,6 +3654,7 @@ xfs_bmap_btalloc(
> .datatype = ap->datatype,
> .alignment = 1,
> .minalignslop = 0,
> + .alignment = ap->align,
> };
> xfs_fileoff_t orig_offset;
> xfs_extlen_t orig_length;
> @@ -4279,12 +4282,14 @@ xfs_bmapi_write(
> uint32_t flags, /* XFS_BMAPI_... */
> xfs_extlen_t total, /* total blocks needed */
> struct xfs_bmbt_irec *mval, /* output: map values */
> - int *nmap) /* i/o: mval size/count */
> + int *nmap,
> + xfs_extlen_t align) /* i/o: mval size/count */
As per above - IMO this is not the right way to specify aligment for
atomic IO. A XFS_BMAPI_ATOMIC flag is probably the right thing to
add from the caller - this also communicates the specific allocation
failure behaviour required, too.
Then xfs_bmap_compute_alignments() can pull the alignment
from the relevant buftarg similar to how it already pulls preset
alignments for extent size hints and/or realtime devices. And then
the allocator can attempt exact aligned allocation for maxlen, then
if that fails an exact aligned allocation for minlen, and if both of
those fail then we return ENOSPC without attempting any unaligned
allocations...
This also gets rid of the need to pass another parameter to
xfs_bmapi_write(), and it's trivial to plumb into the XFS iomap and
fallocate code paths....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Thu, May 04, 2023 at 09:26:16AM +1000, Dave Chinner wrote:
> On Wed, May 03, 2023 at 06:38:17PM +0000, John Garry wrote:
> > From: Allison Henderson <[email protected]>
> >
> > Add support for fallocate2 ioctl, which is xfs' own version of fallocate.
> > Struct xfs_fallocate2 is passed in the ioctl, and xfs_fallocate2.alignment
> > allows the user to specify required extent alignment. This is key for
> > atomic write support, as we expect extents to be aligned on
> > atomic_write_unit_max boundaries.
>
> This approach of adding filesystem specific ioctls for minor behavioural
> modifiers to existing syscalls is not a sustainable development
> model.
To be fair to John and Allison, I told them to shove all the new UAPI
bits into a xfs_fs_staging.h because of that conversation you and
Catherine and I had a month or two ago (the fsuuid ioctls) about putting
new interfaces in an obviously marked staging file, using that to
prototype and discover the interface that we really wanted, and only
then talk about hoisting it to the VFS.
Hence this fallocate2 because we weren't sure if syscalls for aligned
allocations should explicitly define the alignment or get it from the
extent size hint, if there should be an explicit flag mandating aligned
allocation, etc.
> If we want fallocate() operations to apply filesystem atomic write
> constraints to operations, then add a new modifier flag to
> fallocate(), say FALLOC_FL_ATOMIC. The filesystem can then
> look up it's atomic write alignment constraints and apply them to
> the operation being performed appropriately.
>
> > The alignment flag is not sticky, so further extent mutation will not
> > obey this original alignment request.
>
> IOWs, you want the specific allocation to behave exactly as if an
> extent size hint of the given alignment had been set on that inode.
> Which could be done with:
>
> ioctl(FS_IOC_FSGETXATTR, &fsx)
> old_extsize = fsx.fsx_extsize;
> fsx.fsx_extsize = atomic_align_size;
> ioctl(FS_IOC_FSSETXATTR, &fsx)
Eww, multiple threads doing fallocates can clobber each other here.
> fallocate(....)
> fsx.fsx_extsize = old_extsize;
> ioctl(FS_IOC_FSSETXATTR, &fsx)
Also, you can't set extsize if the data fork has any mappings in it,
so you can't set the old value. But perhaps it's not so bad to expect
that programs will set this up once and not change the underlying
storage?
I'm not actually sure why you can't change the extent size hint. Why is
that?
> Yeah, messy, but if an application is going to use atomic writes,
> then setting an extent size hint of the atomic write granularity the
> application will use at file create time makes a whole lot of sense.
> This will largely guarantee that any allocation will be aligned to
> atomic IO constraints even when non atomic IO operations are
> performed on that inode. Hence when the application needs to do an
> atomic IO, it's not going to fail because previous allocation was
> not correctly aligned.
>
> All that we'd then need to do for atomic IO is ensure that we fail
> the allocation early if we can't allocate fully sized and aligned
> extents rather than falling back to unaligned extents when there are
> no large enough contiguous free spaces for aligned extents to be
> allocated. i.e. when RWF_ATOMIC or FALLOC_FL_ATOMIC are set by the
> application...
Right.
>
> > In addition, extent lengths should
> > always be a multiple of atomic_write_unit_max,
>
> Yup, that's what extent size hint based allocation does - it rounds
> both down and up to hint alignment...
>
> ....
>
> > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > index 34de6e6898c4..52a6e2b61228 100644
> > --- a/fs/xfs/libxfs/xfs_bmap.c
> > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > @@ -3275,7 +3275,9 @@ xfs_bmap_compute_alignments(
> > struct xfs_alloc_arg *args)
> > {
> > struct xfs_mount *mp = args->mp;
> > - xfs_extlen_t align = 0; /* minimum allocation alignment */
> > +
> > + /* minimum allocation alignment */
> > + xfs_extlen_t align = args->alignment;
> > int stripe_align = 0;
>
>
> This doesn't do what you think it should. For one, it will get
> overwritten by extent size hints that are set, hence the user will
> not get the alignment they expected in that case.
>
> Secondly, args->alignment is an internal alignment control for
> stripe alignment used later in the allocator when doing file
> extenstion allocations. Overloading it to pass a user alignment
> here means that initial data allocations will have alignments set
> without actually having set up the allocator parameters for aligned
> allocation correctly.
>
> This will lead to unexpected allocation failure as the filesystem
> fills as the reservations needed for allocation to succeed won't
> match what is actually required for allocation to succeed. It will
> also cause problematic behaviour for fallback allocation algorithms
> that expect only to be called with args->alignment = 1...
>
> > /* stripe alignment for allocation is determined by mount parameters */
> > @@ -3652,6 +3654,7 @@ xfs_bmap_btalloc(
> > .datatype = ap->datatype,
> > .alignment = 1,
> > .minalignslop = 0,
> > + .alignment = ap->align,
> > };
> > xfs_fileoff_t orig_offset;
> > xfs_extlen_t orig_length;
>
> > @@ -4279,12 +4282,14 @@ xfs_bmapi_write(
> > uint32_t flags, /* XFS_BMAPI_... */
> > xfs_extlen_t total, /* total blocks needed */
> > struct xfs_bmbt_irec *mval, /* output: map values */
> > - int *nmap) /* i/o: mval size/count */
> > + int *nmap,
> > + xfs_extlen_t align) /* i/o: mval size/count */
>
>
> As per above - IMO this is not the right way to specify aligment for
> atomic IO. A XFS_BMAPI_ATOMIC flag is probably the right thing to
> add from the caller - this also communicates the specific allocation
> failure behaviour required, too.
>
> Then xfs_bmap_compute_alignments() can pull the alignment
> from the relevant buftarg similar to how it already pulls preset
> alignments for extent size hints and/or realtime devices. And then
> the allocator can attempt exact aligned allocation for maxlen, then
> if that fails an exact aligned allocation for minlen, and if both of
> those fail then we return ENOSPC without attempting any unaligned
> allocations...
>
> This also gets rid of the need to pass another parameter to
> xfs_bmapi_write(), and it's trivial to plumb into the XFS iomap and
> fallocate code paths....
I too prefer a XFS_BMAPI_ALLOC_ALIGNED flag to all this extra plumbing,
having now seen the extra plumbing.
--D
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
On Fri, May 05, 2023 at 03:23:33PM -0700, Darrick J. Wong wrote:
> On Thu, May 04, 2023 at 09:26:16AM +1000, Dave Chinner wrote:
> > On Wed, May 03, 2023 at 06:38:17PM +0000, John Garry wrote:
> > If we want fallocate() operations to apply filesystem atomic write
> > constraints to operations, then add a new modifier flag to
> > fallocate(), say FALLOC_FL_ATOMIC. The filesystem can then
> > look up it's atomic write alignment constraints and apply them to
> > the operation being performed appropriately.
> >
> > > The alignment flag is not sticky, so further extent mutation will not
> > > obey this original alignment request.
> >
> > IOWs, you want the specific allocation to behave exactly as if an
> > extent size hint of the given alignment had been set on that inode.
> > Which could be done with:
> >
> > ioctl(FS_IOC_FSGETXATTR, &fsx)
> > old_extsize = fsx.fsx_extsize;
> > fsx.fsx_extsize = atomic_align_size;
> > ioctl(FS_IOC_FSSETXATTR, &fsx)
>
> Eww, multiple threads doing fallocates can clobber each other here.
Sure, this was just an example of how the same behaviour could be be
acheived without the new ioctl. Locking and other trivialities were
left as an exercise for the reader.
>
> > fallocate(....)
> > fsx.fsx_extsize = old_extsize;
> > ioctl(FS_IOC_FSSETXATTR, &fsx)
>
> Also, you can't set extsize if the data fork has any mappings in it,
> so you can't set the old value. But perhaps it's not so bad to expect
> that programs will set this up once and not change the underlying
> storage?
>
> I'm not actually sure why you can't change the extent size hint. Why is
> that?
Hysterical raisins, I think.
IIUC, it was largely to do with the fact that pre-existing
allocation could not be realigned, so once allocation has been done
the extent size hint can't guarantee extent size hint aligned/sized
extents are actually allocated for the file.
Cheers,
Dave.
--
Dave Chinner
[email protected]