Hole punching has already been implemented by XFS and OCFS2, and has the
potential to be implemented on both BTRFS and EXT4 so we need a generic way to
get to this feature. The simplest way in my mind is to add FALLOC_FL_PUNCH_HOLE
to fallocate() since it already looks like the normal fallocate() operation.
I've tested this patch with XFS and BTRFS to make sure XFS did what it's
supposed to do and that BTRFS failed like it was supposed to. Thank you,
Signed-off-by: Josef Bacik <[email protected]>
---
fs/open.c | 2 +-
include/linux/falloc.h | 1 +
2 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index 4197b9e..ab8dedf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -223,7 +223,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EINVAL;
/* Return error if mode is not supported */
- if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
+ if (mode && (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)))
return -EOPNOTSUPP;
if (!(file->f_mode & FMODE_WRITE))
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 3c15510..851cba2 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -2,6 +2,7 @@
#define _FALLOC_H_
#define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */
+#define FALLOC_FL_PUNCH_HOLE 0X02 /* de-allocates range */
#ifdef __KERNEL__
--
1.6.6.1
Gfs2 doesn't have the ability to punch holes yet, so make sure we return
EOPNOTSUPP if we try to use hole punching through fallocate. This support can
be added later. Thanks,
Signed-off-by: Josef Bacik <[email protected]>
---
fs/gfs2/ops_inode.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7..93ecc9f 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1439,6 +1439,10 @@ static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode && (mode & FALLOC_FL_KEEP_SIZE))
+ return -EOPNOTSUPP;
+
offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
sdp->sd_sb.sb_bsize_shift;
--
1.6.6.1
This patch just makes ocfs2 use its UNRESERVP ioctl when we get the hole punch
flag in fallocate. I didn't test it, but it seems simple enough. Thanks,
Signed-off-by: Josef Bacik <[email protected]>
---
fs/ocfs2/file.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 77b4c04..ad23a18 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1992,6 +1992,7 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_space_resv sr;
int change_size = 1;
+ int cmd = OCFS2_IOC_RESVSP64;
if (!ocfs2_writes_unwritten_extents(osb))
return -EOPNOTSUPP;
@@ -2002,12 +2003,15 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
if (mode & FALLOC_FL_KEEP_SIZE)
change_size = 0;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = OCFS2_IOC_UNRESVSP64;
+
sr.l_whence = 0;
sr.l_start = (s64)offset;
sr.l_len = (s64)len;
- return __ocfs2_change_file_space(NULL, inode, offset,
- OCFS2_IOC_RESVSP64, &sr, change_size);
+ return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
+ change_size);
}
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
--
1.6.6.1
This patch simply allows XFS to handle the hole punching flag in fallocate
properly. I've tested this with a little program that does a bunch of random
hole punching with FL_KEEP_SIZE and without it to make sure it does the right
thing. Thanks,
Signed-off-by: Josef Bacik <[email protected]>
---
fs/xfs/linux-2.6/xfs_iops.c | 12 +++++++++---
1 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 96107ef..99df347 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -516,6 +516,7 @@ xfs_vn_fallocate(
loff_t new_size = 0;
xfs_flock64_t bf;
xfs_inode_t *ip = XFS_I(inode);
+ int cmd = XFS_IOC_RESVSP;
/* preallocation on directories not yet supported */
error = -ENODEV;
@@ -528,17 +529,22 @@ xfs_vn_fallocate(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = XFS_IOC_UNRESVSP;
+
/* check the new inode size is valid before allocating */
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
- new_size = offset + len;
+ if (cmd == XFS_IOC_UNRESVSP)
+ new_size = offset;
+ else
+ new_size = offset + len;
error = inode_newsize_ok(inode, new_size);
if (error)
goto out_unlock;
}
- error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
- 0, XFS_ATTR_NOLOCK);
+ error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
if (error)
goto out_unlock;
--
1.6.6.1
Btrfs doesn't have the ability to punch holes yet, so make sure we return
EOPNOTSUPP if we try to use hole punching through fallocate. This support can
be added later. Thanks,
Signed-off-by: Josef Bacik <[email protected]>
---
fs/btrfs/inode.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 78877d7..c590add 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6936,6 +6936,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
alloc_start = offset & ~mask;
alloc_end = (offset + len + mask) & ~mask;
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode && (mode & FALLOC_FL_KEEP_SIZE))
+ return -EOPNOTSUPP;
+
/*
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
--
1.6.6.1
Ext4 doesn't have the ability to punch holes yet, so make sure we return
EOPNOTSUPP if we try to use hole punching through fallocate. This support can
be added later. Thanks,
Signed-off-by: Josef Bacik <[email protected]>
---
fs/ext4/extents.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0554c48..9bff6bd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3622,6 +3622,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode && (mode & FALLOC_FL_KEEP_SIZE))
+ return -EOPNOTSUPP;
+
/*
* currently supporting (pre)allocate mode for extent-based
* files _only_
--
1.6.6.1
On Mon, Nov 08, 2010 at 03:32:02PM -0500, Josef Bacik wrote:
> Hole punching has already been implemented by XFS and OCFS2, and has the
> potential to be implemented on both BTRFS and EXT4 so we need a generic way to
> get to this feature. The simplest way in my mind is to add FALLOC_FL_PUNCH_HOLE
> to fallocate() since it already looks like the normal fallocate() operation.
> I've tested this patch with XFS and BTRFS to make sure XFS did what it's
> supposed to do and that BTRFS failed like it was supposed to. Thank you,
>
> Signed-off-by: Josef Bacik <[email protected]>
> ---
> fs/open.c | 2 +-
> include/linux/falloc.h | 1 +
> 2 files changed, 2 insertions(+), 1 deletions(-)
>
> diff --git a/fs/open.c b/fs/open.c
> index 4197b9e..ab8dedf 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -223,7 +223,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> return -EINVAL;
>
> /* Return error if mode is not supported */
> - if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
> + if (mode && (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)))
> return -EOPNOTSUPP;
>
> if (!(file->f_mode & FMODE_WRITE))
> diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> index 3c15510..851cba2 100644
> --- a/include/linux/falloc.h
> +++ b/include/linux/falloc.h
> @@ -2,6 +2,7 @@
> #define _FALLOC_H_
>
> #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */
> +#define FALLOC_FL_PUNCH_HOLE 0X02 /* de-allocates range */
Hole punching was not included originally in fallocate() for a
variety of reasons. IIRC, they were along the lines of:
1 de-allocating of blocks in an allocation syscall is wrong.
People wanted a new syscall for this functionality.
2 no glibc interface needs it
3 at the time, only XFS supported punching holes, so there
is no need to support it in a generic interface
4 the use cases presented were not considered compelling
enough to justify the additional complexity (!)
In the end, I gave up arguing for it to be included because just
getting the FALLOC_FL_KEEP_SIZE functionality was a hard enough
battle.
Anyway, #3 isn't the case any more, #4 was just an excuse not to
support anything ext4 couldn't do and lots of apps are calling
fallocate directly (because glibc can't use FALLOC_FL_KEEP_SIZE) so
#2 isn't an issue, either. I guess that leaves #1 to be debated;
I don't think there is any problem with doing what you propose.
What I will suggest is that this requires a generic xfstest to be
written and support added to xfs_io to enable that test (and others)
to issue hole punches. Something along the lines of test 242 which I
wrote for testing all the edge cases of XFS_IOC_ZERO_RANGE (*) would be
good.
Cheers,
Dave.
(*) fallocate() version:
http://git.kernel.org/?p=linux/kernel/git/dgc/xfsdev.git;a=commitdiff;h=45f3e1831e3abc8bd12ec1e6c548f73a8dd9e36d
--
Dave Chinner
[email protected]
On Mon, Nov 08, 2010 at 03:32:03PM -0500, Josef Bacik wrote:
> This patch simply allows XFS to handle the hole punching flag in fallocate
> properly. I've tested this with a little program that does a bunch of random
> hole punching with FL_KEEP_SIZE and without it to make sure it does the right
> thing. Thanks,
>
> Signed-off-by: Josef Bacik <[email protected]>
> ---
> fs/xfs/linux-2.6/xfs_iops.c | 12 +++++++++---
> 1 files changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
> index 96107ef..99df347 100644
> --- a/fs/xfs/linux-2.6/xfs_iops.c
> +++ b/fs/xfs/linux-2.6/xfs_iops.c
> @@ -516,6 +516,7 @@ xfs_vn_fallocate(
> loff_t new_size = 0;
> xfs_flock64_t bf;
> xfs_inode_t *ip = XFS_I(inode);
> + int cmd = XFS_IOC_RESVSP;
>
> /* preallocation on directories not yet supported */
> error = -ENODEV;
> @@ -528,17 +529,22 @@ xfs_vn_fallocate(
>
> xfs_ilock(ip, XFS_IOLOCK_EXCL);
>
> + if (mode & FALLOC_FL_PUNCH_HOLE)
> + cmd = XFS_IOC_UNRESVSP;
> +
> /* check the new inode size is valid before allocating */
> if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> offset + len > i_size_read(inode)) {
> - new_size = offset + len;
> + if (cmd == XFS_IOC_UNRESVSP)
> + new_size = offset;
> + else
> + new_size = offset + len;
What semantic is FALLOC_FL_KEEP_SIZE supposed to convey during a
hole punch? Doesn't this just turn the hole punch operation into
a truncate? If so, what's the point of punching the hole when you
can just call ftruncate() to get the same result?
I'd suggest that FALLOC_FL_PUNCH_HOLE should, by definition, not
change the file size, and have no option to be able to change it.
This needs to be defined and documented - can you include a man
page update in this series that defines the expected behaviour
of FALLOC_FL_PUNCH_HOLE?
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Nov 09, 2010 at 12:22:54PM +1100, Dave Chinner wrote:
> On Mon, Nov 08, 2010 at 03:32:03PM -0500, Josef Bacik wrote:
> > This patch simply allows XFS to handle the hole punching flag in fallocate
> > properly. I've tested this with a little program that does a bunch of random
> > hole punching with FL_KEEP_SIZE and without it to make sure it does the right
> > thing. Thanks,
> >
> > Signed-off-by: Josef Bacik <[email protected]>
> > ---
> > fs/xfs/linux-2.6/xfs_iops.c | 12 +++++++++---
> > 1 files changed, 9 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
> > index 96107ef..99df347 100644
> > --- a/fs/xfs/linux-2.6/xfs_iops.c
> > +++ b/fs/xfs/linux-2.6/xfs_iops.c
> > @@ -516,6 +516,7 @@ xfs_vn_fallocate(
> > loff_t new_size = 0;
> > xfs_flock64_t bf;
> > xfs_inode_t *ip = XFS_I(inode);
> > + int cmd = XFS_IOC_RESVSP;
> >
> > /* preallocation on directories not yet supported */
> > error = -ENODEV;
> > @@ -528,17 +529,22 @@ xfs_vn_fallocate(
> >
> > xfs_ilock(ip, XFS_IOLOCK_EXCL);
> >
> > + if (mode & FALLOC_FL_PUNCH_HOLE)
> > + cmd = XFS_IOC_UNRESVSP;
> > +
> > /* check the new inode size is valid before allocating */
> > if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> > offset + len > i_size_read(inode)) {
> > - new_size = offset + len;
> > + if (cmd == XFS_IOC_UNRESVSP)
> > + new_size = offset;
> > + else
> > + new_size = offset + len;
>
> What semantic is FALLOC_FL_KEEP_SIZE supposed to convey during a
> hole punch? Doesn't this just turn the hole punch operation into
> a truncate? If so, what's the point of punching the hole when you
> can just call ftruncate() to get the same result?
>
Well your UNRESVSP can do the same thing as a ftruncate so I figured it was ok
to just go ahead and use it rather than add complexity, especially since I don't
understand this crazy fs of yours ;).
> I'd suggest that FALLOC_FL_PUNCH_HOLE should, by definition, not
> change the file size, and have no option to be able to change it.
> This needs to be defined and documented - can you include a man
> page update in this series that defines the expected behaviour
> of FALLOC_FL_PUNCH_HOLE?
Oh I see you mean don't allow hole punch to change the size at all. I was
thinking of doing this originally, but like I said above I figured that there
was no harm in doing the equivalent of an ftruncate. But you are right, there's
no sense in duplicating functionality, I'll change it to keep PUNCH_HOLE from
changing the size. Now I just need to figure out where that man page is...
Josef
On Tue, Nov 09, 2010 at 12:12:22PM +1100, Dave Chinner wrote:
> On Mon, Nov 08, 2010 at 03:32:02PM -0500, Josef Bacik wrote:
> > Hole punching has already been implemented by XFS and OCFS2, and has the
> > potential to be implemented on both BTRFS and EXT4 so we need a generic way to
> > get to this feature. The simplest way in my mind is to add FALLOC_FL_PUNCH_HOLE
> > to fallocate() since it already looks like the normal fallocate() operation.
> > I've tested this patch with XFS and BTRFS to make sure XFS did what it's
> > supposed to do and that BTRFS failed like it was supposed to. Thank you,
> >
> > Signed-off-by: Josef Bacik <[email protected]>
> > ---
> > fs/open.c | 2 +-
> > include/linux/falloc.h | 1 +
> > 2 files changed, 2 insertions(+), 1 deletions(-)
> >
> > diff --git a/fs/open.c b/fs/open.c
> > index 4197b9e..ab8dedf 100644
> > --- a/fs/open.c
> > +++ b/fs/open.c
> > @@ -223,7 +223,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> > return -EINVAL;
> >
> > /* Return error if mode is not supported */
> > - if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
> > + if (mode && (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)))
> > return -EOPNOTSUPP;
> >
> > if (!(file->f_mode & FMODE_WRITE))
> > diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> > index 3c15510..851cba2 100644
> > --- a/include/linux/falloc.h
> > +++ b/include/linux/falloc.h
> > @@ -2,6 +2,7 @@
> > #define _FALLOC_H_
> >
> > #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */
> > +#define FALLOC_FL_PUNCH_HOLE 0X02 /* de-allocates range */
>
> Hole punching was not included originally in fallocate() for a
> variety of reasons. IIRC, they were along the lines of:
>
> 1 de-allocating of blocks in an allocation syscall is wrong.
> People wanted a new syscall for this functionality.
> 2 no glibc interface needs it
> 3 at the time, only XFS supported punching holes, so there
> is not need to support it in a generic interface
> 4 the use cases presented were not considered compelling
> enough to justify the additional complexity (!)
>
> In the end, I gave up arguing for it to be included because just
> getting the FALLOC_FL_KEEP_SIZE functionality was a hard enough
> battle.
>
> Anyway, #3 isn't the case any more, #4 was just an excuse not to
> support anything ext4 couldn't do and lots of apps are calling
> fallocate directly (because glibc can't use FALLOC_FL_KEEP_SIZE) so
> #2 isn't an issue, either. I guess that leaves #1 to be debated;
> I don't think there is any problem with doing what you propose.
>
> What I will suggest is that this requires a generic xfstest to be
> written and support added to xfs_io to enable that test (and others)
> to issue hole punches. Something along the lines of test 242 which I
> wrote for testing all the edge case of XFS_IOC_ZERO_RANGE (*) would be
> good.
Sounds good. Do you want me to build my PUNCH_HOLE patch ontop of your
ZERO_RANGE patch? Thanks,
Josef
On Tue, Nov 09, 2010 at 12:12:22PM +1100, Dave Chinner wrote:
> Hole punching was not included originally in fallocate() for a
> variety of reasons. IIRC, they were along the lines of:
>
> 1 de-allocating of blocks in an allocation syscall is wrong.
> People wanted a new syscall for this functionality.
> 2 no glibc interface needs it
> 3 at the time, only XFS supported punching holes, so there
> is not need to support it in a generic interface
> 4 the use cases presented were not considered compelling
> enough to justify the additional complexity (!)
>
> In the end, I gave up arguing for it to be included because just
> getting the FALLOC_FL_KEEP_SIZE functionality was a hard enough
> battle.
>
> Anyway, #3 isn't the case any more, #4 was just an excuse not to
> support anything ext4 couldn't do and lots of apps are calling
> fallocate directly (because glibc can't use FALLOC_FL_KEEP_SIZE) so
> #2 isn't an issue, either.
I don't recall anyone arguing #4 because of ext4, but I get very tired
of the linux-fsdevel bike-shed painting parties, so I often will
concede whatever is necessary just to get the !@#! interface in,
assuming we could add more flags later....
glibc does support fallocate(), BTW; it's just posix_fallocate() that
doesn't use FALLOC_FL_KEEP_SIZE.
> I guess that leaves #1 to be debated;
> I don't think there is any problem with doing what you propose.
I don't have a problem either.
As a completely separate proposal, what do people think about an
FALLOCATE_FL_ZEROIZE after which time the blocks are allocated, but
reading from them returns zero. This could be done either by (a)
sending a discard in the case of devices where discard_zeros_data is
true and discard_granularity is less than the fs block size, or (b) by
setting the uninitialized flag in the extent tree.
- Ted
On Mon, Nov 08, 2010 at 09:05:09PM -0500, Josef Bacik wrote:
> On Tue, Nov 09, 2010 at 12:22:54PM +1100, Dave Chinner wrote:
> > On Mon, Nov 08, 2010 at 03:32:03PM -0500, Josef Bacik wrote:
> > > This patch simply allows XFS to handle the hole punching flag in fallocate
> > > properly. I've tested this with a little program that does a bunch of random
> > > hole punching with FL_KEEP_SIZE and without it to make sure it does the right
> > > thing. Thanks,
> > >
> > > Signed-off-by: Josef Bacik <[email protected]>
> > > ---
> > > fs/xfs/linux-2.6/xfs_iops.c | 12 +++++++++---
> > > 1 files changed, 9 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
> > > index 96107ef..99df347 100644
> > > --- a/fs/xfs/linux-2.6/xfs_iops.c
> > > +++ b/fs/xfs/linux-2.6/xfs_iops.c
> > > @@ -516,6 +516,7 @@ xfs_vn_fallocate(
> > > loff_t new_size = 0;
> > > xfs_flock64_t bf;
> > > xfs_inode_t *ip = XFS_I(inode);
> > > + int cmd = XFS_IOC_RESVSP;
> > >
> > > /* preallocation on directories not yet supported */
> > > error = -ENODEV;
> > > @@ -528,17 +529,22 @@ xfs_vn_fallocate(
> > >
> > > xfs_ilock(ip, XFS_IOLOCK_EXCL);
> > >
> > > + if (mode & FALLOC_FL_PUNCH_HOLE)
> > > + cmd = XFS_IOC_UNRESVSP;
> > > +
> > > /* check the new inode size is valid before allocating */
> > > if (!(mode & FALLOC_FL_KEEP_SIZE) &&
> > > offset + len > i_size_read(inode)) {
> > > - new_size = offset + len;
> > > + if (cmd == XFS_IOC_UNRESVSP)
> > > + new_size = offset;
> > > + else
> > > + new_size = offset + len;
> >
> > What semantic is FALLOC_FL_KEEP_SIZE supposed to convey during a
> > hole punch? Doesn't this just turn the hole punch operation into
> > a truncate? If so, what's the point of punching the hole when you
> > can just call ftruncate() to get the same result?
> >
>
> Well your UNRESVSP can do the same thing as a ftruncate
Actually, it can't because it leaves the file size unchanged. Like
XFS_IOC_RESVSP, the use case is to allow punching out pre-allocated
blocks beyond EOF (that were put there by XFS_IOC_RESVSP). e.g:
# xfs_io -f \
> -c "truncate 20k" \
> -c "resvsp 16k 16k" \
> -c "unresvsp 24k 4k" \
> -c "bmap -v" \
> -c "stat" \
> test.out
test.out:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS
0: [0..31]: hole 32
1: [32..47]: 136..151 0 (136..151) 16 10000
2: [48..55]: hole 8
3: [56..63]: 160..167 0 (160..167) 8 10000
fd.path = "test.out"
fd.flags = non-sync,non-direct,read-write
stat.ino = 134
stat.type = regular file
stat.size = 20480
stat.blocks = 24
fsxattr.xflags = 0x2 [prealloc]
fsxattr.projid = 0
fsxattr.extsize = 0
fsxattr.nextents = 2
fsxattr.naextents = 0
dioattr.mem = 0x200
dioattr.miniosz = 512
dioattr.maxiosz = 2147483136
#
Which leaves us with a file size of 20k and two unwritten extents
from 16-24k and 28-32k.
> so I figured it was ok
> to just go ahead and use it rather than add complexity, especially since I don't
> understand this crazy fs of yours ;).
Ask, and ye shall receive enlightenment. :)
> > I'd suggest that FALLOC_FL_PUNCH_HOLE should, by definition, not
> > change the file size, and have no option to be able to change it.
> > This needs to be defined and documented - can you include a man
> > page update in this series that defines the expected behaviour
> > of FALLOC_FL_PUNCH_HOLE?
>
> Oh I see you mean don't allow hole punch to change the size at all.
Exactly. If you can preallocate beyond EOF, then you need to be able
to punch beyond EOF, too. The only other way to remove persistent
preallocation beyond EOF is to truncate, but that doesn't allow you
to only punch specific ranges beyond EOF...
> I was thinking of doing this originally, but like I said above I
> figured that there was no harm in doing the equivalent of an
> ftruncate. But you are right, theres no sense in duplicating
> functionality, I'll change it to keep PUNCH_HOLE from changin the
> size. Now I just need to figure out where that man page is...
From the man page: http://www.kernel.org/doc/man-pages/
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Mon, Nov 08, 2010 at 10:30:38PM -0500, Ted Ts'o wrote:
> On Tue, Nov 09, 2010 at 12:12:22PM +1100, Dave Chinner wrote:
> > Hole punching was not included originally in fallocate() for a
> > variety of reasons. IIRC, they were along the lines of:
> >
> > 1 de-allocating of blocks in an allocation syscall is wrong.
> > People wanted a new syscall for this functionality.
....
> > I guess that leaves #1 to be debated;
> > I don't think there is any problem with doing what you propose.
>
> I don't have a problem either.
>
> As a completely separate proposal, what do people think about an
> FALLOCATE_FL_ZEROIZE after which time the blocks are allocated, but
> reading from them returns zero.
That's exactly what the new XFS_IOC_ZERO_RANGE ioctl in 2.6.36 does
(commit 447223520520b17d3b6d0631aa4838fbaf8eddb4 "xfs: Introduce
XFS_IOC_ZERO_RANGE") The git commit I pointed to in the last email
is the rudimentary fallocate() interface support I have for that
code which goes along with an xfs_io patch I have. Given that there
seems to be interest for this operation, I'll flesh it out into a
proper patch....
> This could be done either by (a)
> sending a discard in the case of devices where discard_zeros_data is
> true and discard_granularty is less than the fs block size, or (b) by
> setting the uninitialized flag in the extent tree.
Implementation is up to the filesystem. However, XFS does (b)
because:
1) it was extremely simple to implement (one of the
advantages of having an exceedingly complex allocation
interface to begin with :P)
2) conversion is atomic, fast and reliable
3) it is independent of the underlying storage; and
4) reads of unwritten extents operate at memory speed,
not disk speed.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Mon, Nov 8, 2010 at 8:32 PM, Josef Bacik <[email protected]> wrote:
Hi Josef,
> Btrfs doesn't have the ability to punch holes yet, so make sure we return
> EOPNOTSUPP if we try to use hole punching through fallocate. This support can
> be added later. Thanks,
>
> Signed-off-by: Josef Bacik <[email protected]>
> ---
> fs/btrfs/inode.c | 4 ++++
> 1 files changed, 4 insertions(+), 0 deletions(-)
>
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 78877d7..c590add 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -6936,6 +6936,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
> ? ? ? ?alloc_start = offset & ~mask;
> ? ? ? ?alloc_end = ?(offset + len + mask) & ~mask;
>
> + ? ? ? /* We only support the FALLOC_FL_KEEP_SIZE mode */
> + ? ? ? if (mode && (mode & FALLOC_FL_KEEP_SIZE))
> + ? ? ? ? ? ? ? return -EOPNOTSUPP;
> +
This test looks rather odd. Why do we need to test that mode is
non-zero AND that mode has a specific bit set? Is there a missing !
here?
On Tue, Nov 09, 2010 at 10:05:34AM +0000, Will Newton wrote:
> On Mon, Nov 8, 2010 at 8:32 PM, Josef Bacik <[email protected]> wrote:
>
> Hi Josef,
>
> > Btrfs doesn't have the ability to punch holes yet, so make sure we return
> > EOPNOTSUPP if we try to use hole punching through fallocate. This support can
> > be added later. Thanks,
> >
> > Signed-off-by: Josef Bacik <[email protected]>
> > ---
> > fs/btrfs/inode.c | 4 ++++
> > 1 files changed, 4 insertions(+), 0 deletions(-)
> >
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 78877d7..c590add 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -6936,6 +6936,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
> > ? ? ? ?alloc_start = offset & ~mask;
> > ? ? ? ?alloc_end = ?(offset + len + mask) & ~mask;
> >
> > + ? ? ? /* We only support the FALLOC_FL_KEEP_SIZE mode */
> > + ? ? ? if (mode && (mode & FALLOC_FL_KEEP_SIZE))
> > + ? ? ? ? ? ? ? return -EOPNOTSUPP;
> > +
>
> This test looks rather odd. Why do we need to test that mode is
> non-zero AND that mode has a specific bit set? Is there a missing !
> here?
Yeah I'm missing a !, I copied and pasted the wrong bit when I went around adding
this check to everybody, I'll be fixing it up for the next go around. Thanks,
Josef
On Tue, Nov 09, 2010 at 12:12:22PM +1100, Dave Chinner wrote:
> On Mon, Nov 08, 2010 at 03:32:02PM -0500, Josef Bacik wrote:
> > Hole punching has already been implemented by XFS and OCFS2, and has the
> > potential to be implemented on both BTRFS and EXT4 so we need a generic way to
> > get to this feature. The simplest way in my mind is to add FALLOC_FL_PUNCH_HOLE
> > to fallocate() since it already looks like the normal fallocate() operation.
> > I've tested this patch with XFS and BTRFS to make sure XFS did what it's
> > supposed to do and that BTRFS failed like it was supposed to. Thank you,
> >
> > Signed-off-by: Josef Bacik <[email protected]>
> > ---
> > fs/open.c | 2 +-
> > include/linux/falloc.h | 1 +
> > 2 files changed, 2 insertions(+), 1 deletions(-)
> >
> > diff --git a/fs/open.c b/fs/open.c
> > index 4197b9e..ab8dedf 100644
> > --- a/fs/open.c
> > +++ b/fs/open.c
> > @@ -223,7 +223,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> > return -EINVAL;
> >
> > /* Return error if mode is not supported */
> > - if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
> > + if (mode && (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)))
> > return -EOPNOTSUPP;
> >
> > if (!(file->f_mode & FMODE_WRITE))
> > diff --git a/include/linux/falloc.h b/include/linux/falloc.h
> > index 3c15510..851cba2 100644
> > --- a/include/linux/falloc.h
> > +++ b/include/linux/falloc.h
> > @@ -2,6 +2,7 @@
> > #define _FALLOC_H_
> >
> > #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */
> > +#define FALLOC_FL_PUNCH_HOLE 0X02 /* de-allocates range */
>
> Hole punching was not included originally in fallocate() for a
> variety of reasons. IIRC, they were along the lines of:
>
> 1 de-allocating of blocks in an allocation syscall is wrong.
> People wanted a new syscall for this functionality.
> 2 no glibc interface needs it
> 3 at the time, only XFS supported punching holes, so there
> is not need to support it in a generic interface
> 4 the use cases presented were not considered compelling
> enough to justify the additional complexity (!)
>
> In the end, I gave up arguing for it to be included because just
> getting the FALLOC_FL_KEEP_SIZE functionality was a hard enough
> battle.
>
> Anyway, #3 isn't the case any more, #4 was just an excuse not to
> support anything ext4 couldn't do and lots of apps are calling
> fallocate directly (because glibc can't use FALLOC_FL_KEEP_SIZE) so
> #2 isn't an issue, either. I guess that leaves #1 to be debated;
> I don't think there is any problem with doing what you propose.
>
> What I will suggest is that this requires a generic xfstest to be
> written and support added to xfs_io to enable that test (and others)
> to issue hole punches. Something along the lines of test 242 which I
> wrote for testing all the edge case of XFS_IOC_ZERO_RANGE (*) would be
> good.
>
So this was relatively simple, adding a flag to falloc for xfs_io and such. Got
a test going and it worked great on XFS. Then I went to make sure it worked on
non-XFS, and that's where I've run into pain. Turns out xfs_io -c "bmap" only
works on XFS. So I thought to myself "well how hard could it be to make this
thing use fiemap?", hahaha I'm an idiot. So I've been adding a xfs_io -c
"fiemap" that spits things out similar to bmap, and it will probably be tomorrow
when I finish it.
So good news is my simple patches seem to work just fine for hole-punch, bad
news is it's going to take me another day to have all the infrastructure to test
it on non-XFS filesystems.
Also did you want me to rebase my patches on your fallocate() version of
ZERO_RANGE? Thanks,
Josef
On Tue, Nov 09, 2010 at 03:42:42PM +1100, Dave Chinner wrote:
> Implementation is up to the filesystem. However, XFS does (b)
> because:
>
> 1) it was extremely simple to implement (one of the
> advantages of having an exceedingly complex allocation
> interface to begin with :P)
> 2) conversion is atomic, fast and reliable
> 3) it is independent of the underlying storage; and
> 4) reads of unwritten extents operate at memory speed,
> not disk speed.
Yeah, I was thinking that using a device-style TRIM might be better
since future attempts to write to it won't require a separate seek to
modify the extent tree. But yeah, there are a bunch of advantages of
simply mutating the extent tree.
While we're on the subject of changes to fallocate, what do people
think of FALLOC_FL_EXPOSE_OLD_DATA, which requires either root
privileges or (if capabilities are in use) CAP_DAC_OVERRIDE &&
CAP_MAC_OVERRIDE && CAP_SYS_ADMIN. This would allow a trusted process
to fallocate blocks with the extent already marked initialized. I've
had two requests for such functionality for ext4 already.
(Take for example a trusted cluster filesystem backend that checks the
object checksum before returning any data to the user; and if the
check fails the cluster file system will try to use some other replica
stored on some other server.)
- Ted
On Tue 09-11-10 16:41:47, Ted Ts'o wrote:
> On Tue, Nov 09, 2010 at 03:42:42PM +1100, Dave Chinner wrote:
> > Implementation is up to the filesystem. However, XFS does (b)
> > because:
> >
> > 1) it was extremely simple to implement (one of the
> > advantages of having an exceedingly complex allocation
> > interface to begin with :P)
> > 2) conversion is atomic, fast and reliable
> > 3) it is independent of the underlying storage; and
> > 4) reads of unwritten extents operate at memory speed,
> > not disk speed.
>
> Yeah, I was thinking that using a device-style TRIM might be better
> since future attempts to write to it won't require a separate seek to
> modify the extent tree. But yeah, there are a bunch of advantages of
> simply mutating the extent tree.
>
> While we're on the subject of changes to fallocate, what do people
> think of FALLOC_FL_EXPOSE_OLD_DATA, which requires either root
> privileges or (if capabilities are in use) CAP_DAC_OVERRIDE &&
> CAP_MAC_OVERRIDE && CAP_SYS_ADMIN. This would allow a trusted process
> to fallocate blocks with the extent already marked initialized. I've
> had two requests for such functionality for ext4 already.
>
> (Take for example a trusted cluster filesystem backend that checks the
> object checksum before returning any data to the user; and if the
> check fails the cluster file system will try to use some other replica
> stored on some other server.)
Hum, could you elaborate a bit? I fail to see how the above fallocate() flag
could be used to help solve this problem... Just curious...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, Nov 09, 2010 at 04:41:47PM -0500, Ted Ts'o wrote:
> On Tue, Nov 09, 2010 at 03:42:42PM +1100, Dave Chinner wrote:
> > Implementation is up to the filesystem. However, XFS does (b)
> > because:
> >
> > 1) it was extremely simple to implement (one of the
> > advantages of having an exceedingly complex allocation
> > interface to begin with :P)
> > 2) conversion is atomic, fast and reliable
> > 3) it is independent of the underlying storage; and
> > 4) reads of unwritten extents operate at memory speed,
> > not disk speed.
>
> Yeah, I was thinking that using a device-style TRIM might be better
> since future attempts to write to it won't require a separate seek to
> modify the extent tree. But yeah, there are a bunch of advantages of
> simply mutating the extent tree.
>
> While we're on the subject of changes to fallocate, what do people
> think of FALLOC_FL_EXPOSE_OLD_DATA, which requires either root
> privileges or (if capabilities are in use) CAP_DAC_OVERRIDE &&
> CAP_MAC_OVERRIDE && CAP_SYS_ADMIN. This would allow a trusted process
> to fallocate blocks with the extent already marked initialized. I've
> had two requests for such functionality for ext4 already.
We removed that ability from XFS about three years ago because it's
a massive security hole. e.g. what happens if the file is world
readable, even though the process that called
FALLOC_FL_EXPOSE_OLD_DATA was privileged and was allowed to expose
such data? Or the file is chmod 777 after being exposed?
The historical reason for such behaviour existing in XFS was that in
1997 the CPU and IO latency cost of unwritten extent conversion was
significant, so users with real physical security (i.e. marines with
guns) were able to make use of fast preallocation with no conversion
overhead without caring about the security implications. These days,
the performance overhead of unwritten extent conversion is minimal -
I generally can't measure a difference in IO performance as a result
of it - so there is simply no good reason for leaving such a gaping
security hole in the system.
If anyone wants to read the underlying data, then use fiemap to map
the physical blocks and read it directly from the block device. That
requires root privileges but does not open any new stale data
exposure problems....
> (Take for example a trusted cluster filesystem backend that checks the
> object checksum before returning any data to the user; and if the
> check fails the cluster file system will try to use some other replica
> stored on some other server.)
IOWs, all they want to do is avoid the unwritten extent conversion
overhead. Time has shown that a bad security/performance tradeoff
decision was made 13 years ago in XFS, so I see little reason to
repeat it for ext4 today....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Nov 9, 2010 at 6:40 PM, Dave Chinner <[email protected]> wrote:
> On Tue, Nov 09, 2010 at 04:41:47PM -0500, Ted Ts'o wrote:
>> On Tue, Nov 09, 2010 at 03:42:42PM +1100, Dave Chinner wrote:
>> > Implementation is up to the filesystem. However, XFS does (b)
>> > because:
>> >
>> > 1) it was extremely simple to implement (one of the
>> > advantages of having an exceedingly complex allocation
>> > interface to begin with :P)
>> > 2) conversion is atomic, fast and reliable
>> > 3) it is independent of the underlying storage; and
>> > 4) reads of unwritten extents operate at memory speed,
>> > not disk speed.
>>
>> Yeah, I was thinking that using a device-style TRIM might be better
>> since future attempts to write to it won't require a separate seek to
>> modify the extent tree. But yeah, there are a bunch of advantages of
>> simply mutating the extent tree.
>>
>> While we're on the subject of changes to fallocate, what do people
>> think of FALLOC_FL_EXPOSE_OLD_DATA, which requires either root
>> privileges or (if capabilities are in use) CAP_DAC_OVERRIDE &&
>> CAP_MAC_OVERRIDE && CAP_SYS_ADMIN. This would allow a trusted process
>> to fallocate blocks with the extent already marked initialized. I've
>> had two requests for such functionality for ext4 already.
>
> We removed that ability from XFS about three years ago because it's
> a massive security hole. e.g. what happens if the file is world
> readable, even though the process that called
> FALLOC_FL_EXPOSE_OLD_DATA was privileged and was allowed to expose
> such data? Or the file is chmod 777 after being exposed?
>
> The historical reason for such behaviour existing in XFS was that in
> 1997 the CPU and IO latency cost of unwritten extent conversion was
> significant, so users with real physical security (i.e. marines with
> guns) were able to make use of fast preallocation with no conversion
> overhead without caring about the security implications. These days,
> the performance overhead of unwritten extent conversion is minimal -
> I generally can't measure a difference in IO performance as a result
> of it - so there is simply no good reason for leaving such a gaping
> security hole in the system.
>
> If anyone wants to read the underlying data, then use fiemap to map
> the physical blocks and read it directly from the block device. That
> requires root privileges but does not open any new stale data
> exposure problems....
>
>> (Take for example a trusted cluster filesystem backend that checks the
>> object checksum before returning any data to the user; and if the
>> check fails the cluster file system will try to use some other replica
>> stored on some other server.)
>
> IOWs, all they want to do is avoid the unwritten extent conversion
> overhead. Time has shown that a bad security/performance tradeoff
> decision was made 13 years ago in XFS, so I see little reason to
> repeat it for ext4 today....
I'd make use of FALLOC_FL_EXPOSE_OLD_DATA. It's not the CPU overhead
of extent conversion. It's that extent conversion causes more metadata
operations than what you'd have otherwise, which means systems that
want to use O_DIRECT and make sure the data doesn't go away either
have to write O_DIRECT|O_DSYNC or need to call fdatasync().
cluster file system implementor,
Larry
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
On Tue, Jan 11, 2011 at 04:13:42PM -0500, Lawrence Greenfield wrote:
> > IOWs, all they want to do is avoid the unwritten extent conversion
> > overhead. Time has shown that a bad security/performance tradeoff
> > decision was made 13 years ago in XFS, so I see little reason to
> > repeat it for ext4 today....
I suspect things may have changed somewhat; both in terms of
requirements and nature of cluster file systems, and the performance of
various storage systems (including PCIe-attached flash devices).
> I'd make use of FALLOC_FL_EXPOSE_OLD_DATA. It's not the CPU overhead
> of extent conversion. It's that extent conversion causes more metadata
> operations than what you'd have otherwise, which means systems that
> want to use O_DIRECT and make sure the data doesn't go away either
> have to write O_DIRECT|O_DSYNC or need to call fdatasync().
>
> cluster file system implementor,
One possibility might be to make it an optional feature which is only
enabled via a mount option. That way someone would have to explicitly
ask for this feature two ways (via a new flag to fallocate and a
mount option).
It might not make sense for XFS, but for people who are using ext4 as
the local storage file system back-end, and are doing all sorts of
things to get the best performance, including disabling the journal, I
suspect it really would make sense. So it could always be an
optional-to-implement flag, that not all file systems should feel
obliged to implement.
- Ted
On Tue, Jan 11, 2011 at 04:30:07PM -0500, Ted Ts'o wrote:
> On Tue, Jan 11, 2011 at 04:13:42PM -0500, Lawrence Greenfield wrote:
> > > IOWs, all they want to do is avoid the unwritten extent conversion
> > > overhead. Time has shown that a bad security/performance tradeoff
> > > decision was made 13 years ago in XFS, so I see little reason to
> > > repeat it for ext4 today....
>
> I suspect things may have changed somewhat; both in terms of
> requirements and nature of cluster file systems, and the performance of
> various storage systems (including PCIe-attached flash devices).
We can throw 1000x more CPU power and memory at the problem than
we could 13 years ago. IOW the system balance hasn't changed (even
considering pci-e SSDs) compared to 13 years ago. Hence if it was a bad
tradeoff 13 years ago, it's still a bad tradeoff today.
> > I'd make use of FALLOC_FL_EXPOSE_OLD_DATA. It's not the CPU overhead
> > of extent conversion. It's that extent conversion causes more metadata
> > operations than what you'd have otherwise, which means systems that
> > want to use O_DIRECT and make sure the data doesn't go away either
> > have to write O_DIRECT|O_DSYNC or need to call fdatasync().
> > cluster file system implementor,
>
> One possibility might be to make it an optional feature which is only
> enabled via a mount option. That way someone would have to explicitly
> ask for this feature two ways (via a new flag to fallocate and a
> mount option).
Proliferation of mount options just to enable feature X of API Y for
filesystem Z is not a good idea. Either you enable it via the
fallocate API or you don't allow it at all.
> It might not make sense for XFS, but for people who are using ext4
> as the local storage file system back-end,
How does this differ from a local filesystem? Are you talking about
storage nodes for clustered/cloudy storage?
If so, I know of quite a few places that use XFS for this purpose
and they all seem to measure storage in petabytes made up of small
boxes containing anywhere between 30-100TB each. The only request
for additional preallocation functionality I've got from people
running such applications recently is for XFS_IOC_ZERO_RANGE. This
is quite relevant, because that specifically converts allocated
extents to unwritten extents. i.e. they like to be able to
efficiently re-initialise allocated space to zeros rather than
have it contain stale data.
> and are doing all sorts of things to get the best performance,
> including disabling the journal, I suspect it really would make
> sense.
That's not really a convincing argument for a new interface that
needs to be maintained forever.
> So it could always be an
> optional-to-implement flag, that not all file systems should feel
> obliged to implement.
It could, but it still needs better justification.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Jan 11, 2011 at 04:13:42PM -0500, Lawrence Greenfield wrote:
> On Tue, Nov 9, 2010 at 6:40 PM, Dave Chinner <[email protected]> wrote:
> > The historical reason for such behaviour existing in XFS was that in
> > 1997 the CPU and IO latency cost of unwritten extent conversion was
> > significant,
.....
> >> (Take for example a trusted cluster filesystem backend that checks the
> >> object checksum before returning any data to the user; and if the
> >> check fails the cluster file system will try to use some other replica
> >> stored on some other server.)
> >
> > IOWs, all they want to do is avoid the unwritten extent conversion
> > overhead. Time has shown that a bad security/performance tradeoff
> > decision was made 13 years ago in XFS, so I see little reason to
> > repeat it for ext4 today....
>
> I'd make use of FALLOC_FL_EXPOSE_OLD_DATA. It's not the CPU overhead
> of extent conversion. It's that extent conversion causes more metadata
> operations than what you'd have otherwise,
Yes, that's the "IO latency" part of the cost I mentioned above.
> which means systems that
> want to use O_DIRECT and make sure the data doesn't go away either
> have to write O_DIRECT|O_DSYNC or need to call fdatasync().
Seriously, we tell application writers _all the time_ that they
*must* use fsync/fdatasync to guarantee their data is on stable
storage and that they cannot rely on side-effects of filesystem or
storage specific behaviours (like ext3 ordered mode) to do that job
for them.
You're suggesting that by introducing FALLOC_FL_EXPOSE_OLD_DATA,
applications can rely on filesystem/storage specific behaviour to
guarantee data is on stable storage without the use of
fdatasync/fsync. What you describe is definitely storage specific,
because volatile write caches still need the fdatasync to issue a
cache flush.
Do you see the same conflict here that I do?
> cluster file system implementor
Which one?
Cheers,
Dave.
--
Dave Chinner
[email protected]
On 01/12/2011 07:44 AM, Dave Chinner wrote:
> On Tue, Jan 11, 2011 at 04:13:42PM -0500, Lawrence Greenfield wrote:
>> On Tue, Nov 9, 2010 at 6:40 PM, Dave Chinner<[email protected]> wrote:
>>> The historical reason for such behaviour existing in XFS was that in
>>> 1997 the CPU and IO latency cost of unwritten extent conversion was
>>> significant,
> .....
>
>>>> (Take for example a trusted cluster filesystem backend that checks the
>>>> object checksum before returning any data to the user; and if the
>>>> check fails the cluster file system will try to use some other replica
>>>> stored on some other server.)
>>> IOWs, all they want to do is avoid the unwritten extent conversion
>>> overhead. Time has shown that a bad security/performance tradeoff
>>> decision was made 13 years ago in XFS, so I see little reason to
>>> repeat it for ext4 today....
>> I'd make use of FALLOC_FL_EXPOSE_OLD_DATA. It's not the CPU overhead
>> of extent conversion. It's that extent conversion causes more metadata
>> operations than what you'd have otherwise,
> Yes, that's the "IO latency" part of the cost I mentioned above.
>
>> which means systems that
>> want to use O_DIRECT and make sure the data doesn't go away either
>> have to write O_DIRECT|O_DSYNC or need to call fdatasync().
> Seriously, we tell application writers _all the time_ that they
> *must* use fsync/fdatasync to guarantee their data is on stable
> storage and that they cannot rely on side-effects of filesystem or
> storage specific behaviours (like ext3 ordered mode) to do that job
> for them.
>
> You're suggesting that by introducing FALLOC_FL_EXPOSE_OLD_DATA,
> applications can rely on filesystem/storage specific behaviour to
> guarantee data is on stable storage without the use of
> fdatasync/fsync. What you describe is definitely storage specific,
> because volatile write caches still need the fdatasync to issue a
> cache flush.
>
> Do you see the same conflict here that I do?
>
The very concept seems quite "non-enterprise". I also agree that the cost of
maintaining extra mount options (and code) for something that no sane end user
would ever do seems to be a loss.
Why wouldn't you want to convert the punched hole to an unwritten extent?
Thanks!
Ric