This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags. Turns out
using fiemap in things like cp cause more problems than it solves, so lets try
and give userspace an interface that doesn't suck. So we have
-SEEK_HOLE: this moves the file pos to the nearest hole in the file from the
given position. If the given position is a hole then pos won't move. A "hole"
is defined by whatever the fs feels like defining it to be. In simple things
like ext2/3 it will simplly mean an unallocated space in the file. For more
complex things where you have preallocated space then that is left up to the
filesystem. Since preallocated space is supposed to return all 0's it is
perfectly legitimate to have SEEK_HOLE dump you out at the start of a
preallocated extent, but then again if this is not something you can do and be
sure the extent isn't in the middle of being converted to a real extent then it
is also perfectly legitimate to skip preallocated extents and only park f_pos at
a truly unallocated section.
-SEEK_DATA: this is obviously a little more self-explanatory. Again the only
ambiguity comes in with preallocated extents. If you have an fs that can't
reliably tell that the preallocated extent is in the process of turning into a
real extent, it is correct for SEEK_DATA to park you at a preallocated extent.
Thanks,
Signed-off-by: Josef Bacik <[email protected]>
---
fs/read_write.c | 3 +++
include/linux/fs.h | 4 +++-
2 files changed, 6 insertions(+), 1 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 5520f8a..5446d61 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -64,6 +64,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
return file->f_pos;
offset += file->f_pos;
break;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ return -EINVAL;
}
if (offset < 0 && !unsigned_offsets(file))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dbd860a..1b72e0c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -31,7 +31,9 @@
#define SEEK_SET 0 /* seek relative to beginning of file */
#define SEEK_CUR 1 /* seek relative to current file position */
#define SEEK_END 2 /* seek relative to end of file */
-#define SEEK_MAX SEEK_END
+#define SEEK_HOLE 3 /* seek to the closest hole */
+#define SEEK_DATA 4 /* seek to the closest data */
+#define SEEK_MAX SEEK_DATA
struct fstrim_range {
__u64 start;
--
1.7.2.3
In order to handle SEEK_HOLE/SEEK_DATA we need to implement our own llseek.
Basically for the normal SEEK_*'s we will just defer to the generic helper, and
for SEEK_HOLE/SEEK_DATA we will use our fiemap helper to figure out the nearest
hole or data. Currently this helper doesn't check for delalloc bytes for
prealloc space, so for now treat prealloc as data until that is fixed. Thanks,
Signed-off-by: Josef Bacik <[email protected]>
---
fs/btrfs/ctree.h | 3 ++
fs/btrfs/file.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 87 insertions(+), 1 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d5f043e..d2991c8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2474,6 +2474,9 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
u64 end, struct list_head *list);
/* inode.c */
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create);
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index cd5e82e..f0a7a33 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1406,8 +1406,91 @@ out:
return ret;
}
+static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
+{
+ struct extent_map *em;
+ struct extent_state *cached_state = NULL;
+ u64 lockstart = *offset;
+ u64 lockend = i_size_read(inode);
+ u64 start = *offset;
+ u64 len = i_size_read(inode);
+ int ret = 0;
+
+ lockend = max_t(u64, BTRFS_I(inode)->root->sectorsize, lockend);
+ len = lockend;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ &cached_state, GFP_NOFS);
+ while (1) {
+ em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = -ENXIO;
+ break;
+ }
+
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ if (origin == SEEK_HOLE) {
+ *offset = em->start;
+ free_extent_map(em);
+ break;
+ }
+ } else {
+ if (origin == SEEK_DATA) {
+ *offset = em->start;
+ free_extent_map(em);
+ break;
+ }
+ }
+ start = em->start + em->len;
+ if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+ free_extent_map(em);
+ ret = -ENXIO;
+ break;
+ }
+ free_extent_map(em);
+ }
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ return ret;
+}
+
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+
+ mutex_lock(&inode->i_mutex);
+ switch (origin) {
+ case SEEK_END:
+ case SEEK_CUR:
+ offset = generic_file_llseek_unlocked(file, offset, origin);
+ goto out;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ ret = find_desired_extent(inode, &offset, origin);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ return -EINVAL;
+ if (offset > inode->i_sb->s_maxbytes)
+ return -EINVAL;
+
+ /* Special lock needed here? */
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+out:
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
const struct file_operations btrfs_file_operations = {
- .llseek = generic_file_llseek,
+ .llseek = btrfs_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
--
1.7.2.3
On Apr 21, 2011, at 3:42 PM, Josef Bacik wrote:
> + case SEEK_DATA:
> + case SEEK_HOLE:
> + return -EINVAL;
> }
Maybe we should be returning EOPNOTSUPP?
-- Ted
On 04/21/2011 01:45 PM, Theodore Tso wrote:
> On Apr 21, 2011, at 3:42 PM, Josef Bacik wrote:
>> + case SEEK_DATA:
>> + case SEEK_HOLE:
>> + return -EINVAL;
>> }
>
> Maybe we should be returning EOPNOTSUPP?
>
> -- Ted
For SEEK_HOLE yes. But we could let the default SEEK_DATA be a null-op.
Josef Bacik <josef <at> redhat.com> writes:
> This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags. Turns
> out using fiemap in things like cp cause more problems than it solves, so
> lets try and give userspace an interface that doesn't suck. So we have
>
> -SEEK_HOLE: this moves the file pos to the nearest hole in the file from the
> given position. If the given position is a hole then pos won't move. A "hole"
> is defined by whatever the fs feels like defining it to be.
You absolutely need to match Solaris semantics, which are documented as follows:
� If whence is SEEK_HOLE, the offset of the start of the next hole
greater than or equal to the supplied offset is returned. The def-
inition of a hole is provided near the end of the DESCRIPTION.
� If whence is SEEK_DATA, the file pointer is set to the start of
the next non-hole file region greater than or equal to the sup-
plied offset.
A "hole" is defined as a contiguous range of bytes in a file, all hav-
ing the value of zero, but not all zeros in a file are guaranteed to be
represented as holes returned with SEEK_HOLE. Filesystems are allowed
to expose ranges of zeros with SEEK_HOLE, but not required to. Applica-
tions can use SEEK_HOLE to optimise their behavior for ranges of zeros,
but must not depend on it to find all such ranges in a file. The exis-
tence of a hole at the end of every data region allows for easy pro-
gramming and implies that a virtual hole exists at the end of the file.
Applications should use fpathconf(_PC_MIN_HOLE_SIZE) or path-
conf(_PC_MIN_HOLE_SIZE) to determine if a filesystem supports
SEEK_HOLE. See fpathconf(2).
For filesystems that do not supply information about holes, the file
will be represented as one entire data region.
ENXIO For SEEK_DATA, there are no more data regions past the
supplied offset. For SEEK_HOLE, there are no more holes
past the supplied offset.
Note that in that definition, SEEK_HOLE does _not_ reposition the file offset
(it returns the offset of the next hole, which might be at the end of the file
since all files have a virtual hole at the end, but leaves the position
unchanged). I'd have to write a test program on Solaris to see whether that
definition is actually true, or if it is a bug in the Solaris man page.
Making SEEK_HOLE blindly fail is therefore not useful; you could just as
easily make it succeed and return the offset of the last byte of the file for
all file systems (if the offset requested is within bounds, or fail with
ENXIO if it is out of bounds), and have a valid, working, and minimal
implementation already in place.
>
> -SEEK_DATA: this is obviously a little more self-explanatory. Again the only
> ambiguity comes in with preallocated extents. If you have an fs that can't
> reliably tell that the preallocated extent is in the process of turning into a
> real extent, it is correct for SEEK_DATA to park you at a preallocated extent.
In contrast to SEEK_HOLE, SEEK_DATA _does_ reposition the file, or fail with
ENXIO. Back to your minimal implementation, it would be valid to always fail
with ENXIO (treating every file as a single block of data followed by a single
virtual hole at file end, so there is no next data after any current position).
> +++ b/fs/read_write.c
> @@ -64,6 +64,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t
offset, int origin)
> return file->f_pos;
> offset += file->f_pos;
> break;
> + case SEEK_DATA:
> + case SEEK_HOLE:
> + return -EINVAL;
Whatever you do, returning EINVAL is not nice, especially when it seems pretty
easy to provide a working alternative that assumes all files are a single data
block until fine-tuned otherwise on a per-filesystem basis.
Also, based on the Solaris documentation, you probably ought to couple this
with fpathconf(_PC_MIN_HOLE_SIZE).
Eric Blake
On Thu, Apr 21, 2011 at 6:28 PM, Eric Blake <[email protected]> wrote:
> Josef Bacik <josef <at> redhat.com> writes:
>
>> This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags. Turns
>> out using fiemap in things like cp cause more problems than it solves, so
>> lets try and give userspace an interface that doesn't suck. So we have
>>
>> -SEEK_HOLE: this moves the file pos to the nearest hole in the file from the
>> given position. If the given position is a hole then pos won't move. A "hole"
>> is defined by whatever the fs feels like defining it to be.
>
> You absolutely need to match Solaris semantics, which are documented as follows:
>
> � If whence is SEEK_HOLE, the offset of the start of the next hole
> greater than or equal to the supplied offset is returned. The def-
> inition of a hole is provided near the end of the DESCRIPTION.
>
> � If whence is SEEK_DATA, the file pointer is set to the start of
> the next non-hole file region greater than or equal to the sup-
> plied offset.
>
> A "hole" is defined as a contiguous range of bytes in a file, all hav-
> ing the value of zero, but not all zeros in a file are guaranteed to be
> represented as holes returned with SEEK_HOLE. Filesystems are allowed
> to expose ranges of zeros with SEEK_HOLE, but not required to. Applica-
> tions can use SEEK_HOLE to optimise their behavior for ranges of zeros,
> but must not depend on it to find all such ranges in a file. The exis-
> tence of a hole at the end of every data region allows for easy pro-
> gramming and implies that a virtual hole exists at the end of the file.
> Applications should use fpathconf(_PC_MIN_HOLE_SIZE) or path-
> conf(_PC_MIN_HOLE_SIZE) to determine if a filesystem supports
> SEEK_HOLE. See fpathconf(2).
>
> For filesystems that do not supply information about holes, the file
> will be represented as one entire data region.
>
> ENXIO For SEEK_DATA, there are no more data regions past the
> supplied offset. For SEEK_HOLE, there are no more holes
> past the supplied offset.
>
> Note that in that definition, SEEK_HOLE does _not_ reposition the file offset
> (it returns the offset of the next hole, which might be at the end of the file
> since all files have a virtual hole at the end, but leaves the position
> unchanged). I'd have to write a test program on Solaris to see whether that
> definition is actually true, or if it is a bug in the Solaris man page.
>
lseek's purpose is to reposition the file position, so I'd imagine
this is just a bug in the man page.
> Making SEEK_HOLE blindly fail is therefore not useful; you could just as
> easily make it succeed and return the offset of the last byte of the file for
> all file systems (if the offset requested is within bounds, or fail with
> ENXIO if it is out of bounds), and have a valid, working, and minimal
> implementation already in place.
>
Except you can have blocks allocated past i_size, so returning i_size
isn't necessarily correct. Obviously the idea is to have most of the
filesystems doing the correct thing, but for my first go around I just
was going to do one since I expect quite a bit of repainting for this
bikeshed is yet to be done. But I guess if you have data past i_size
doing this would encourage the various fs people to fix it.
>>
>> -SEEK_DATA: this is obviously a little more self-explanatory. Again the only
>> ambiguity comes in with preallocated extents. If you have an fs that can't
>> reliably tell that the preallocated extent is in the process of turning into a
>> real extent, it is correct for SEEK_DATA to park you at a preallocated extent.
>
> In contrast to SEEK_HOLE, SEEK_DATA _does_ reposition the file, or fail with
> ENXIO. Back to your minimal implementation, it would be valid to always fail
> with ENXIO (treating every file as a single block of data followed by a single
> virtual hole at file end, so there is no next data after any current position).
>
>> +++ b/fs/read_write.c
>> @@ -64,6 +64,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t
> offset, int origin)
>> return file->f_pos;
>> offset += file->f_pos;
>> break;
>> + case SEEK_DATA:
>> + case SEEK_HOLE:
>> + return -EINVAL;
>
> Whatever you do, returning EINVAL is not nice, especially when it seems pretty
> easy to provide a working alternative that assumes all files are a single data
> block until fine-tuned otherwise on a per-filesystem basis.
>
Agreed, I'll fix it. Thanks,
Josef
On Thu, Apr 21, 2011 at 04:45:54PM -0400, Theodore Tso wrote:
>
> On Apr 21, 2011, at 3:42 PM, Josef Bacik wrote:
>
> > + case SEEK_DATA:
> > + case SEEK_HOLE:
> > + return -EINVAL;
> > }
>
> Maybe we should be returning EOPNOTSUPP?
That's definitely wrong ... -ENOSYS might be better!
--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
We'll also need:
- a patch to lseek(2) to document the new flags
- some testcases for xfstests, specially dealing with things that
were problematic in FIEMAP, e.g. data in delalloc extents, making
sure stuff in unwrittent extents partially converted actually gets
copied, etc.
[Eric: please don't drop the Cc list, thanks!]
On Thu, Apr 21, 2011 at 09:22:55PM -0400, Josef Bacik wrote:
> > since all files have a virtual hole at the end, but leaves the position
> > unchanged). ??I'd have to write a test program on Solaris to see whether that
> > definition is actually true, or if it is a bug in the Solaris man page.
> >
>
> lseek's purpose is to reposition the file position, so I'd imagine
> this is just a bug in the man page.
I would be surprised if the bug is around for such a long time, but
otherwise I concur.
> Except you can have blocks allocated past i_size, so returning i_size
> isn't necessarily correct. Obviously the idea is to have most of the
> filesystems doing the correct thing, but for my first go around I just
> was going to do one since I expect quite a bit of repainting for this
> bikeshed is yet to be done. But I guess if you have data past i_size
> doing this would encourage the various fs people to fix it.
Trying to be smart about our semantics is not helpful here. The
interface has been out in Solaris for a while, and we'll have to either
stick to it or use different names for the interface. And staying
compatible is far more useful for userspace programmers.
On 2011.04.22 at 00:50 -0400, Christoph Hellwig wrote:
> [Eric: please don't drop the Cc list, thanks!]
>
> On Thu, Apr 21, 2011 at 09:22:55PM -0400, Josef Bacik wrote:
> > > since all files have a virtual hole at the end, but leaves the position
> > > unchanged). ??I'd have to write a test program on Solaris to see whether that
> > > definition is actually true, or if it is a bug in the Solaris man page.
> > >
> >
> > lseek's purpose is to reposition the file position, so I'd imagine
> > this is just a bug in the man page.
>
> I would be surprised if the bug is around for such a long time, but
> otherwise I concur.
It's a bug. Let me quote what Jeff Bonwick wrote on his blog:
?I'm not sure where you got the impression that either SEEK_HOLE or
SEEK_DATA doesn't set the file pointer. They do. (If they didn't, that
would be weird, and we'd call it out explicitly.)?
http://blogs.sun.com/bonwick/entry/seek_hole_and_seek_data
--
Markus
On 04/21/2011 07:22 PM, Josef Bacik wrote:
>> You absolutely need to match Solaris semantics, which are documented as follows:
>>
>> � If whence is SEEK_HOLE, the offset of the start of the next hole
>> greater than or equal to the supplied offset is returned. The def-
>> inition of a hole is provided near the end of the DESCRIPTION.
>>
>> Note that in that definition, SEEK_HOLE does _not_ reposition the file offset
>> (it returns the offset of the next hole, which might be at the end of the file
>> since all files have a virtual hole at the end, but leaves the position
>> unchanged). I'd have to write a test program on Solaris to see whether that
>> definition is actually true, or if it is a bug in the Solaris man page.
>
> lseek's purpose is to reposition the file position, so I'd imagine
> this is just a bug in the man page.
Confirmed that the Solaris man page is buggy and does not accurately
describe the semantics that are actually implemented. My testing showed:
lseek(fd, off, SEEK_HOLE) has four cases:
off < end, in data: repositions to the next hole as if by lseek(fd,
next_hole, SEEK_CUR); the next hole is guaranteed to exist, even if at
end of file
off < end, in hole, and later in file has data: repositions as if by
lseek(fd, off, SEEK_CUR)
off < end, in hole, and no further data in file: repositions as if by
lseek(fd, 0, SEEK_END)
off == end: fails with ENXIO
lseek(fd, off, SEEK_DATA) has four cases:
off < end, in data: repositions as if by lseek(fd, off, SEEK_CUR)
off < end, in hole, and later in file has data: repositions to the next
data as if by lseek(fd, next_data, SEEK_CUR)
off < end, in hole, and remainder of file is hold: fails with ENXIO
off == end: fails with ENXIO
Here's how I tested:
$ uname -sr
SunOS 5.10
$ cat foo.c
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
static int
make_sparse (const char *name)
{
int fd = open (name, O_RDWR | O_CREAT | O_TRUNC, 0600);
if (fd < 0)
return fd;
printf ("creating %s\n", name);
if (write (fd, "a", 1) != 1)
goto cleanup;
if (lseek (fd, 1024*1024, SEEK_CUR) == -1)
goto cleanup;
if (write (fd, "b", 1) != 1)
goto cleanup;
if (lseek (fd, 0, SEEK_SET) != 0)
goto cleanup;
return fd;
cleanup:
close (fd);
return -1;
}
int
main (int argc, char **argv)
{
off_t off;
off_t end;
int fd = argc > 2 ? open (argv[1], O_RDONLY)
: make_sparse (argc > 1 ? argv[1] : "file");
perror(NULL);
if (fd < 0)
return 2;
printf ("fpathconf gives %ld, ENXIO is %d\n",
fpathconf (fd, _PC_MIN_HOLE_SIZE), ENXIO);
puts ("\ntesting at start");
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_HOLE);
printf ("HOLE gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_DATA);
printf ("DATA gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_HOLE);
printf ("HOLE gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
puts ("\ntesting at end");
errno = 0;
off = end = lseek (fd, 0, SEEK_END);
printf ("end at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, end, SEEK_HOLE);
printf ("HOLE gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, end, SEEK_DATA);
printf ("DATA gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, end - 1, SEEK_HOLE);
printf ("HOLE gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, end - 1, SEEK_DATA);
printf ("DATA gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
puts ("\ntesting at offset 1");
errno = 0;
off = lseek (fd, 1, SEEK_HOLE);
printf ("HOLE gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 1, SEEK_DATA);
printf ("DATA gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
puts ("\ntesting at offset 200000");
errno = 0;
off = lseek (fd, 200000, SEEK_HOLE);
printf ("HOLE gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 200000, SEEK_DATA);
printf ("DATA gives offset %lld, errno %d\n", (long long) off, errno);
errno = 0;
off = lseek (fd, 0, SEEK_CUR);
printf ("CUR at offset %lld, errno %d\n", (long long) off, errno);
if (close (fd))
return 3;
return 0;
}
$ ./foo
creating file
Error 0
fpathconf gives 512, ENXIO is 6
testing at start
CUR at offset 0, errno 0
HOLE gives offset 131072, errno 0
CUR at offset 131072, errno 0
DATA gives offset 0, errno 0
CUR at offset 0, errno 0
HOLE gives offset 131072, errno 0
CUR at offset 131072, errno 0
testing at end
end at offset 1048578, errno 0
HOLE gives offset -1, errno 6
CUR at offset 1048578, errno 0
DATA gives offset -1, errno 6
CUR at offset 1048578, errno 0
HOLE gives offset 1048578, errno 0
CUR at offset 1048578, errno 0
DATA gives offset 1048577, errno 0
CUR at offset 1048577, errno 0
testing at offset 1
HOLE gives offset 131072, errno 0
CUR at offset 131072, errno 0
DATA gives offset 1, errno 0
CUR at offset 1, errno 0
testing at offset 200000
HOLE gives offset 200000, errno 0
CUR at offset 200000, errno 0
DATA gives offset 1048576, errno 0
CUR at offset 1048576, errno 0
$ gtruncate -s 1M sparse
$ ./foo sparse 2
Error 0
fpathconf gives 512, ENXIO is 6
testing at start
CUR at offset 0, errno 0
HOLE gives offset 0, errno 0
CUR at offset 0, errno 0
DATA gives offset -1, errno 6
CUR at offset 0, errno 0
HOLE gives offset 0, errno 0
CUR at offset 0, errno 0
testing at end
end at offset 1048576, errno 0
HOLE gives offset -1, errno 6
CUR at offset 1048576, errno 0
DATA gives offset -1, errno 6
CUR at offset 1048576, errno 0
HOLE gives offset 1048576, errno 0
CUR at offset 1048576, errno 0
DATA gives offset -1, errno 6
CUR at offset 1048576, errno 0
testing at offset 1
HOLE gives offset 1, errno 0
CUR at offset 1, errno 0
DATA gives offset -1, errno 6
CUR at offset 1, errno 0
testing at offset 200000
HOLE gives offset 1048576, errno 0
CUR at offset 1048576, errno 0
DATA gives offset -1, errno 6
CUR at offset 1048576, errno 0
$
--
Eric Blake [email protected] +1-801-349-2682
Libvirt virtualization library http://libvirt.org
On 04/22/2011 05:28 AM, Markus Trippelsdorf wrote:
> On 2011.04.22 at 00:50 -0400, Christoph Hellwig wrote:
>> [Eric: please don't drop the Cc list, thanks!]
That's the fault of gmane. My apologies for not being subscribed in the
first place, and for gmane's refusal to cc all. Now that I'm on the
cc-chain, it won't happen again for this thread.
>>> lseek's purpose is to reposition the file position, so I'd imagine
>>> this is just a bug in the man page.
>>
>> I would be surprised if the bug is around for such a long time, but
>> otherwise I concur.
>
> It's a bug. Let me quote what Jeff Bonwick wrote on his blog:
>
> ?I'm not sure where you got the impression that either SEEK_HOLE or
> SEEK_DATA doesn't set the file pointer. They do. (If they didn't, that
> would be weird, and we'd call it out explicitly.)?
>
> http://blogs.sun.com/bonwick/entry/seek_hole_and_seek_data
That blog also mentioned the useful idea of adding FIND_HOLE and
FIND_DATA, not implemented in Solaris, but which could easily be
provided as additional lseek constants in Linux to locate the start of
the next chunk without repositioning and which could ease application
programmer's life a bit. After all, cp wants to know where data ends
without repositioning (FIND_HOLE), read() that much data which
repositions in the process, then skip to the next chunk of data
(SEEK_DATA) - two lseek() calls per iteration if we have 4 constants,
but 3 per iteration if we only have SEEK_HOLE and have to manually rewind.
--
Eric Blake [email protected] +1-801-349-2682
Libvirt virtualization library http://libvirt.org
On 04/22/2011 05:28 AM, Markus Trippelsdorf wrote:
>> I would be surprised if the bug is around for such a long time, but
>> otherwise I concur.
>
> It's a bug. Let me quote what Jeff Bonwick wrote on his blog:
>
> ?I'm not sure where you got the impression that either SEEK_HOLE or
> SEEK_DATA doesn't set the file pointer. They do. (If they didn't, that
> would be weird, and we'd call it out explicitly.)?
>
> http://blogs.sun.com/bonwick/entry/seek_hole_and_seek_data
I've created a request to standardize SEEK_HOLE and SEEK_DATA in the
next revision of POSIX; comments are welcome to make sure that everyone
is happy with wording:
http://austingroupbugs.net/view.php?id=415
--
Eric Blake [email protected] +1-801-349-2682
Libvirt virtualization library http://libvirt.org
On 04/22/2011 04:50 AM, Eric Blake wrote:
> That blog also mentioned the useful idea of adding FIND_HOLE and
> FIND_DATA, not implemented in Solaris, but which could easily be
> provided as additional lseek constants in Linux to locate the start of
> the next chunk without repositioning and which could ease application
> programmer's life a bit. After all, cp wants to know where data ends
> without repositioning (FIND_HOLE), read() that much data which
> repositions in the process, then skip to the next chunk of data
> (SEEK_DATA) - two lseek() calls per iteration if we have 4 constants,
> but 3 per iteration if we only have SEEK_HOLE and have to manually rewind.
while(1) {
read(block);
if (block_all_zeroes)
lseek(SEEK_DATA);
}
What's wrong with the above? If this is the case, even SEEK_HOLE
is not needed but should be added as it is already in Solaris.
My problem with FIND_* is that we are messing with the well understood
semantics of lseek().
And if generic_file_llseek_unlocked() treats SEEK_DATA as SEEK_CUR and
SEEK_HOLE as SEEK_END (both with zero offset) then we don't even
have to bother with the finding the "correct" error code.
On 04/22/2011 10:28 AM, Sunil Mushran wrote:
> On 04/22/2011 04:50 AM, Eric Blake wrote:
>> That blog also mentioned the useful idea of adding FIND_HOLE and
>> FIND_DATA, not implemented in Solaris, but which could easily be
>> provided as additional lseek constants in Linux to locate the start of
>> the next chunk without repositioning and which could ease application
>> programmer's life a bit. After all, cp wants to know where data ends
>> without repositioning (FIND_HOLE), read() that much data which
>> repositions in the process, then skip to the next chunk of data
>> (SEEK_DATA) - two lseek() calls per iteration if we have 4 constants,
>> but 3 per iteration if we only have SEEK_HOLE and have to manually
>> rewind.
>
> while(1) {
> read(block);
> if (block_all_zeroes)
> lseek(SEEK_DATA);
> }
>
> What's wrong with the above? If this is the case, even SEEK_HOLE
> is not needed but should be added as it is already in Solaris.
Because you don't know if the block is the same size as the minimum
hole, and because some systems require rather large holes (my Solaris
testing on a zfs system didn't have holes until 128k), that's a rather
large amount of reading just to prove that the block has all zeros to
know that it is even worth trying the lseek(SEEK_DATA). My gut feel is
that doing the lseek(SEEK_HOLE) up front coupled with seeking back to
the same position is more efficient than manually checking for a run of
zeros (less cache pollution, works with 4k read buffers without having
to know filesystem hole size).
>
> My problem with FIND_* is that we are messing with the well understood
> semantics of lseek().
You'll notice I didn't propose any FIND_* constants for POSIX.
> And if generic_file_llseek_unlocked() treats SEEK_DATA as SEEK_CUR and
You meant SEEK_SET not SEEK_CUR, but...
> SEEK_HOLE as SEEK_END (both with zero offset) then we don't even
> have to bother with the finding the "correct" error code.
...that's still not compatible with Solaris. On a file with size of 0
bytes, lseek(fd, 1, SEEK_SET) and lseek(fd, 0, SEEK_END) will both
succeed, but lseek(fd, 1, SEEK_DATA) and lseek(fd, 0, SEEK_HOLE) must
fail with ENXIO (the offset was at or beyond the size of the file).
For a file with no holes, Solaris semantics behave as if:
off_t lseek(int fildes, off_t offset, int whence)
{
off_t cur, end;
switch (whence)
{
case SEEK_HOLE:
case SEEK_DATA:
cur = lseek(fildes, 0, SEEK_CUR);
if (cur < 0)
return cur;
end = lseek(fildes, 0, SEEK_END);
if (end < 0)
return end;
if (offset < end)
return whence == SEEK_HOLE ? end : lseek(fildes, offset,
SEEK_SET);
lseek(fildes, cur, SEEK_SET);
errno = ENXIO;
return -1;
default:
... /* Existing implementation */
}
}
--
Eric Blake [email protected] +1-801-349-2682
Libvirt virtualization library http://libvirt.org
On 04/22/2011 09:40 AM, Eric Blake wrote:
> On 04/22/2011 10:28 AM, Sunil Mushran wrote:
>> while(1) {
>> read(block);
>> if (block_all_zeroes)
>> lseek(SEEK_DATA);
>> }
>>
>> What's wrong with the above? If this is the case, even SEEK_HOLE
>> is not needed but should be added as it is already in Solaris.
> Because you don't know if the block is the same size as the minimum
> hole, and because some systems require rather large holes (my Solaris
> testing on a zfs system didn't have holes until 128k), that's a rather
> large amount of reading just to prove that the block has all zeros to
> know that it is even worth trying the lseek(SEEK_DATA). My gut feel is
> that doing the lseek(SEEK_HOLE) up front coupled with seeking back to
> the same position is more efficient than manually checking for a run of
> zeros (less cache pollution, works with 4k read buffers without having
> to know filesystem hole size).
Holes are an implementation detail.
cp can read whatever blocksize it chooses. If that block contains
zero, it would signal cp that maybe it should SEEK_DATA and skip
reading all those blocks. That's all. We are not trying to achieve
perfection. We are just trying to reduce cpu waste.
If the fs supports SEEK_*, then great. If it does not, then it is no
worse than before.
On 04/22/2011 10:57 AM, Sunil Mushran wrote:
> On 04/22/2011 09:40 AM, Eric Blake wrote:
>> On 04/22/2011 10:28 AM, Sunil Mushran wrote:
>>> while(1) {
>>> read(block);
>>> if (block_all_zeroes)
>>> lseek(SEEK_DATA);
>>> }
>>>
>>> What's wrong with the above? If this is the case, even SEEK_HOLE
>>> is not needed but should be added as it is already in Solaris.
>
> Holes are an implementation detail.
Nobody's arguing that. And on Solaris, a file system with no holes
support tells you that up front - lseek(fd, 0, SEEK_HOLE) returns the
end of the file (unless the file is 0 bytes, then it fails with ENXIO).
>
> cp can read whatever blocksize it chooses. If that block contains
> zero, it would signal cp that maybe it should SEEK_DATA and skip
> reading all those blocks. That's all. We are not trying to achieve
> perfection. We are just trying to reduce cpu waste.
>
> If the fs supports SEEK_*, then great. If it does not, then it is no
> worse than before.
But providing just SEEK_DATA _is_ worse than before if you don't provide
the correct SEEK_HOLE everywhere. Because then your algorithm of trying
lseek(SEEK_DATA) after every run of zeros in the hopes of an
optimization is a wasted syscall, since it will just return your current
offset every time, so you end up with more syscalls than if you had used
the single lseek(SEEK_DATA) that returns the end of the file up front,
and known that the remainder of the file has no holes to even try
seeking past.
--
Eric Blake [email protected] +1-801-349-2682
Libvirt virtualization library http://libvirt.org
On 04/22/2011 10:03 AM, Eric Blake wrote:
>> cp can read whatever blocksize it chooses. If that block contains
>> zero, it would signal cp that maybe it should SEEK_DATA and skip
>> reading all those blocks. That's all. We are not trying to achieve
>> perfection. We are just trying to reduce cpu waste.
>>
>> If the fs supports SEEK_*, then great. If it does not, then it is no
>> worse than before.
> But providing just SEEK_DATA _is_ worse than before if you don't provide
> the correct SEEK_HOLE everywhere. Because then your algorithm of trying
> lseek(SEEK_DATA) after every run of zeros in the hopes of an
> optimization is a wasted syscall, since it will just return your current
> offset every time, so you end up with more syscalls than if you had used
> the single lseek(SEEK_DATA) that returns the end of the file up front,
> and known that the remainder of the file has no holes to even try
> seeking past.
You are over-optimizing. strace any process on your box and you
will find numerous wasted syscalls.
On 2011-04-22, at 11:08 AM, Sunil Mushran <[email protected]> wrote:
> On 04/22/2011 10:03 AM, Eric Blake wrote:
>>> cp can read whatever blocksize it chooses. If that block contains
>>> zero, it would signal cp that maybe it should SEEK_DATA and skip
>>> reading all those blocks. That's all. We are not trying to achieve
>>> perfection. We are just trying to reduce cpu waste.
>>>
>>> If the fs supports SEEK_*, then great. If it does not, then it is no
>>> worse than before.
>> But providing just SEEK_DATA _is_ worse than before if you don't provide
>> the correct SEEK_HOLE everywhere. Because then your algorithm of trying
>> lseek(SEEK_DATA) after every run of zeros in the hopes of an
>> optimization is a wasted syscall, since it will just return your current
>> offset every time, so you end up with more syscalls than if you had used
>> the single lseek(SEEK_DATA) that returns the end of the file up front,
>> and known that the remainder of the file has no holes to even try
>> seeking past.
>
> You are over-optimizing. strace any process on your box and you
> will find numerous wasted syscalls.
Sure, there are lots of wasted syscalls, but in this case the cost of doing extra SEEK_DATA and SEEK_HOLE operations may be fairly costly. This involves scanning the whole disk layout, possibly over a network, which might need tens or hundreds of disk seeks to read the metadata, unlike regular SEEK_SET.
Even SEEK_END is somewhat costly on a network filesystem, since the syscall needs to lock the file size in order to determine the end of the file, which can block other threads from writing to the file.
So while I agree that the Linux mantra that "syscalls are cheap" is true if those syscalls don't actually do any work, I think it also ignores the cost of what some syscalls need to do behind the scenes.
Cheers, Andreas-
Hi,
Josef Bacik wrote:
> This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags. Turns out
> using fiemap in things like cp cause more problems than it solves, so lets try
> and give userspace an interface that doesn't suck. So we have
That's easy to believe, but could you elaborate? What problem does
using fiemap cause? I assume the answer is somewhere in somewhere in
the thread
http://thread.gmane.org/gmane.comp.file-systems.xfs.general/37895/focus=24404
but it would be nice to have a summary in the commit log for
posterity.
Thanks for working on this.
On 04/22/2011 01:10 PM, Jonathan Nieder wrote:
> Hi,
>
> Josef Bacik wrote:
>
>> This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags. Turns out
>> using fiemap in things like cp cause more problems than it solves, so lets try
>> and give userspace an interface that doesn't suck. So we have
> That's easy to believe, but could you elaborate? What problem does
> using fiemap cause? I assume the answer is somewhere in somewhere in
> the thread
>
> http://thread.gmane.org/gmane.comp.file-systems.xfs.general/37895/focus=24404
>
> but it would be nice to have a summary in the commit log for
> posterity.
http://article.gmane.org/gmane.comp.file-systems.ext4/24465
On 04/22/2011 11:06 AM, Andreas Dilger wrote:
> Sure, there are lots of wasted syscalls, but in this case the cost of doing extra SEEK_DATA and SEEK_HOLE operations may be fairly costly. This involves scanning the whole disk layout, possibly over a network, which might need tens or hundreds of disk seeks to read the metadata, unlike regular SEEK_SET.
>
> Even SEEK_END is somewhat costly on a network filesystem, since the syscall needs to lock the file size in order to determine the end of the file, which can block other threads from writing to the file.
>
> So while I agree that the Linux mantra that "syscalls are cheap" is true if those syscalls don't actually do any work, I think it also ignores the cost of what some syscalls need to do behind the scenes.
You have a point. I was just reviewing the possible patch for ocfs2
and it looks heavy.
One option is to scrap SEEK_* and make another syscall llfind() with FIND_*.
Or, do both.
Sunil Mushran wrote:
> On 04/22/2011 04:50 AM, Eric Blake wrote:
> >That blog also mentioned the useful idea of adding FIND_HOLE and
> >FIND_DATA, not implemented in Solaris, but which could easily be
> >provided as additional lseek constants in Linux to locate the start of
> >the next chunk without repositioning and which could ease application
> >programmer's life a bit. After all, cp wants to know where data ends
> >without repositioning (FIND_HOLE), read() that much data which
> >repositions in the process, then skip to the next chunk of data
> >(SEEK_DATA) - two lseek() calls per iteration if we have 4 constants,
> >but 3 per iteration if we only have SEEK_HOLE and have to manually rewind.
>
> while(1) {
> read(block);
> if (block_all_zeroes)
> lseek(SEEK_DATA);
> }
>
> What's wrong with the above? If this is the case, even SEEK_HOLE
> is not needed but should be added as it is already in Solaris.
Apart from the obvious waste of effort (scanning *all* data for zeros
is cheap but not free if the file is mostly non-hole zeros), you can't
do a pread() version of the above in parallel over different parts of
the same file/device.
> My problem with FIND_* is that we are messing with the well understood
> semantics of lseek().
fcntl() looks a better fit for FIND_HOLE/DATA anyway.
-- Jamie
On Thu, 21 Apr 2011 15:42:33 EDT, Josef Bacik said:
> -SEEK_HOLE: this moves the file pos to the nearest hole in the file from the
> given position.
Do we want the semantic to be "the nearest" hole? Or did we really want "the
next" hole? Loops like a bullet loaded in the chamber and pointed at the
programmer's foot if they aren't allowing for the fact that this can go
*backwards* in the file if the closest hole is towards the beginning. Good way
to end up in an infinite loop or other messy...
Consider the obvious implementation of "skip over a hole" - lseek(SEEK_HOLE),
lseek(SEEK_DATA). and start reading data because we've skipped over the hole.
Wrong - the second seek may have gone backwards if the data was only 4K away
but the hole was 64K in size...
On 04/24/2011 11:49 AM, Jamie Lokier wrote:
>> My problem with FIND_* is that we are messing with the well understood
>> semantics of lseek().
>
> fcntl() looks a better fit for FIND_HOLE/DATA anyway.
With fcntl(), it would have to be something like:
off_t offset = start;
if (fcntl (fd, F_FIND_HOLE, &offset) != 0)
; // find failed
// offset is now set to the next location after start
That is, offset has to be passed as an input _and_ output parameter,
since we can't use the int return value of fcntl to convey off_t
information, and since the optional third argument of fcntl has
traditionally been no wider than a pointer which is not the case for
off_t on 32-bit platforms.
--
Eric Blake [email protected] +1-801-349-2682
Libvirt virtualization library http://libvirt.org
Eric Blake wrote:
> On 04/24/2011 11:49 AM, Jamie Lokier wrote:
> >> My problem with FIND_* is that we are messing with the well understood
> >> semantics of lseek().
> >
> > fcntl() looks a better fit for FIND_HOLE/DATA anyway.
>
> With fcntl(), it would have to be something like:
>
> off_t offset = start;
> if (fcntl (fd, F_FIND_HOLE, &offset) != 0)
> ; // find failed
> // offset is now set to the next location after start
Yes that, or a pointer-to-struct in the style of other fcntl()
operations.
A struct offers more flexibiliy such as a limit on search distance
(may be useful on filesystems where searching reads a lot of metadata
and you don't really want to scan all of a large file), and whether to
include unwritten prealloc space or written-known-zeros space.
I thought of fcntl() because historically it's often how you get quirky
information about files and how to read them, on many OSes.
-- Jamie
Hi Eric,
On 2011-04-22 07:06 -0600, Eric Blake wrote:
> I've created a request to standardize SEEK_HOLE and SEEK_DATA in the
> next revision of POSIX; comments are welcome to make sure that everyone
> is happy with wording:
> http://austingroupbugs.net/view.php?id=415
Reading through your proposal, I think there is one thing that could be
clarified: the meaning of "the last hole" in the file. Consider the
following two file layouts -- in the "diagrams", a bar (|) indicates a
boundary between holes and non-hole data, and a double bar (||)
indicates end-of-file.
* File A (sparse file created by lseek/write beyond end-of-file):
data | hole 0 | data || hole 1 (virtual)
* File B (sparse file created by truncate beyond end-of-file):
data | hole 0 || hole 1 (virtual)
Excluding the error description, the term "the last hole" is used in
two places in your proposal:
* (for SEEK_HOLE): if offset falls within "the last hole", then the
file offset may be set to the file size instead.
* (for SEEK_DATA): it shall be an error ... if offset falls within the
last hole.
I imagine that both of these conditions are intended to address the
case where the offset falls within hole 0 in File B, that is, when
there is no non-hole data beyond the specified offset but the offset
is nevertheless less than the file size. However, this looks (to me)
like the penultimate hole in the file, not the last hole. Furthermore,
these conditions are presumably *not* intended to apply to the
penultimate hole in File A, which has data after it.
I think my confusion can be avoided by talking about the last non-hole
data byte in the file (which is unambigious), instead of by talking
about the last hole. For instance, the SEEK_HOLE/SEEK_DATA descriptions
could be written as follows:
If whence is SEEK_HOLE, the file offset shall be set to the smallest
location of a byte within a hole and not less than offset, except that
if offset falls beyond the last byte not within a hole, then the file
offset may be set to the file size instead. It shall be an error if
offset is greater or equal to the size of the file.
If whence is SEEK_DATA, the file offset shall be set to the smallest
location of a byte not within a hole and not less than offset. It shall
be an error if no such byte exists.
plus a corresponding update to the ENXIO description:
... or the whence argument is SEEK_DATA and the offset falls beyond
the last byte not within a hole.
--
Nick Bowler, Elliptic Technologies (http://www.elliptictech.com/)
On 04/25/2011 09:02 AM, Nick Bowler wrote:
Hi Nick,
> * File A (sparse file created by lseek/write beyond end-of-file):
>
> data | hole 0 | data || hole 1 (virtual)
>
> * File B (sparse file created by truncate beyond end-of-file):
>
> data | hole 0 || hole 1 (virtual)
>
> Excluding the error description, the term "the last hole" is used in
> two places in your proposal:
>
> * (for SEEK_HOLE): if offset falls within "the last hole", then the
> file offset may be set to the file size instead.
>
> * (for SEEK_DATA): it shall be an error ... if offset falls within the
> last hole.
>
> I imagine that both of these conditions are intended to address the
> case where the offset falls within hole 0 in File B, that is, when
> there is no non-hole data beyond the specified offset but the offset
> is nevertheless less than the file size.
Correct.
> However, this looks (to me)
> like the penultimate hole in the file, not the last hole. Furthermore,
> these conditions are presumably *not* intended to apply to the
> penultimate hole in File A, which has data after it.
Good catch.
>
> I think my confusion can be avoided by talking about the last non-hole
> data byte in the file (which is unambigious), instead of by talking
> about the last hole. For instance, the SEEK_HOLE/SEEK_DATA descriptions
> could be written as follows:
>
> If whence is SEEK_HOLE, the file offset shall be set to the smallest
> location of a byte within a hole and not less than offset, except that
> if offset falls beyond the last byte not within a hole, then the file
> offset may be set to the file size instead. It shall be an error if
> offset is greater or equal to the size of the file.
>
> If whence is SEEK_DATA, the file offset shall be set to the smallest
> location of a byte not within a hole and not less than offset. It shall
> be an error if no such byte exists.
>
> plus a corresponding update to the ENXIO description:
>
> ... or the whence argument is SEEK_DATA and the offset falls beyond
> the last byte not within a hole.
I've added your improved wording as a comment at
http://austingroupbugs.net/view.php?id=415
--
Eric Blake [email protected] +1-801-349-2682
Libvirt virtualization library http://libvirt.org