IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same
arguments.
Signed-off-by: Lennert Buytenhek <[email protected]>
---
This seems to work OK, but I'd appreciate a review from someone more
familiar with io_uring internals than I am, as I'm not entirely sure
I did everything quite right.
A dumb test program for IORING_OP_GETDENTS64 is available here:
https://krautbox.wantstofly.org/~buytenh/uringfind.c
This does more or less what find(1) does: it scans recursively through
a directory tree and prints the names of all directories and files it
encounters along the way -- but using io_uring. (The uring version
prints the names of encountered files and directories in an order that's
determined by SQE completion order, which is somewhat nondeterministic
and likely to differ between runs.)
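For a feel for what driving this opcode from userspace looks like,
here is a minimal sketch (not uringfind.c itself; the prep helper is
hypothetical, built on liburing's generic io_uring_prep_rw(), and the
IORING_OP_GETDENTS64 constant comes from the patched kernel headers):

/* Minimal sketch: read one batch of directory entries via
 * IORING_OP_GETDENTS64 and print the names. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <liburing.h>

struct linux_dirent64 {
	uint64_t	d_ino;
	int64_t		d_off;
	uint16_t	d_reclen;
	uint8_t		d_type;
	char		d_name[];
};

static void io_uring_prep_getdents64(struct io_uring_sqe *sqe, int fd,
				     void *buf, unsigned int count)
{
	io_uring_prep_rw(IORING_OP_GETDENTS64, sqe, fd, buf, count, 0);
}

int main(int argc, char *argv[])
{
	uint8_t buf[16384];
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd, ret, off;

	fd = open(argc > 1 ? argv[1] : ".", O_RDONLY | O_DIRECTORY);
	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_getdents64(sqe, fd, buf, sizeof(buf));
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	ret = cqe->res;		/* dirent bytes written into buf, or -errno */
	io_uring_cqe_seen(&ring, cqe);

	for (off = 0; off < ret; ) {
		struct linux_dirent64 *d = (void *)(buf + off);

		printf("%s\n", d->d_name);
		off += d->d_reclen;
	}

	return 0;
}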
On a directory tree with 14-odd million files that lives on a
six-drive (spinning disk) btrfs raid, find(1) takes:
# echo 3 > /proc/sys/vm/drop_caches
# time find /mnt/repo > /dev/null
real 24m7.815s
user 0m15.015s
sys 0m48.340s
#
And the io_uring version takes:
# echo 3 > /proc/sys/vm/drop_caches
# time ./uringfind /mnt/repo > /dev/null
real 10m29.064s
user 0m4.347s
sys 0m1.677s
#
These timings are repeatable and consistent to within a few seconds.
(btrfs seems to be sending most metadata reads to the same drive in the
array during this test, even though this filesystem is using the raid1c4
profile for metadata, so I suspect that more drive-level parallelism can
be extracted with some btrfs tweaks.)
The fully cached case also shows some speedup for the io_uring version:
# time find /mnt/repo > /dev/null
real 0m5.223s
user 0m1.926s
sys 0m3.268s
#
vs:
# time ./uringfind /mnt/repo > /dev/null
real 0m3.604s
user 0m2.417s
sys 0m0.793s
#
That said, the point of this patch isn't primarily to enable
lightning-fast find(1) or du(1), but rather to complete the set of
filesystem I/O primitives available via io_uring, so that applications
can do all of their filesystem I/O using the same mechanism, without
having to manually punt some of their work out to worker threads -- and
indeed, an object storage backend server that I wrote a while ago can
run with a pure io_uring based event loop with this patch.
One open question is whether IORING_OP_GETDENTS64 should be more like
pread(2) and allow passing in a starting offset to read the
directory from. (This would require some more surgery in fs/readdir.c.)
fs/io_uring.c | 51 ++++++++++++++++++++++++++++++++++++++++++
fs/readdir.c | 25 ++++++++++++++------
include/linux/fs.h | 4 +++
include/uapi/linux/io_uring.h | 1 +
4 files changed, 73 insertions(+), 8 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 985a9e3f976d..5d79b9668ee0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -572,6 +572,12 @@ struct io_unlink {
struct filename *filename;
};
+struct io_getdents64 {
+ struct file *file;
+ struct linux_dirent64 __user *dirent;
+ unsigned int count;
+};
+
struct io_completion {
struct file *file;
struct list_head list;
@@ -699,6 +705,7 @@ struct io_kiocb {
struct io_shutdown shutdown;
struct io_rename rename;
struct io_unlink unlink;
+ struct io_getdents64 getdents64;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
@@ -987,6 +994,11 @@ static const struct io_op_def io_op_defs[] = {
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
},
+ [IORING_OP_GETDENTS64] = {
+ .needs_file = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
};
enum io_mem_account {
@@ -4552,6 +4564,40 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
return 0;
}
+static int io_getdents64_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_getdents64 *getdents64 = &req->getdents64;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index)
+ return -EINVAL;
+
+ getdents64->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ getdents64->count = READ_ONCE(sqe->len);
+ return 0;
+}
+
+static int io_getdents64(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_getdents64 *getdents64 = &req->getdents64;
+ int ret;
+
+ /* getdents64 always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = vfs_getdents64(req->file, getdents64->dirent, getdents64->count);
+ if (ret < 0) {
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ req_set_fail_links(req);
+ }
+ io_req_complete(req, ret);
+ return 0;
+}
+
#if defined(CONFIG_NET)
static int io_setup_async_msg(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
@@ -6078,6 +6124,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_renameat_prep(req, sqe);
case IORING_OP_UNLINKAT:
return io_unlinkat_prep(req, sqe);
+ case IORING_OP_GETDENTS64:
+ return io_getdents64_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6337,6 +6385,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
case IORING_OP_UNLINKAT:
ret = io_unlinkat(req, force_nonblock);
break;
+ case IORING_OP_GETDENTS64:
+ ret = io_getdents64(req, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/fs/readdir.c b/fs/readdir.c
index 19434b3c982c..5310677d5d36 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -348,10 +348,9 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
return -EFAULT;
}
-SYSCALL_DEFINE3(getdents64, unsigned int, fd,
- struct linux_dirent64 __user *, dirent, unsigned int, count)
+int vfs_getdents64(struct file *file, struct linux_dirent64 __user *dirent,
+ unsigned int count)
{
- struct fd f;
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
.count = count,
@@ -359,11 +358,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!f.file)
- return -EBADF;
-
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(file, &buf.ctx);
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
@@ -376,6 +371,20 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
else
error = count - buf.count;
}
+ return error;
+}
+
+SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+ struct linux_dirent64 __user *, dirent, unsigned int, count)
+{
+ struct fd f;
+ int error;
+
+ f = fdget_pos(fd);
+ if (!f.file)
+ return -EBADF;
+
+ error = vfs_getdents64(f.file, dirent, count);
fdput_pos(f);
return error;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..602202a8fc1f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3109,6 +3109,10 @@ extern const struct inode_operations simple_symlink_inode_operations;
extern int iterate_dir(struct file *, struct dir_context *);
+struct linux_dirent64;
+int vfs_getdents64(struct file *file, struct linux_dirent64 __user *dirent,
+ unsigned int count);
+
int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
int flags);
int vfs_fstat(int fd, struct kstat *stat);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d31a2a1e8ef9..5602414735f7 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -137,6 +137,7 @@ enum {
IORING_OP_SHUTDOWN,
IORING_OP_RENAMEAT,
IORING_OP_UNLINKAT,
+ IORING_OP_GETDENTS64,
/* this goes last, obviously */
IORING_OP_LAST,
On 1/23/21 4:41 AM, Lennert Buytenhek wrote:
> IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same
> arguments.
>
> [...]
The results look nice for sure. One concern is that io_uring generally
guarantees that any state passed in is stable once submit is done. For
the below implementation, that doesn't hold as the linux_dirent64 isn't
used until later in the process. That means if you do:
submit_getdents64(ring)
{
	struct linux_dirent64 dent;
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_getdents64(sqe, ..., &dent);
	io_uring_submit(ring);
}

other_func(ring)
{
	struct io_uring_cqe *cqe;

	submit_getdents64(ring);
	io_uring_wait_cqe(ring, &cqe);
}
then the kernel side might get garbage by the time the sqe is actually
submitted. This is true because you don't use it inline, only from the
out-of-line async context. Usually this is solved by having the prep
side copy in the necessary state, eg see io_openat2_prep() for how we
make filename and open_how stable by copying them into kernel memory.
That ensures that if/when these operations need to go async and finish
out-of-line, the contents are stable and there's no requirement for the
application to keep them valid once submission is done.
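Condensed, the copy-in part of io_openat2_prep() looks roughly like
this:

/* Roughly the relevant part of io_openat2_prep(): the open_how struct
 * (and, via __io_openat_prep()/getname(), the filename) is copied into
 * kernel memory at prep time, so the request no longer depends on the
 * application keeping it valid after submission. */
static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct open_how __user *how;
	size_t len;
	int ret;

	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	len = READ_ONCE(sqe->len);
	if (len < OPEN_HOW_SIZE_VER0)
		return -EINVAL;

	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how),
				    how, len);
	if (ret)
		return ret;

	return __io_openat_prep(req, sqe);
}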
Not sure how best to solve that, since the vfs side relies heavily on
linux_dirent64 being a user pointer...
Outside of that, the implementation looks straightforward.
--
Jens Axboe
On Sat, Jan 23, 2021 at 10:37:25AM -0700, Jens Axboe wrote:
> > IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same
> > arguments.
> >
> > [...]
>
> The results look nice for sure.
Thanks! And thank you for having a look.
> One concern is that io_uring generally
> guarantees that any state passed in is stable once submit is done. For
> the below implementation, that doesn't hold as the linux_dirent64 isn't
> used until later in the process. That means if you do:
>
> [...]
>
> then the kernel side might get garbage by the time the sqe is actually
> submitted. This is true because you don't use it inline, only from the
> out-of-line async context. Usually this is solved by having the prep
> side copy in the necessary state, eg see io_openat2_prep() for how we
> make filename and open_how stable by copying them into kernel memory.
> That ensures that if/when these operations need to go async and finish
> out-of-line, the contents are stable and there's no requirement for the
> application to keep them valid once submission is done.
>
> Not sure how best to solve that, since the vfs side relies heavily on
> linux_dirent64 being a user pointer...
No data is passed into the kernel on a getdents64(2) call via user
memory, i.e. getdents64(2) only ever writes into the supplied
linux_dirent64 user pointer, it never reads from it. The only things
that we need to keep stable here are the linux_dirent64 pointer itself
and the 'count' argument and those are both passed in via the SQE, and
we READ_ONCE() them from the SQE in the prep function. I think that's
probably the source of confusion here?
Cheers,
Lennert
On 1/23/21 11:16 AM, Lennert Buytenhek wrote:
> On Sat, Jan 23, 2021 at 10:37:25AM -0700, Jens Axboe wrote:
>
>>> IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same
>>> arguments.
>>>
>>> [...]
>>
>> The results look nice for sure.
>
> Thanks! And thank you for having a look.
>
>
>> One concern is that io_uring generally
>> guarantees that any state passed in is stable once submit is done. For
>> the below implementation, that doesn't hold as the linux_dirent64 isn't
>> used until later in the process. That means if you do:
>>
>> [...]
>>
>> Not sure how best to solve that, since the vfs side relies heavily on
>> linux_dirent64 being a user pointer...
>
> No data is passed into the kernel on a getdents64(2) call via user
> memory, i.e. getdents64(2) only ever writes into the supplied
> linux_dirent64 user pointer, it never reads from it. The only things
> that we need to keep stable here are the linux_dirent64 pointer itself
> and the 'count' argument and those are both passed in via the SQE, and
> we READ_ONCE() them from the SQE in the prep function. I think that's
> probably the source of confusion here?
Good point, in fact even if we did read from it as well, the fact that
we write to it already means that it must be stable until completion
on the application side anyway. So I guess there's no issue here!
For the "real" patch, I'd split the vfs prep side into a separate one,
and then have patch 2 be the io_uring side only. Then we'll need a
test case that can be added to liburing as well (and necessary changes
on the liburing side, like op code and prep helper).
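A prep helper for this could look something like the following (a
sketch modeled on the existing io_uring_prep_*() helpers, not actual
liburing code):

static inline void io_uring_prep_getdents64(struct io_uring_sqe *sqe, int fd,
					    struct linux_dirent64 *dirent,
					    unsigned int count)
{
	/* getdents64 takes a buffer + count and (for now) no offset */
	io_uring_prep_rw(IORING_OP_GETDENTS64, sqe, fd, dirent, count, 0);
}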
--
Jens Axboe
On Sat, Jan 23, 2021 at 01:41:52PM +0200, Lennert Buytenhek wrote:
> IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same
Could we drop the '64'? We don't, for example, have IORING_OP_FADVISE64
even though that's the name of the syscall.
On 1/23/21 4:27 PM, Matthew Wilcox wrote:
> On Sat, Jan 23, 2021 at 01:41:52PM +0200, Lennert Buytenhek wrote:
>> IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same
>
> Could we drop the '64'? We don't, for example, have IORING_OP_FADVISE64
> even though that's the name of the syscall.
Agreed, the only case where we do mimic the names is for things like
IORING_OP_OPENAT2 where it does carry meaning. For this one, it should
just be IORING_OP_GETDENTS.
--
Jens Axboe
Hi,
On 2021-01-23 13:41:52 +0200, Lennert Buytenhek wrote:
> IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same
> arguments.
I've wished for this before; this would be awesome.
> One open question is whether IORING_OP_GETDENTS64 should be more like
> pread(2) and allow passing in a starting offset to read the
> directory from. (This would require some more surgery in fs/readdir.c.)
That would imo be preferable from my end - using the fd's position
means that the fd cannot be shared between threads etc.
It's also not clear to me that right now you'd necessarily get correct
results if multiple IORING_OP_GETDENTS64 for the same fd get processed
in different workers. Looking at iterate_dir(), it looks to me that the
locking around the file position would end up being insufficient on
filesystems that implement iterate_shared?
int iterate_dir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	bool shared = false;
	int res = -ENOTDIR;

	if (file->f_op->iterate_shared)
		shared = true;
	else if (!file->f_op->iterate)
		goto out;

	res = security_file_permission(file, MAY_READ);
	if (res)
		goto out;

	if (shared)
		res = down_read_killable(&inode->i_rwsem);
	else
		res = down_write_killable(&inode->i_rwsem);
	if (res)
		goto out;

	res = -ENOENT;
	if (!IS_DEADDIR(inode)) {
		ctx->pos = file->f_pos;
		if (shared)
			res = file->f_op->iterate_shared(file, ctx);
		else
			res = file->f_op->iterate(file, ctx);
		file->f_pos = ctx->pos;
		fsnotify_access(file);
		file_accessed(file);
	}

	if (shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
out:
	return res;
}
As there's only a shared lock, seems like both would end up with the
same ctx->pos and end up updating f_pos to the same offset (assuming the
same count).
Am I missing something?
Greetings,
Andres Freund
Hi,
On 2021-01-23 15:50:55 -0800, Andres Freund wrote:
> It's also not clear to me that right now you'd necessarily get correct
> results if multiple IORING_OP_GETDENTS64 for the same fd get processed
> in different workers. Looking at iterate_dir(), it looks to me that the
> locking around the file position would end up being insufficient on
> filesystems that implement iterate_shared?
> [...]
> As there's only a shared lock, seems like both would end up with the
> same ctx->pos and end up updating f_pos to the same offset (assuming the
> same count).
>
> Am I missing something?
A minimal and brute force approach to this would be to use
io_op_def.hash_reg_file, but brrr, that doesn't seem like a great way
forward.
Greetings,
Andres Freund
On Sat, Jan 23, 2021 at 03:50:55PM -0800, Andres Freund wrote:
> As there's only a shared lock, seems like both would end up with the
> same ctx->pos and end up updating f_pos to the same offset (assuming the
> same count).
>
> Am I missing something?
This:
	f = fdget_pos(fd);
	if (!f.file)
		return -EBADF;
in the callers. Protection of struct file contents belongs to struct file,
*not* struct inode. Specifically, file->f_pos_lock. *IF* the struct file
in question happens to be shared and the file is a regular file or a directory
(sockets don't need any exclusion on read(2), etc.)
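For reference, __fdget_pos() boils down to roughly this (condensed
from fs/file.c of that era):

/* Condensed sketch of __fdget_pos(): the position lock is only taken
 * when the struct file is actually shared. */
unsigned long __fdget_pos(unsigned int fd)
{
	unsigned long v = __fdget(fd);
	struct file *file = (struct file *)(v & ~3);

	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
		if (file_count(file) > 1) {
			v |= FDPUT_POS_UNLOCK;	/* fdput_pos() will unlock */
			mutex_lock(&file->f_pos_lock);
		}
	}
	return v;
}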
Hi,
On 2021-01-24 01:59:05 +0000, Al Viro wrote:
> On Sat, Jan 23, 2021 at 03:50:55PM -0800, Andres Freund wrote:
>
> > As there's only a shared lock, seems like both would end up with the
> > same ctx->pos and end up updating f_pos to the same offset (assuming the
> > same count).
> >
> > Am I missing something?
>
> This:
> f = fdget_pos(fd);
> if (!f.file)
> return -EBADF;
> in the callers.
Ah. Thanks for the explainer, userspace guy here ;). I hadn't realized
that fdget_pos acquired a lock around the position...
Regards,
Andres
> One open question is whether IORING_OP_GETDENTS64 should be more like
> pread(2) and allow passing in a starting offset to read the
> directory from. (This would require some more surgery in fs/readdir.c.)
Since directories are seekable this ought to work.
Modulo horrid issues with 32bit file offsets.
You'd need to return the final offset to allow another
read to continue from the end position.
David
On Sat, Jan 23, 2021 at 04:33:34PM -0700, Jens Axboe wrote:
> >> IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same
> >
> > Could we drop the '64'? We don't, for example, have IORING_OP_FADVISE64
> > even though that's the name of the syscall.
>
> Agreed, the only case where we do mimic the names is for things like
> IORING_OP_OPENAT2 where it does carry meaning. For this one, it should
> just be IORING_OP_GETDENTS.
OK, I've made this change.
On Sun, Jan 24, 2021 at 10:21:38PM +0000, David Laight wrote:
> > One open question is whether IORING_OP_GETDENTS64 should be more like
> > pread(2) and allow passing in a starting offset to read the
> > directory from. (This would require some more surgery in fs/readdir.c.)
>
> Since directories are seekable this ought to work.
> Modulo horrid issues with 32bit file offsets.
The incremental patch below does this. (It doesn't apply cleanly on
top of v1 of the IORING_OP_GETDENTS patch as I have other changes in
my tree -- I'm including it just to illustrate the changes that would
make this work.)
This change seems to work, and makes IORING_OP_GETDENTS take an
explicitly specified directory offset (instead of using the file's
->f_pos), making it more like pread(2), and I like the change from
a conceptual point of view, but it's a bit ugly around
iterate_dir_use_ctx_pos(). Any thoughts on how to do this more
cleanly (without breaking iterate_dir() semantics)?
> You'd need to return the final offset to allow another
> read to continue from the end position.
We can use the ->d_off value as returned in the last struct
linux_dirent64 as the directory offset to continue reading from
with the next IORING_OP_GETDENTS call, illustrated by the patch
to uringfind.c at the bottom.
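In plain C, the application-side continuation logic amounts to
something like this (sketch; process_entry() is a hypothetical
per-entry handler):

/* Walk one batch of returned dirents; 'ret' is the CQE result, i.e.
 * the number of dirent bytes written into buf. */
static uint64_t process_batch(uint8_t *buf, int ret)
{
	uint8_t *bufp = buf;
	uint64_t next_off = 0;

	while (bufp < buf + ret) {
		struct linux_dirent64 *dent = (struct linux_dirent64 *)bufp;

		process_entry(dent);
		next_off = dent->d_off;	/* offset of the following entry */
		bufp += dent->d_reclen;
	}

	return next_off;	/* feed into sqe->off for the next read */
}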
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 13dd29f8ebb3..0f9707ed9294 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -576,6 +576,7 @@ struct io_getdents {
struct file *file;
struct linux_dirent64 __user *dirent;
unsigned int count;
+ loff_t pos;
};
struct io_completion {
@@ -4584,9 +4585,10 @@ static int io_getdents_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index)
+ if (sqe->ioprio || sqe->rw_flags || sqe->buf_index)
return -EINVAL;
+ getdents->pos = READ_ONCE(sqe->off);
getdents->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
getdents->count = READ_ONCE(sqe->len);
return 0;
@@ -4601,7 +4603,8 @@ static int io_getdents(struct io_kiocb *req, bool force_nonblock)
if (force_nonblock)
return -EAGAIN;
- ret = vfs_getdents(req->file, getdents->dirent, getdents->count);
+ ret = vfs_getdents(req->file, getdents->dirent, getdents->count,
+ &getdents->pos);
if (ret < 0) {
if (ret == -ERESTARTSYS)
ret = -EINTR;
diff --git a/fs/readdir.c b/fs/readdir.c
index f52167c1eb61..d6bd78f6350a 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -37,7 +37,7 @@
} while (0)
-int iterate_dir(struct file *file, struct dir_context *ctx)
+int iterate_dir_use_ctx_pos(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
bool shared = false;
@@ -60,12 +60,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
- ctx->pos = file->f_pos;
if (shared)
res = file->f_op->iterate_shared(file, ctx);
else
res = file->f_op->iterate(file, ctx);
- file->f_pos = ctx->pos;
fsnotify_access(file);
file_accessed(file);
}
@@ -76,6 +74,17 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
out:
return res;
}
+
+int iterate_dir(struct file *file, struct dir_context *ctx)
+{
+ int res;
+
+ ctx->pos = file->f_pos;
+ res = iterate_dir_use_ctx_pos(file, ctx);
+ file->f_pos = ctx->pos;
+
+ return res;
+}
EXPORT_SYMBOL(iterate_dir);
/*
@@ -349,16 +358,18 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
}
int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
- unsigned int count)
+ unsigned int count, loff_t *pos)
{
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
+ .ctx.pos = *pos,
.count = count,
.current_dir = dirent
};
int error;
- error = iterate_dir(file, &buf.ctx);
+ error = iterate_dir_use_ctx_pos(file, &buf.ctx);
+ *pos = buf.ctx.pos;
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
@@ -384,7 +395,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (!f.file)
return -EBADF;
- error = vfs_getdents(f.file, dirent, count);
+ error = vfs_getdents(f.file, dirent, count, &f.file->f_pos);
fdput_pos(f);
return error;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 114885d3f6c4..4d9d96163f92 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3107,11 +3107,12 @@ const char *simple_get_link(struct dentry *, struct inode *,
struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;
+extern int iterate_dir_use_ctx_pos(struct file *, struct dir_context *);
extern int iterate_dir(struct file *, struct dir_context *);
struct linux_dirent64;
int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
- unsigned int count);
+ unsigned int count, loff_t *pos);
int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
int flags);
Corresponding uringfind.c change:
diff --git a/uringfind.c b/uringfind.c
index 4282296..e140388 100644
--- a/uringfind.c
+++ b/uringfind.c
@@ -22,9 +22,10 @@ struct linux_dirent64 {
};
static inline void io_uring_prep_getdents(struct io_uring_sqe *sqe, int fd,
- void *buf, unsigned int count)
+ void *buf, unsigned int count,
+ uint64_t off)
{
- io_uring_prep_rw(IORING_OP_GETDENTS, sqe, fd, buf, count, 0);
+ io_uring_prep_rw(IORING_OP_GETDENTS, sqe, fd, buf, count, off);
}
@@ -38,6 +39,7 @@ struct dir {
struct dir *parent;
int fd;
+ uint64_t off;
uint8_t buf[16384];
char name[0];
};
@@ -131,7 +133,8 @@ static void schedule_readdir(struct dir *dir)
struct io_uring_sqe *sqe;
sqe = get_sqe();
- io_uring_prep_getdents(sqe, dir->fd, dir->buf, sizeof(dir->buf));
+ io_uring_prep_getdents(sqe, dir->fd, dir->buf, sizeof(dir->buf),
+ dir->off);
io_uring_sqe_set_data(sqe, dir);
}
@@ -145,6 +148,7 @@ static void opendir_completion(struct dir *dir, int ret)
}
dir->fd = ret;
+ dir->off = 0;
schedule_readdir(dir);
}
@@ -179,6 +183,7 @@ static void readdir_completion(struct dir *dir, int ret)
schedule_opendir(dir, dent->d_name);
}
+ dir->off = dent->d_off;
bufp += dent->d_reclen;
}
On Fri, Jan 29, 2021 at 01:07:10AM +0200, Lennert Buytenhek wrote:
> > > One open question is whether IORING_OP_GETDENTS64 should be more like
> > > pread(2) and allow passing in a starting offset to read the
> > > directory from. (This would require some more surgery in fs/readdir.c.)
> >
> > Since directories are seekable this ought to work.
> > Modulo horrid issues with 32bit file offsets.
>
> The incremental patch below does this. (It doesn't apply cleanly on
> top of v1 of the IORING_OP_GETDENTS patch as I have other changes in
> my tree -- I'm including it just to illustrate the changes that would
> make this work.)
>
> This change seems to work, and makes IORING_OP_GETDENTS take an
> explicitly specified directory offset (instead of using the file's
> ->f_pos), making it more like pread(2) [...]
...but the fact that this patch avoids taking file->f_pos_lock (as this
proposed version of IORING_OP_GETDENTS avoids using file->f_pos) means
that ->iterate_shared() can then be called concurrently on the same
struct file, which breaks the mutual exclusion guarantees provided here.
If possible, I'd like to keep the ability to explicitly pass in a
directory offset to start reading from into IORING_OP_GETDENTS, so
perhaps we can simply satisfy the mutual exclusion requirement by
taking ->f_pos_lock by hand -- but then I do need to check that it's OK
for ->iterate{,_shared}() to be called successively with discontinuous
offsets without ->llseek() being called in between.
(If that's not OK, then we can either have IORING_OP_GETDENTS just
always start reading at ->f_pos like before (which might then require
adding a IORING_OP_GETDENTS2 at some point in the future if we'll
ever want to change that), or we could have IORING_OP_GETDENTS itself
call ->llseek() for now whenever necessary.)
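Taking ->f_pos_lock by hand would look roughly like this (sketch only;
unlike fdget_pos(), it locks unconditionally, and the
discontinuous-offset question above remains open):

static int io_getdents(struct io_kiocb *req, bool force_nonblock)
{
	struct io_getdents *getdents = &req->getdents;
	int ret;

	/* getdents always requires a blocking context */
	if (force_nonblock)
		return -EAGAIN;

	/* serialize against other readers of this struct file */
	mutex_lock(&req->file->f_pos_lock);
	ret = vfs_getdents(req->file, getdents->dirent, getdents->count,
			   &getdents->pos);
	mutex_unlock(&req->file->f_pos_lock);

	if (ret < 0) {
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail_links(req);
	}
	io_req_complete(req, ret);
	return 0;
}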
> [...]
On Fri, Jan 29, 2021 at 07:37:03AM +0200, Lennert Buytenhek wrote:
> > [...]
>
> ...but the fact that this patch avoids taking file->f_pos_lock (as this
> proposed version of IORING_OP_GETDENTS avoids using file->f_pos) means
> that ->iterate_shared() can then be called concurrently on the same
> struct file, which breaks the mutual exclusion guarantees provided here.
>
> If possible, I'd like to keep the ability to explicitly pass in a
> directory offset to start reading from into IORING_OP_GETDENTS, so
> perhaps we can simply satisfy the mutual exclusion requirement by
> taking ->f_pos_lock by hand -- but then I do need to check that it's OK
> for ->iterate{,_shared}() to be called successively with discontinuous
> offsets without ->llseek() being called in between.
>
> (If that's not OK, then we can either have IORING_OP_GETDENTS just
> always start reading at ->f_pos like before (which might then require
> adding a IORING_OP_GETDENTS2 at some point in the future if we'll
> ever want to change that), or we could have IORING_OP_GETDENTS itself
> call ->llseek() for now whenever necessary.)
Having IORING_OP_GETDENTS seek to sqe->off if needed seems easy
enough to implement, and it simplifies the other code as well, so
I'll send out a v2 RFC shortly that does this.
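A rough sketch of the seek step (speculative; io_getdents_seek() is a
made-up helper name and the actual v2 may differ):

/* If the offset requested via sqe->off differs from the file's current
 * position, let the filesystem validate/translate it via vfs_llseek()
 * before reading at ->f_pos as before. */
static int io_getdents_seek(struct file *file, loff_t pos)
{
	loff_t res;

	if (file->f_pos == pos)
		return 0;

	res = vfs_llseek(file, pos, SEEK_SET);
	return res < 0 ? (int)res : 0;
}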