Implement an old idea allowing open/accept io_uring requests to register
a newly created file as a io_uring's fixed file instead of placing it
into a task's file table. The switching is encoded in io_uring's SQEs
by setting sqe->buf_index/file_index, so restricted to 2^16-1. Don't
think we need more, but may be a good idea to scrap u32 somewhere
instead.
From the net side only needs a function doing __sys_accept4_file()
but not installing fd, see 2/4.
Only RFC for now, the new functionality is tested only for open yet.
I hope we can remember the author of the idea to add attribution.
Pavel Begunkov (4):
io_uring: allow open directly into fixed fd table
net: add an accept helper not installing fd
io_uring: hand code io_accept()' fd installing
io_uring: accept directly into fixed file table
fs/io_uring.c | 113 +++++++++++++++++++++++++++++-----
include/linux/socket.h | 3 +
include/uapi/linux/io_uring.h | 2 +
net/socket.c | 71 +++++++++++----------
4 files changed, 138 insertions(+), 51 deletions(-)
--
2.32.0
Instead of opening a file into a process's file table as usual and then
registering the fd within io_uring, we can skip the first step and place
it directly into io_uring's fixed file table. Add support for that for
both OPENAT opcodes.
The behaviour is controlled by setting sqe->file_index, where 0 implies
the old behaviour, and non-0 values is translated into index
(file_index-1). The field is u16, hence sets a limit on how many slots
it covers.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 75 ++++++++++++++++++++++++++++++-----
include/uapi/linux/io_uring.h | 2 +
2 files changed, 68 insertions(+), 9 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 599b11152505..17d2c0eb89dc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1075,6 +1075,8 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);
static void io_fallback_req_func(struct work_struct *unused);
+static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+ unsigned int issue_flags);
static struct kmem_cache *req_cachep;
@@ -3745,11 +3747,10 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
- if (unlikely(sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->ioprio))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
-
/* open.how should be already initialised */
if (!(req->open.how.flags & O_PATH) && force_o_largefile())
req->open.how.flags |= O_LARGEFILE;
@@ -3762,6 +3763,10 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
req->open.filename = NULL;
return ret;
}
+ req->buf_index = READ_ONCE(sqe->file_index);
+ if (req->buf_index && (req->open.how.flags & O_CLOEXEC))
+ return -EINVAL;
+
req->open.nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
@@ -3804,8 +3809,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct open_flags op;
struct file *file;
- bool nonblock_set;
- bool resolve_nonblock;
+ bool resolve_nonblock, nonblock_set;
+ bool fixed = !!req->buf_index;
int ret;
ret = build_open_flags(&req->open.how, &op);
@@ -3824,9 +3829,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
op.open_flag |= O_NONBLOCK;
}
- ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
- if (ret < 0)
- goto err;
+ if (!fixed) {
+ ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
+ if (ret < 0)
+ goto err;
+ }
file = do_filp_open(req->open.dfd, req->open.filename, &op);
if (IS_ERR(file)) {
@@ -3835,7 +3842,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
* marginal gain for something that is now known to be a slower
* path. So just put it, and we'll get a new one when we retry.
*/
- put_unused_fd(ret);
+ if (!fixed)
+ put_unused_fd(ret);
ret = PTR_ERR(file);
/* only retry if RESOLVE_CACHED wasn't already set by application */
@@ -3848,7 +3856,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
file->f_flags &= ~O_NONBLOCK;
fsnotify_open(file);
- fd_install(ret, file);
+
+ if (!fixed)
+ fd_install(ret, file);
+ else
+ ret = io_install_fixed_file(req, file, issue_flags);
err:
putname(req->open.filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -7776,6 +7788,50 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
+static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+ unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ int i = req->buf_index - 1;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ struct io_fixed_file *file_slot;
+ int ret = -EBADF;
+
+ if (WARN_ON_ONCE(req->buf_index == 0))
+ goto err;
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+ if (file->f_op == &io_uring_fops)
+ goto err;
+ ret = -ENXIO;
+ if (!ctx->file_data)
+ goto err;
+ ret = -EINVAL;
+ if (i > ctx->nr_user_files)
+ goto err;
+
+ i = array_index_nospec(i, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ ret = -EEXIST;
+ if (file_slot->file_ptr)
+ goto err;
+
+ *io_get_tag_slot(ctx->file_data, i) = 0;
+ io_fixed_file_set(file_slot, file);
+ ret = io_sqe_file_register(ctx, file, i);
+ if (ret) {
+ file_slot->file_ptr = 0;
+ goto err;
+ }
+
+ ret = 0;
+err:
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ if (ret)
+ fput(file);
+ return ret;
+}
+
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc)
{
@@ -10231,6 +10287,7 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(32, __u64, user_data);
BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
+ BUILD_BUG_SQE_ELEM(40, __u16, file_index);
BUILD_BUG_SQE_ELEM(42, __u16, personality);
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 79126d5cd289..f105deb4da1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -52,6 +52,8 @@ struct io_uring_sqe {
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
+ /* index into fixed files */
+ __u16 file_index;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
--
2.32.0
Introduce and reuse a helper that acts similarly to __sys_accept4_file()
but returns struct file instead of installing file descriptor. Will be
used by io_uring.
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/socket.h | 3 ++
net/socket.c | 71 ++++++++++++++++++++++--------------------
2 files changed, 40 insertions(+), 34 deletions(-)
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 0d8e3dcb7f88..d3c1a42a2edd 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -421,6 +421,9 @@ extern int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile);
+extern struct file *do_accept(struct file *file, unsigned file_flags,
+ struct sockaddr __user *upeer_sockaddr,
+ int __user *upeer_addrlen, int flags);
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
diff --git a/net/socket.c b/net/socket.c
index bd9233da2497..2857206453f9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1715,32 +1715,22 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
return __sys_listen(fd, backlog);
}
-int __sys_accept4_file(struct file *file, unsigned file_flags,
+struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
- int __user *upeer_addrlen, int flags,
- unsigned long nofile)
+ int __user *upeer_addrlen, int flags)
{
struct socket *sock, *newsock;
struct file *newfile;
- int err, len, newfd;
+ int err, len;
struct sockaddr_storage address;
- if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
- return -EINVAL;
-
- if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
- flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
-
sock = sock_from_file(file);
- if (!sock) {
- err = -ENOTSOCK;
- goto out;
- }
+ if (!sock)
+ return ERR_PTR(-ENOTSOCK);
- err = -ENFILE;
newsock = sock_alloc();
if (!newsock)
- goto out;
+ return ERR_PTR(-ENFILE);
newsock->type = sock->type;
newsock->ops = sock->ops;
@@ -1751,18 +1741,9 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
*/
__module_get(newsock->ops->owner);
- newfd = __get_unused_fd_flags(flags, nofile);
- if (unlikely(newfd < 0)) {
- err = newfd;
- sock_release(newsock);
- goto out;
- }
newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
- if (IS_ERR(newfile)) {
- err = PTR_ERR(newfile);
- put_unused_fd(newfd);
- goto out;
- }
+ if (IS_ERR(newfile))
+ return newfile;
err = security_socket_accept(sock, newsock);
if (err)
@@ -1787,16 +1768,38 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
}
/* File flags are not inherited via accept() unlike another OSes. */
-
- fd_install(newfd, newfile);
- err = newfd;
-out:
- return err;
+ return newfile;
out_fd:
fput(newfile);
- put_unused_fd(newfd);
- goto out;
+ return ERR_PTR(err);
+}
+
+int __sys_accept4_file(struct file *file, unsigned file_flags,
+ struct sockaddr __user *upeer_sockaddr,
+ int __user *upeer_addrlen, int flags,
+ unsigned long nofile)
+{
+ struct file *newfile;
+ int newfd;
+ if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+
+ if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+ flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+ newfd = __get_unused_fd_flags(flags, nofile);
+ if (unlikely(newfd < 0))
+ return newfd;
+
+ newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen,
+ flags);
+ if (IS_ERR(newfile)) {
+ put_unused_fd(newfd);
+ return PTR_ERR(newfile);
+ }
+ fd_install(newfd, newfile);
+ return newfd;
}
/*
--
2.32.0
Make io_accept() to handle file descriptor allocations and installation.
A preparation patch for bypassing file tables.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 27 ++++++++++++++++++++-------
1 file changed, 20 insertions(+), 7 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 17d2c0eb89dc..fd0dcee251b0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4682,6 +4682,11 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
+
+ if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+ if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
+ accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
return 0;
}
@@ -4690,20 +4695,28 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct io_accept *accept = &req->accept;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
- int ret;
+ struct file *file;
+ int ret, fd;
if (req->file->f_flags & O_NONBLOCK)
req->flags |= REQ_F_NOWAIT;
- ret = __sys_accept4_file(req->file, file_flags, accept->addr,
- accept->addr_len, accept->flags,
- accept->nofile);
- if (ret == -EAGAIN && force_nonblock)
- return -EAGAIN;
- if (ret < 0) {
+ fd = __get_unused_fd_flags(accept->flags, accept->nofile);
+ if (unlikely(fd < 0))
+ return fd;
+
+ file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
+ accept->flags);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
+ } else {
+ fd_install(fd, file);
+ ret = fd;
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
--
2.32.0
As done with open opcodes, allow accept to skip installing fd into
processes' file tables and put it directly into io_uring's fixed file
table. Same restrictions and design as for open.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fd0dcee251b0..f7db43bf7dad 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4675,14 +4675,17 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index)
+ if (sqe->ioprio || sqe->len)
return -EINVAL;
accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
+ req->buf_index = READ_ONCE(sqe->file_index);
+ if (req->buf_index && (accept->flags & SOCK_CLOEXEC))
+ return -EINVAL;
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
@@ -4695,28 +4698,34 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct io_accept *accept = &req->accept;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
+ bool fixed = !!req->buf_index;
struct file *file;
int ret, fd;
if (req->file->f_flags & O_NONBLOCK)
req->flags |= REQ_F_NOWAIT;
- fd = __get_unused_fd_flags(accept->flags, accept->nofile);
- if (unlikely(fd < 0))
- return fd;
-
+ if (!fixed) {
+ fd = __get_unused_fd_flags(accept->flags, accept->nofile);
+ if (unlikely(fd < 0))
+ return fd;
+ }
file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
accept->flags);
if (IS_ERR(file)) {
+ if (!fixed)
+ put_unused_fd(fd);
ret = PTR_ERR(file);
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
- } else {
+ } else if (!fixed) {
fd_install(fd, file);
ret = fd;
+ } else {
+ ret = io_install_fixed_file(req, file, issue_flags);
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
--
2.32.0
On 7/7/21 5:39 AM, Pavel Begunkov wrote:
> Implement an old idea allowing open/accept io_uring requests to register
> a newly created file as a io_uring's fixed file instead of placing it
> into a task's file table. The switching is encoded in io_uring's SQEs
> by setting sqe->buf_index/file_index, so restricted to 2^16-1. Don't
> think we need more, but may be a good idea to scrap u32 somewhere
> instead.
>
> From the net side only needs a function doing __sys_accept4_file()
> but not installing fd, see 2/4.
>
> Only RFC for now, the new functionality is tested only for open yet.
> I hope we can remember the author of the idea to add attribution.
Pretty sure the original suggester of this as Josh, CC'ed.
--
Jens Axboe
Am 07.07.21 um 15:07 schrieb Jens Axboe:
> On 7/7/21 5:39 AM, Pavel Begunkov wrote:
>> Implement an old idea allowing open/accept io_uring requests to register
>> a newly created file as a io_uring's fixed file instead of placing it
>> into a task's file table. The switching is encoded in io_uring's SQEs
>> by setting sqe->buf_index/file_index, so restricted to 2^16-1. Don't
>> think we need more, but may be a good idea to scrap u32 somewhere
>> instead.
>>
>> From the net side only needs a function doing __sys_accept4_file()
>> but not installing fd, see 2/4.
>>
>> Only RFC for now, the new functionality is tested only for open yet.
>> I hope we can remember the author of the idea to add attribution.
>
> Pretty sure the original suggester of this as Josh, CC'ed.
I also requested it for open :-)
metze
On Wed, Jul 07, 2021 at 07:07:52AM -0600, Jens Axboe wrote:
> On 7/7/21 5:39 AM, Pavel Begunkov wrote:
> > Implement an old idea allowing open/accept io_uring requests to register
> > a newly created file as a io_uring's fixed file instead of placing it
> > into a task's file table. The switching is encoded in io_uring's SQEs
> > by setting sqe->buf_index/file_index, so restricted to 2^16-1. Don't
> > think we need more, but may be a good idea to scrap u32 somewhere
> > instead.
> >
> > From the net side only needs a function doing __sys_accept4_file()
> > but not installing fd, see 2/4.
> >
> > Only RFC for now, the new functionality is tested only for open yet.
> > I hope we can remember the author of the idea to add attribution.
>
> Pretty sure the original suggester of this as Josh, CC'ed.
Thanks for working on this, Pavel!
Original thread at
https://lore.kernel.org/io-uring/20200715004209.GA334456@localhost/T/ in
case that helps.
On 7/7/21 7:59 AM, Stefan Metzmacher wrote:
> Am 07.07.21 um 15:07 schrieb Jens Axboe:
>> On 7/7/21 5:39 AM, Pavel Begunkov wrote:
>>> Implement an old idea allowing open/accept io_uring requests to register
>>> a newly created file as a io_uring's fixed file instead of placing it
>>> into a task's file table. The switching is encoded in io_uring's SQEs
>>> by setting sqe->buf_index/file_index, so restricted to 2^16-1. Don't
>>> think we need more, but may be a good idea to scrap u32 somewhere
>>> instead.
>>>
>>> From the net side only needs a function doing __sys_accept4_file()
>>> but not installing fd, see 2/4.
>>>
>>> Only RFC for now, the new functionality is tested only for open yet.
>>> I hope we can remember the author of the idea to add attribution.
>>
>> Pretty sure the original suggester of this as Josh, CC'ed.
>
> I also requested it for open :-)
Indeed! I honestly forget the details, as some of it is implementation
detail. I think Josh was the first to suggest a private fd table could
be used, but that's mostly implementation detail as the point was to be
able to know which fd would be assigned.
But I think we're all in agreement here, it's a nifty feature :-)
--
Jens Axboe
在 2021/7/7 下午7:39, Pavel Begunkov 写道:
> Implement an old idea allowing open/accept io_uring requests to register
> a newly created file as a io_uring's fixed file instead of placing it
> into a task's file table. The switching is encoded in io_uring's SQEs
> by setting sqe->buf_index/file_index, so restricted to 2^16-1. Don't
> think we need more, but may be a good idea to scrap u32 somewhere
> instead.
>
> From the net side only needs a function doing __sys_accept4_file()
> but not installing fd, see 2/4.
>
> Only RFC for now, the new functionality is tested only for open yet.
> I hope we can remember the author of the idea to add attribution.
>
Great feature! I believe this one leverages linked sqes, we may need to
remind users to be careful when they use this feature in shared sqthread
mode since linked sqes may be splited.
> Pavel Begunkov (4):
> io_uring: allow open directly into fixed fd table
> net: add an accept helper not installing fd
> io_uring: hand code io_accept()' fd installing
> io_uring: accept directly into fixed file table
>
> fs/io_uring.c | 113 +++++++++++++++++++++++++++++-----
> include/linux/socket.h | 3 +
> include/uapi/linux/io_uring.h | 2 +
> net/socket.c | 71 +++++++++++----------
> 4 files changed, 138 insertions(+), 51 deletions(-)
>
On 7/7/21 4:04 PM, Josh Triplett wrote:
> On Wed, Jul 07, 2021 at 07:07:52AM -0600, Jens Axboe wrote:
>> On 7/7/21 5:39 AM, Pavel Begunkov wrote:
>>> Implement an old idea allowing open/accept io_uring requests to register
>>> a newly created file as a io_uring's fixed file instead of placing it
>>> into a task's file table. The switching is encoded in io_uring's SQEs
>>> by setting sqe->buf_index/file_index, so restricted to 2^16-1. Don't
>>> think we need more, but may be a good idea to scrap u32 somewhere
>>> instead.
>>>
>>> From the net side only needs a function doing __sys_accept4_file()
>>> but not installing fd, see 2/4.
>>>
>>> Only RFC for now, the new functionality is tested only for open yet.
>>> I hope we can remember the author of the idea to add attribution.
>>
>> Pretty sure the original suggester of this as Josh, CC'ed.
>
> Thanks for working on this, Pavel!
>
> Original thread at
Totally was thinking it was only a discussion but not an actual patch,
and I even did comment on it! Sorry Josh, would have persuaded you to
finish it, if I remembered that.
> https://lore.kernel.org/io-uring/20200715004209.GA334456@localhost/T/ in
> case that helps.
>
--
Pavel Begunkov