2020-06-08 18:11:57

by Pavel Begunkov

Subject: [PATCH 0/4] remove work.func

As discussed, removing ->func from io_wq_work and moving
it into io-wq.
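
Roughly, the series replaces the per-request callback with a single
per-wq handler. A simplified before/after sketch of the shape of the
change (not the exact kernel definitions):

/* Before: every work item carries its own handler. */
struct io_wq_work {
        struct io_wq_work_node list;
        void (*func)(struct io_wq_work **);     /* per-work callback */
        /* ... files, mm, creds, flags ... */
};

/* After: work items only carry data; the handler lives in the
 * workqueue itself and is set once at io_wq_create() time.
 */
struct io_wq {
        io_wq_work_fn *do_work;         /* per-wq callback */
        free_work_fn *free_work;
        /* ... workers, manager, state ... */
};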

Pavel Begunkov (4):
io_uring: don't derive close state from ->func
io_uring: remove custom ->func handlers
io_uring: don't arm a timeout through work.func
io_wq: add per-wq work handler instead of per work

fs/io-wq.c | 10 ++-
fs/io-wq.h | 7 +-
fs/io_uring.c | 221 +++++++++++++++-----------------------------------
3 files changed, 74 insertions(+), 164 deletions(-)

--
2.24.0


2020-06-08 18:12:03

by Pavel Begunkov

Subject: [PATCH 1/4] io_uring: don't derive close state from ->func

Relying on having a specific work.func is dangerous, even if an opcode
handler sets it itself, e.g. io_wq_assign_next() can modify it.

io_close() sets a custom work.func to indicate that
__close_fd_get_file() was already called. Fortunately, there are no
bugs with io_wq_assign_next() and close yet.

Still, play it safe and always be prepared to be called through
io_wq_submit_work(): zero req->close.put_file in prep, and call
__close_fd_get_file() only if it is still NULL.
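
In sketch form (simplified from the diff below), the issue path becomes:

static int io_close(struct io_kiocb *req, bool force_nonblock)
{
        struct io_close *close = &req->close;
        int ret;

        /* may already be resolved by an earlier nonblock attempt */
        if (!close->put_file) {
                ret = __close_fd_get_file(close->fd, &close->put_file);
                if (ret < 0)
                        return (ret == -ENOENT) ? -EBADF : ret;
        }

        /* ->flush() may block: punt to io-wq, no custom ->func needed */
        if (close->put_file->f_op->flush && force_nonblock)
                return -EAGAIN;

        /* ... filp_close(), post the CQE, fput(), drop the request ... */
        return 0;
}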

Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 50 +++++++++++++++++---------------------------------
1 file changed, 17 insertions(+), 33 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3aebbf96c123..9acd695cc473 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3479,53 +3479,37 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->close.fd == req->ctx->ring_fd)
return -EBADF;

+ req->close.put_file = NULL;
return 0;
}

-/* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req)
-{
- int ret;
-
- ret = filp_close(req->close.put_file, req->work.files);
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- fput(req->close.put_file);
- io_put_req(req);
-}
-
-static void io_close_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- /* not cancellable, don't do io_req_cancelled() */
- __io_close_finish(req);
- io_steal_work(req, workptr);
-}
-
static int io_close(struct io_kiocb *req, bool force_nonblock)
{
+ struct io_close *close = &req->close;
int ret;

- req->close.put_file = NULL;
- ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
- if (ret < 0)
- return (ret == -ENOENT) ? -EBADF : ret;
+ /* might be already done during nonblock submission */
+ if (!close->put_file) {
+ ret = __close_fd_get_file(close->fd, &close->put_file);
+ if (ret < 0)
+ return (ret == -ENOENT) ? -EBADF : ret;
+ }

/* if the file has a flush method, be safe and punt to async */
- if (req->close.put_file->f_op->flush && force_nonblock) {
+ if (close->put_file->f_op->flush && force_nonblock) {
/* avoid grabbing files - we don't need the files */
req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
- req->work.func = io_close_finish;
return -EAGAIN;
}

- /*
- * No ->flush(), safely close from here and just punt the
- * fput() to async context.
- */
- __io_close_finish(req);
+ /* No ->flush() or already async, safely close from here */
+ ret = filp_close(close->put_file, req->work.files);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ fput(close->put_file);
+ close->put_file = NULL;
+ io_put_req(req);
return 0;
}

--
2.24.0

2020-06-08 18:13:24

by Pavel Begunkov

Subject: [PATCH 3/4] io_uring: don't arm a timeout through work.func

Remove io_link_work_cb(), the last custom work.func. It's not the
prettiest solution, but it works: instead of queueing a linked timeout
in io_link_work_cb(), mark the request with REQ_F_QUEUE_TIMEOUT and
queue the timeout based on that flag in io_wq_submit_work().
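
In sketch form (simplified from the diff below), the io-wq entry point
now does:

static void io_wq_submit_work(struct io_wq_work **workptr)
{
        struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);

        /* arm the timeout that io_link_work_cb() used to queue */
        if (req->flags & REQ_F_QUEUE_TIMEOUT) {
                struct io_kiocb *link;

                link = list_first_entry(&req->link_list, struct io_kiocb,
                                        link_list);
                io_queue_linked_timeout(link);
        }

        /* ... issue the request as before ... */
}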

Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 29 ++++++++++++++++++-----------
1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ce7f815658a3..adf18ff9fdb9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -541,6 +541,7 @@ enum {
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
+ REQ_F_QUEUE_TIMEOUT_BIT,

/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -596,6 +597,8 @@ enum {
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
+ /* needs to queue linked timeout */
+ REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
};

struct async_poll {
@@ -1579,16 +1582,6 @@ static void io_free_req(struct io_kiocb *req)
io_queue_async_work(nxt);
}

-static void io_link_work_cb(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *link;
-
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- io_queue_linked_timeout(link);
- io_wq_submit_work(workptr);
-}
-
static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
{
struct io_kiocb *link;
@@ -1600,7 +1593,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
*workptr = &nxt->work;
link = io_prep_linked_timeout(nxt);
if (link)
- nxt->work.func = io_link_work_cb;
+ nxt->flags |= REQ_F_QUEUE_TIMEOUT;
}

/*
@@ -5333,12 +5326,26 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}

+static void io_arm_async_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+
+ /* link head's timeout is queued in io_queue_async_work() */
+ if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
+ return;
+
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ io_queue_linked_timeout(link);
+}
+
static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
int ret = 0;

+ io_arm_async_linked_timeout(req);
+
/* if NO_CANCEL is set, we must still run the work */
if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
IO_WQ_WORK_CANCEL) {
--
2.24.0

2020-06-08 18:16:01

by Pavel Begunkov

Subject: [PATCH 2/4] io_uring: remove custom ->func handlers

In preparation for getting rid of work.func, remove almost all custom
instances of it, leaving only io_wq_submit_work() and
io_link_work_cb(); the latter will be dealt with later.

Nothing fancy: just remove each *_finish() function and inline what's
left, e.g. remove io_fsync_finish() and inline __io_fsync() into
io_fsync().

As no users of io_req_cancelled() are left, delete it as well. The
patch adds an extra switch lookup on a cold-ish path, but that's
outweighed by the nice diffstat and the benefits of the following
patches.
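
The transformation is the same for every opcode; a simplified
before/after sketch for fsync (the real diff is below):

/* Before: nonblock submission installs a custom work.func and punts. */
static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
        if (force_nonblock) {
                req->work.func = io_fsync_finish;
                return -EAGAIN;
        }
        __io_fsync(req);
        return 0;
}

/* After: just return -EAGAIN; io-wq calls back into io_fsync() through
 * io_wq_submit_work(), so the *_finish() handler can be inlined here.
 */
static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
        if (force_nonblock)
                return -EAGAIN;
        /* ... vfs_fsync_range(), post the CQE, drop the request ... */
        return 0;
}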

Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 139 ++++++++++----------------------------------------
1 file changed, 27 insertions(+), 112 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9acd695cc473..ce7f815658a3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2940,23 +2940,15 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}

-static bool io_req_cancelled(struct io_kiocb *req)
-{
- if (req->work.flags & IO_WQ_WORK_CANCEL) {
- req_set_fail_links(req);
- io_cqring_add_event(req, -ECANCELED);
- io_put_req(req);
- return true;
- }
-
- return false;
-}
-
-static void __io_fsync(struct io_kiocb *req)
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
loff_t end = req->sync.off + req->sync.len;
int ret;

+ /* fsync always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
@@ -2964,53 +2956,9 @@ static void __io_fsync(struct io_kiocb *req)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_fsync(req);
- io_steal_work(req, workptr);
-}
-
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
-{
- /* fsync always requires a blocking context */
- if (force_nonblock) {
- req->work.func = io_fsync_finish;
- return -EAGAIN;
- }
- __io_fsync(req);
return 0;
}

-static void __io_fallocate(struct io_kiocb *req)
-{
- int ret;
-
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
- ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
- req->sync.len);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
-}
-
-static void io_fallocate_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_fallocate(req);
- io_steal_work(req, workptr);
-}
-
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@@ -3028,13 +2976,20 @@ static int io_fallocate_prep(struct io_kiocb *req,

static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
+ int ret;
+
/* fallocate always requiring blocking context */
- if (force_nonblock) {
- req->work.func = io_fallocate_finish;
+ if (force_nonblock)
return -EAGAIN;
- }

- __io_fallocate(req);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+ ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
+ req->sync.len);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
return 0;
}

@@ -3531,38 +3486,20 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}

-static void __io_sync_file_range(struct io_kiocb *req)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
{
int ret;

+ /* sync_file_range always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
-}
-
-
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_sync_file_range(req);
- io_steal_work(req, workptr);
-}
-
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
-{
- /* sync_file_range always requires a blocking context */
- if (force_nonblock) {
- req->work.func = io_sync_file_range_finish;
- return -EAGAIN;
- }
-
- __io_sync_file_range(req);
return 0;
}

@@ -3984,49 +3921,27 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}

-static int __io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
struct io_accept *accept = &req->accept;
- unsigned file_flags;
+ unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
int ret;

- file_flags = force_nonblock ? O_NONBLOCK : 0;
ret = __sys_accept4_file(req->file, file_flags, accept->addr,
accept->addr_len, accept->flags,
accept->nofile);
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
req_set_fail_links(req);
+ }
io_cqring_add_event(req, ret);
io_put_req(req);
return 0;
}

-static void io_accept_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_accept(req, false);
- io_steal_work(req, workptr);
-}
-
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
-{
- int ret;
-
- ret = __io_accept(req, force_nonblock);
- if (ret == -EAGAIN && force_nonblock) {
- req->work.func = io_accept_finish;
- return -EAGAIN;
- }
- return 0;
-}
-
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
--
2.24.0

2020-06-08 18:20:12

by Pavel Begunkov

Subject: Re: [PATCH 0/4] remove work.func

On 08/06/2020 21:08, Pavel Begunkov wrote:
> As discussed, removing ->func from io_wq_work and moving
> it into io-wq.

Xiaoguang Wang, until Jens gets back and picks this up, I'll
also keep the patchset in my github [1], just in case you
want to play with it.

[1] https://github.com/isilence/linux/commits/rem_work_func

>
> Pavel Begunkov (4):
> io_uring: don't derive close state from ->func
> io_uring: remove custom ->func handlers
> io_uring: don't arm a timeout through work.func
> io_wq: add per-wq work handler instead of per work
>
> fs/io-wq.c | 10 ++-
> fs/io-wq.h | 7 +-
> fs/io_uring.c | 221 +++++++++++++++-----------------------------------
> 3 files changed, 74 insertions(+), 164 deletions(-)
>

--
Pavel Begunkov

2020-06-08 18:33:04

by Pavel Begunkov

Subject: [PATCH 4/4] io_wq: add per-wq work handler instead of per work

io_uring is the only user of io-wq, and it now uses a single io-wq
callback for all its requests, namely io_wq_submit_work(). Instead of
storing the work->runner callback in each instance of io_wq_work, keep
it in io-wq itself.

pros:
- reduces io_wq_work size
- more robust -- ->func can't be invalidated by mem{cpy,set}(req)
- helps follow-up work
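
On the io_uring side, setup shrinks to passing the handler once at wq
creation time. A simplified sketch (the concurrency bound below is a
placeholder; see the diff for how it's actually computed):

static int io_init_wq_offload(struct io_ring_ctx *ctx)
{
        unsigned int concurrency = 4;   /* placeholder bound for the sketch */
        struct io_wq_data data = {
                .user           = ctx->user,
                .free_work      = io_free_work,
                .do_work        = io_wq_submit_work,    /* per-wq handler */
        };

        ctx->io_wq = io_wq_create(concurrency, &data);
        return PTR_ERR_OR_ZERO(ctx->io_wq);
}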

Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io-wq.c | 10 ++++++----
fs/io-wq.h | 7 ++++---
fs/io_uring.c | 3 ++-
3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2bfa9117bc28..a44ad3b98886 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -112,6 +112,7 @@ struct io_wq {
unsigned long state;

free_work_fn *free_work;
+ io_wq_work_fn *do_work;

struct task_struct *manager;
struct user_struct *user;
@@ -528,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker)

hash = io_get_work_hash(work);
linked = old_work = work;
- linked->func(&linked);
+ wq->do_work(&linked);
linked = (old_work == linked) ? NULL : linked;

work = next_hashed;
@@ -785,7 +786,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
struct io_wq_work *old_work = work;

work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ wq->do_work(&work);
work = (work == old_work) ? NULL : work;
wq->free_work(old_work);
} while (work);
@@ -1027,7 +1028,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret = -ENOMEM, node;
struct io_wq *wq;

- if (WARN_ON_ONCE(!data->free_work))
+ if (WARN_ON_ONCE(!data->free_work || !data->do_work))
return ERR_PTR(-EINVAL);

wq = kzalloc(sizeof(*wq), GFP_KERNEL);
@@ -1041,6 +1042,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
}

wq->free_work = data->free_work;
+ wq->do_work = data->do_work;

/* caller must already hold a reference to this */
wq->user = data->user;
@@ -1097,7 +1099,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
- if (data->free_work != wq->free_work)
+ if (data->free_work != wq->free_work || data->do_work != wq->do_work)
return false;

return refcount_inc_not_zero(&wq->use_refs);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index df8a4cd3236d..f3bb596f5a3f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -85,7 +85,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,

struct io_wq_work {
struct io_wq_work_node list;
- void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
const struct cred *creds;
@@ -94,9 +93,9 @@ struct io_wq_work {
pid_t task_pid;
};

-#define INIT_IO_WORK(work, _func) \
+#define INIT_IO_WORK(work) \
do { \
- *(work) = (struct io_wq_work){ .func = _func }; \
+ *(work) = (struct io_wq_work){}; \
} while (0) \

static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
@@ -108,10 +107,12 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
}

typedef void (free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work **);

struct io_wq_data {
struct user_struct *user;

+ io_wq_work_fn *do_work;
free_work_fn *free_work;
};

diff --git a/fs/io_uring.c b/fs/io_uring.c
index adf18ff9fdb9..b4ca6026269c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5880,7 +5880,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
refcount_set(&req->refs, 2);
req->task = NULL;
req->result = 0;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
+ INIT_IO_WORK(&req->work);

if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
@@ -6896,6 +6896,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,

data.user = ctx->user;
data.free_work = io_free_work;
+ data.do_work = io_wq_submit_work;

if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
--
2.24.0

2020-06-08 20:04:40

by Jens Axboe

Subject: Re: [PATCH 0/4] remove work.func

On 6/8/20 12:08 PM, Pavel Begunkov wrote:
> As discussed, removing ->func from io_wq_work and moving
> it into io-wq.
>
> Pavel Begunkov (4):
> io_uring: don't derive close state from ->func
> io_uring: remove custom ->func handlers
> io_uring: don't arm a timeout through work.func
> io_wq: add per-wq work handler instead of per work

Thanks, this looks good and also nicely enables us to build on it to
eliminate that extra overhead. I have applied it for 5.8.

--
Jens Axboe