2022-02-03 00:17:50

by Usama Arif

[permalink] [raw]
Subject: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd

Acquire completion_lock at the start of __io_uring_register before
registering/unregistering eventfd and release it at the end. Hence
all calls to io_cqring_ev_posted which adds to the eventfd counter
will finish before acquiring the spin_lock in io_uring_register, and
all new calls will wait till the eventfd is registered. This avoids
ring quiesce which is much more expensive than acquiring the spin_lock.

On the system tested with this patch, io_uring_register with
IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.

Signed-off-by: Usama Arif <[email protected]>
Reviewed-by: Fam Zheng <[email protected]>
---
fs/io_uring.c | 50 ++++++++++++++++++++++++++++++++++----------------
1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2e04f718319d..e75d8abd225a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1803,11 +1803,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
}

- if (posted)
+ if (posted) {
io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
io_cqring_ev_posted(ctx);
+ }
+ spin_unlock(&ctx->completion_lock);
return all_flushed;
}

@@ -1971,8 +1971,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
spin_lock(&ctx->completion_lock);
__io_req_complete_post(req, res, cflags);
io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
+ spin_unlock(&ctx->completion_lock);
}

static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
@@ -2231,11 +2231,11 @@ static void __io_req_find_next_prep(struct io_kiocb *req)

spin_lock(&ctx->completion_lock);
posted = io_disarm_next(req);
- if (posted)
+ if (posted) {
io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
io_cqring_ev_posted(ctx);
+ }
+ spin_unlock(&ctx->completion_lock);
}

static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
@@ -2272,8 +2272,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
{
io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
+ spin_unlock(&ctx->completion_lock);
}

static void handle_prev_tw_list(struct io_wq_work_node *node,
@@ -2535,8 +2535,8 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
}

io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
+ spin_unlock(&ctx->completion_lock);
state->flush_cqes = false;
}

@@ -5541,10 +5541,12 @@ static int io_poll_check_events(struct io_kiocb *req)
filled = io_fill_cqe_aux(ctx, req->user_data, mask,
IORING_CQE_F_MORE);
io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (unlikely(!filled))
+ if (unlikely(!filled)) {
+ spin_unlock(&ctx->completion_lock);
return -ECANCELED;
+ }
io_cqring_ev_posted(ctx);
+ spin_unlock(&ctx->completion_lock);
} else if (req->result) {
return 0;
}
@@ -5579,8 +5581,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
hash_del(&req->hash_node);
__io_req_complete_post(req, req->result, 0);
io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
+ spin_unlock(&ctx->completion_lock);
}

static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
@@ -8351,8 +8353,8 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
spin_lock(&ctx->completion_lock);
io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
+ spin_unlock(&ctx->completion_lock);
io_ring_submit_unlock(ctx, lock_ring);
}

@@ -9639,11 +9641,11 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
}
}
spin_unlock_irq(&ctx->timeout_lock);
- if (canceled != 0)
+ if (canceled != 0) {
io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (canceled != 0)
io_cqring_ev_posted(ctx);
+ }
+ spin_unlock(&ctx->completion_lock);
return canceled != 0;
}

@@ -10970,6 +10972,8 @@ static bool io_register_op_must_quiesce(int op)
case IORING_REGISTER_IOWQ_AFF:
case IORING_UNREGISTER_IOWQ_AFF:
case IORING_REGISTER_IOWQ_MAX_WORKERS:
+ case IORING_REGISTER_EVENTFD:
+ case IORING_UNREGISTER_EVENTFD:
return false;
default:
return true;
@@ -11030,6 +11034,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return -EACCES;
}

+ /*
+ * Acquire completion_lock at the start of __io_uring_register before
+ * registering/unregistering eventfd and release it at the end. Any
+ * completion events pending before this call will finish before acquiring
+ * the spin_lock here, and all new completion events will wait till the
+ * eventfd is registered. This avoids ring quiesce which is much more
+ * expensive than acquiring the spin_lock.
+ */
+ if (opcode == IORING_REGISTER_EVENTFD || opcode == IORING_UNREGISTER_EVENTFD)
+ spin_lock(&ctx->completion_lock);
+
if (io_register_op_must_quiesce(opcode)) {
ret = io_ctx_quiesce(ctx);
if (ret)
@@ -11141,6 +11156,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
}

+ if (opcode == IORING_REGISTER_EVENTFD || opcode == IORING_UNREGISTER_EVENTFD)
+ spin_unlock(&ctx->completion_lock);
+
if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
--
2.25.1


2022-02-03 06:10:20

by Jens Axboe

[permalink] [raw]
Subject: Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd

On 2/2/22 9:57 AM, Jens Axboe wrote:
> On 2/2/22 8:59 AM, Usama Arif wrote:
>> Acquire completion_lock at the start of __io_uring_register before
>> registering/unregistering eventfd and release it at the end. Hence
>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>> will finish before acquiring the spin_lock in io_uring_register, and
>> all new calls will wait till the eventfd is registered. This avoids
>> ring quiesce which is much more expensive than acquiring the
>> spin_lock.
>>
>> On the system tested with this patch, io_uring_register with
>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>
> This seems like optimizing for the wrong thing, so I've got a few
> questions. Are you doing a lot of eventfd registrations (and
> unregister) in your workload? Or is it just the initial pain of
> registering one? In talking to Pavel, he suggested that RCU might be a
> good use case here, and I think so too. That would still remove the
> need to quiesce, and the posted side just needs a fairly cheap rcu
> read lock/unlock around it.

Totally untested, but perhaps can serve as a starting point or
inspiration.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 64c055421809..195752f4823f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -329,6 +329,12 @@ struct io_submit_state {
struct blk_plug plug;
};

+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ struct io_ring_ctx *ctx;
+ struct rcu_head rcu;
+};
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -412,7 +418,7 @@ struct io_ring_ctx {
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- struct eventfd_ctx *cq_ev_fd;
+ struct io_ev_fd *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -1741,13 +1747,27 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)

static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
- if (likely(!ctx->cq_ev_fd))
+ if (likely(!ctx->io_ev_fd))
return false;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return false;
return !ctx->eventfd_async || io_wq_current_is_worker();
}

+static void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd;
+
+ if (!io_should_trigger_evfd(ctx))
+ return;
+
+ rcu_read_lock();
+ ev_fd = READ_ONCE(ctx->io_ev_fd);
+ if (ev_fd)
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+ rcu_read_unlock();
+}
+
/*
* This should only get called when at least one event has been posted.
* Some applications rely on the eventfd notification count only changing
@@ -1764,8 +1784,7 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+ io_eventfd_signal(ctx);
}

static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1777,8 +1796,7 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
if (waitqueue_active(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
}
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+ io_eventfd_signal(ctx);
}

/* Returns true if there are no backlogged entries after the flush */
@@ -9569,31 +9587,49 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
+ struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;

- if (ctx->cq_ev_fd)
+ if (ctx->io_ev_fd)
return -EBUSY;

if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;

- ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ctx->cq_ev_fd)) {
- int ret = PTR_ERR(ctx->cq_ev_fd);
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
+
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);

- ctx->cq_ev_fd = NULL;
+ kfree(ev_fd);
return ret;
}

+ ev_fd->ctx = ctx;
+ WRITE_ONCE(ctx->io_ev_fd, ev_fd);
return 0;
}

+static void io_eventfd_put(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+ struct io_ring_ctx *ctx = ev_fd->ctx;
+
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+ WRITE_ONCE(ctx->io_ev_fd, NULL);
+}
+
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
- if (ctx->cq_ev_fd) {
- eventfd_ctx_put(ctx->cq_ev_fd);
- ctx->cq_ev_fd = NULL;
+ struct io_ev_fd *ev_fd = ctx->io_ev_fd;
+
+ if (ev_fd) {
+ call_rcu(&ev_fd->rcu, io_eventfd_put);
return 0;
}

@@ -9659,7 +9695,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
mutex_unlock(&ctx->uring_lock);
- io_eventfd_unregister(ctx);
+ if (ctx->io_ev_fd) {
+ eventfd_ctx_put(ctx->io_ev_fd->cq_ev_fd);
+ kfree(ctx->io_ev_fd);
+ }
io_destroy_buffers(ctx);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
@@ -11209,6 +11248,8 @@ static bool io_register_op_must_quiesce(int op)
case IORING_UNREGISTER_IOWQ_AFF:
case IORING_REGISTER_IOWQ_MAX_WORKERS:
case IORING_REGISTER_MAP_BUFFERS:
+ case IORING_REGISTER_EVENTFD:
+ case IORING_UNREGISTER_EVENTFD:
return false;
default:
return true;
@@ -11423,7 +11464,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
- ctx->cq_ev_fd != NULL, ret);
+ ctx->io_ev_fd != NULL, ret);
out_fput:
fdput(f);
return ret;

--
Jens Axboe

2022-02-03 20:43:14

by Usama Arif

[permalink] [raw]
Subject: Re: [External] Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd



On 02/02/2022 19:18, Jens Axboe wrote:
> On 2/2/22 9:57 AM, Jens Axboe wrote:
>> On 2/2/22 8:59 AM, Usama Arif wrote:
>>> Acquire completion_lock at the start of __io_uring_register before
>>> registering/unregistering eventfd and release it at the end. Hence
>>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>>> will finish before acquiring the spin_lock in io_uring_register, and
>>> all new calls will wait till the eventfd is registered. This avoids
>>> ring quiesce which is much more expensive than acquiring the
>>> spin_lock.
>>>
>>> On the system tested with this patch, io_uring_register with
>>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>>
>> This seems like optimizing for the wrong thing, so I've got a few
>> questions. Are you doing a lot of eventfd registrations (and
>> unregister) in your workload? Or is it just the initial pain of
>> registering one? In talking to Pavel, he suggested that RCU might be a
>> good use case here, and I think so too. That would still remove the
>> need to quiesce, and the posted side just needs a fairly cheap rcu
>> read lock/unlock around it.
>
> Totally untested, but perhaps can serve as a starting point or
> inspiration.
>

Hi,

Thank you for the replies and comments. My use case registers only one
eventfd at the start.

Thanks a lot for the diff below, it was a really good starting point!
I have sent a couple of patches for review implementing the RCU approach.
I think the diff below might have some issues, so I have done some parts
in a different way. Please have a look at my comments inline below, where
I point out possible problems such as race conditions, and at how the
patches I sent resolve them.

I see that if we remove ring quiesce from the above 3 opcodes, then only
IORING_REGISTER_ENABLE_RINGS and IORING_REGISTER_RESTRICTIONS are left
needing ring quiesce. I just had a quick look at those, and from what I
see we might not need to enter ring quiesce in
IORING_REGISTER_ENABLE_RINGS, as the ring is already disabled at that
point? And for IORING_REGISTER_RESTRICTIONS, if we take a similar
approach to IORING_REGISTER_EVENTFD, i.e. wrap ctx->restrictions inside
an RCU-protected data structure, use a spin_lock to prevent multiple
io_register_restrictions calls at the same time, and use rcu_read_lock
in io_check_restriction, then we could remove ring quiesce from
io_uring_register altogether?

My use case only uses IORING_REGISTER_EVENTFD, but I think entering ring
quiesce costs a similar amount for the other opcodes. If the above sounds
reasonable, please let me know and I can send patches removing ring
quiesce from io_uring_register.

Thanks again!
Usama

>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 64c055421809..195752f4823f 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -329,6 +329,12 @@ struct io_submit_state {
> struct blk_plug plug;
> };
>
> +struct io_ev_fd {
> + struct eventfd_ctx *cq_ev_fd;
> + struct io_ring_ctx *ctx;
> + struct rcu_head rcu;
> +};
> +
> struct io_ring_ctx {
> /* const or read-mostly hot data */
> struct {
> @@ -412,7 +418,7 @@ struct io_ring_ctx {
> struct {
> unsigned cached_cq_tail;
> unsigned cq_entries;
> - struct eventfd_ctx *cq_ev_fd;
> + struct io_ev_fd *io_ev_fd;
> struct wait_queue_head cq_wait;
> unsigned cq_extra;
> atomic_t cq_timeouts;
> @@ -1741,13 +1747,27 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
>
> static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
> {
> - if (likely(!ctx->cq_ev_fd))
> + if (likely(!ctx->io_ev_fd))
> return false;
> if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
> return false;
> return !ctx->eventfd_async || io_wq_current_is_worker();
> }
>
> +static void io_eventfd_signal(struct io_ring_ctx *ctx)
> +{
> + struct io_ev_fd *ev_fd;
> +
> + if (!io_should_trigger_evfd(ctx))
> + return;
> +

As the io_should_trigger_evfd call above is not inside the rcu_read_lock
section in this diff, another thread could, at this point, unregister the
eventfd that was just checked in io_should_trigger_evfd (call it
eventfd1) and register another one (eventfd2). If execution then switches
back to the thread running io_eventfd_signal, the eventfd_signal below
will be delivered to eventfd2, which is not right. I think there might be
other incorrect scenarios that can happen here as well.

What I have done to avoid this is treat ctx->io_ev_fd as an RCU-protected
data structure throughout the file. Hence the entire io_eventfd_signal is
a read-side critical section, and a single rcu_dereference'd ev_fd is
used in both io_should_trigger_evfd and eventfd_signal.
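
For illustration, here is a rough, untested sketch of what I mean (not
the exact code in the patches I sent; it assumes ctx->io_ev_fd is
annotated __rcu):

static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	rcu_read_lock();
	/* single snapshot used for both the checks and the signal */
	ev_fd = rcu_dereference(ctx->io_ev_fd);
	if (!ev_fd)
		goto out;
	/* the io_should_trigger_evfd() checks, inlined here for brevity */
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		goto out;
	if (!ctx->eventfd_async || io_wq_current_is_worker())
		eventfd_signal(ev_fd->cq_ev_fd, 1);
out:
	rcu_read_unlock();
}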


> + rcu_read_lock();
> + ev_fd = READ_ONCE(ctx->io_ev_fd);
> + if (ev_fd)
> + eventfd_signal(ev_fd->cq_ev_fd, 1);
> + rcu_read_unlock();
> +}
> +
> /*
> * This should only get called when at least one event has been posted.
> * Some applications rely on the eventfd notification count only changing
> @@ -1764,8 +1784,7 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
> */
> if (wq_has_sleeper(&ctx->cq_wait))
> wake_up_all(&ctx->cq_wait);
> - if (io_should_trigger_evfd(ctx))
> - eventfd_signal(ctx->cq_ev_fd, 1);
> + io_eventfd_signal(ctx);
> }
>
> static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
> @@ -1777,8 +1796,7 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
> if (waitqueue_active(&ctx->cq_wait))
> wake_up_all(&ctx->cq_wait);
> }
> - if (io_should_trigger_evfd(ctx))
> - eventfd_signal(ctx->cq_ev_fd, 1);
> + io_eventfd_signal(ctx);
> }
>
> /* Returns true if there are no backlogged entries after the flush */
> @@ -9569,31 +9587,49 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
>
> static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
> {
> + struct io_ev_fd *ev_fd;
> __s32 __user *fds = arg;
> int fd;
>
> - if (ctx->cq_ev_fd)
> + if (ctx->io_ev_fd)
> return -EBUSY;
>

You could have 2 threads call io_uring_register on the same ring at the
same time; they could both pass the above ctx->io_ev_fd check and then
race to assign ctx->io_ev_fd. I guess that's why writers take a lock when
using RCU; I have used ctx->ev_fd_lock in the patch I pushed for review.
Also, as ctx->io_ev_fd is RCU protected, it is only accessed through
rcu_dereference_protected/rcu_dereference/rcu_assign_pointer.
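
For illustration, a rough, untested sketch of the registration side (not
the exact code in the patches I sent; treating ev_fd_lock as a mutex here
is just for the example):

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd, ret = 0;

	mutex_lock(&ctx->ev_fd_lock);
	/* only one registration can win, later callers see -EBUSY */
	if (rcu_dereference_protected(ctx->io_ev_fd,
				      lockdep_is_held(&ctx->ev_fd_lock))) {
		ret = -EBUSY;
		goto out;
	}

	if (copy_from_user(&fd, fds, sizeof(*fds))) {
		ret = -EFAULT;
		goto out;
	}

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd) {
		ret = -ENOMEM;
		goto out;
	}

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		goto out;
	}

	ev_fd->ctx = ctx;
	/* publish only after ev_fd is fully initialised */
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
out:
	mutex_unlock(&ctx->ev_fd_lock);
	return ret;
}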


> if (copy_from_user(&fd, fds, sizeof(*fds)))
> return -EFAULT;
>
> - ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
> - if (IS_ERR(ctx->cq_ev_fd)) {
> - int ret = PTR_ERR(ctx->cq_ev_fd);
> + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
> + if (!ev_fd)
> + return -ENOMEM;
> +
> + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
> + if (IS_ERR(ev_fd->cq_ev_fd)) {
> + int ret = PTR_ERR(ev_fd->cq_ev_fd);
>
> - ctx->cq_ev_fd = NULL;
> + kfree(ev_fd);
> return ret;
> }
>
> + ev_fd->ctx = ctx;
> + WRITE_ONCE(ctx->io_ev_fd, ev_fd);
> return 0;
> }
>
> +static void io_eventfd_put(struct rcu_head *rcu)
> +{
> + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
> + struct io_ring_ctx *ctx = ev_fd->ctx;
> +
> + eventfd_ctx_put(ev_fd->cq_ev_fd);
> + kfree(ev_fd);
> + WRITE_ONCE(ctx->io_ev_fd, NULL);
> +}
> +
> static int io_eventfd_unregister(struct io_ring_ctx *ctx)
> {
> - if (ctx->cq_ev_fd) {
> - eventfd_ctx_put(ctx->cq_ev_fd);
> - ctx->cq_ev_fd = NULL;
> + struct io_ev_fd *ev_fd = ctx->io_ev_fd;
> +
> + if (ev_fd) {
> + call_rcu(&ev_fd->rcu, io_eventfd_put);
> return 0;
> }
>
> @@ -9659,7 +9695,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
> if (ctx->rings)
> __io_cqring_overflow_flush(ctx, true);
> mutex_unlock(&ctx->uring_lock);
> - io_eventfd_unregister(ctx);
> + if (ctx->io_ev_fd) {
> + eventfd_ctx_put(ctx->io_ev_fd->cq_ev_fd);
> + kfree(ctx->io_ev_fd);
> + }
> io_destroy_buffers(ctx);
> if (ctx->sq_creds)
> put_cred(ctx->sq_creds);
> @@ -11209,6 +11248,8 @@ static bool io_register_op_must_quiesce(int op)
> case IORING_UNREGISTER_IOWQ_AFF:
> case IORING_REGISTER_IOWQ_MAX_WORKERS:
> case IORING_REGISTER_MAP_BUFFERS:
> + case IORING_REGISTER_EVENTFD:
> + case IORING_UNREGISTER_EVENTFD:
> return false;
> default:
> return true;
> @@ -11423,7 +11464,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
> ret = __io_uring_register(ctx, opcode, arg, nr_args);
> mutex_unlock(&ctx->uring_lock);
> trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
> - ctx->cq_ev_fd != NULL, ret);
> + ctx->io_ev_fd != NULL, ret);
> out_fput:
> fdput(f);
> return ret;
>

2022-02-04 01:17:06

by Jens Axboe

[permalink] [raw]
Subject: Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd

On 2/2/22 8:59 AM, Usama Arif wrote:
> Acquire completion_lock at the start of __io_uring_register before
> registering/unregistering eventfd and release it at the end. Hence
> all calls to io_cqring_ev_posted which adds to the eventfd counter
> will finish before acquiring the spin_lock in io_uring_register, and
> all new calls will wait till the eventfd is registered. This avoids
> ring quiesce which is much more expensive than acquiring the spin_lock.
>
> On the system tested with this patch, io_uring_register with
> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.

This seems like optimizing for the wrong thing, so I've got a few
questions. Are you doing a lot of eventfd registrations (and unregister)
in your workload? Or is it just the initial pain of registering one? In
talking to Pavel, he suggested that RCU might be a good use case here,
and I think so too. That would still remove the need to quiesce, and the
posted side just needs a fairly cheap rcu read lock/unlock around it.

--
Jens Axboe

2022-02-04 05:23:21

by Pavel Begunkov

[permalink] [raw]
Subject: Re: [External] Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd

On 2/3/22 15:14, Usama Arif wrote:
> On 02/02/2022 19:18, Jens Axboe wrote:
>> On 2/2/22 9:57 AM, Jens Axboe wrote:
>>> On 2/2/22 8:59 AM, Usama Arif wrote:
>>>> Acquire completion_lock at the start of __io_uring_register before
>>>> registering/unregistering eventfd and release it at the end. Hence
>>>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>>>> will finish before acquiring the spin_lock in io_uring_register, and
>>>> all new calls will wait till the eventfd is registered. This avoids
>>>> ring quiesce which is much more expensive than acquiring the
>>>> spin_lock.
>>>>
>>>> On the system tested with this patch, io_uring_register with
>>>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>>>
>>> This seems like optimizing for the wrong thing, so I've got a few
>>> questions. Are you doing a lot of eventfd registrations (and
>>> unregister) in your workload? Or is it just the initial pain of
>>> registering one? In talking to Pavel, he suggested that RCU might be a
>>> good use case here, and I think so too. That would still remove the
>>> need to quiesce, and the posted side just needs a fairly cheap rcu
>>> read lock/unlock around it.
>>
>> Totally untested, but perhaps can serve as a starting point or
>> inspiration.
>>
>
> Hi,
>
> Thank you for the replies and comments. My usecase registers only one eventfd at the start.

Then it's overkill. Update io_register_op_must_quiesce(), set ->cq_ev_fd
on registration with WRITE_ONCE(), read it in io_cqring_ev_posted* with
READ_ONCE() and you're set.
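
Roughly, as an untested sketch (registration error handling and the
unregister path omitted):

	/* io_eventfd_register(), under ->uring_lock, no quiesce */
	struct eventfd_ctx *new_ev_fd = eventfd_ctx_fdget(fd);

	if (IS_ERR(new_ev_fd))
		return PTR_ERR(new_ev_fd);
	WRITE_ONCE(ctx->cq_ev_fd, new_ev_fd);

	/* io_cqring_ev_posted*() */
	struct eventfd_ctx *ev_fd = READ_ONCE(ctx->cq_ev_fd);

	if (ev_fd && !(READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) &&
	    (!ctx->eventfd_async || io_wq_current_is_worker()))
		eventfd_signal(ev_fd, 1);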

There is a caveat: ->cq_ev_fd won't be immediately visible to already
inflight requests, but we can say it's the responsibility of userspace
to wait for a grace period, i.e. for inflight requests submitted before
registration, io_cqring_ev_posted* might or might not see the updated
->cq_ev_fd, which works perfectly if there were no requests in the first
place. Of course it changes the behaviour, so it will need a new register
opcode.

--
Pavel Begunkov

2022-02-04 08:56:33

by Pavel Begunkov

[permalink] [raw]
Subject: Re: [External] Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd

On 2/3/22 15:44, Pavel Begunkov wrote:
> On 2/3/22 15:14, Usama Arif wrote:
>> On 02/02/2022 19:18, Jens Axboe wrote:
>>> On 2/2/22 9:57 AM, Jens Axboe wrote:
>>>> On 2/2/22 8:59 AM, Usama Arif wrote:
>>>>> Acquire completion_lock at the start of __io_uring_register before
>>>>> registering/unregistering eventfd and release it at the end. Hence
>>>>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>>>>> will finish before acquiring the spin_lock in io_uring_register, and
>>>>> all new calls will wait till the eventfd is registered. This avoids
>>>>> ring quiesce which is much more expensive than acquiring the
>>>>> spin_lock.
>>>>>
>>>>> On the system tested with this patch, io_uring_register with
>>>>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>>>>
>>>> This seems like optimizing for the wrong thing, so I've got a few
>>>> questions. Are you doing a lot of eventfd registrations (and
>>>> unregister) in your workload? Or is it just the initial pain of
>>>> registering one? In talking to Pavel, he suggested that RCU might be a
>>>> good use case here, and I think so too. That would still remove the
>>>> need to quiesce, and the posted side just needs a fairly cheap rcu
>>>> read lock/unlock around it.
>>>
>>> Totally untested, but perhaps can serve as a starting point or
>>> inspiration.
>>>
>>
>> Hi,
>>
>> Thank you for the replies and comments. My usecase registers only one eventfd at the start.
>
> Then it's overkill. Update io_register_op_must_quiesce(), set ->cq_ev_fd
> on registration with WRITE_ONCE(), read it in io_cqring_ev_posted* with
> READ_ONCE() and you're set.

Actually needs smp_store_release/smp_load_acquire
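
i.e. something like (untested):

	/* registration: publish after the eventfd ctx is fully set up */
	smp_store_release(&ctx->cq_ev_fd, new_ev_fd);

	/* io_cqring_ev_posted*(): paired load */
	struct eventfd_ctx *ev_fd = smp_load_acquire(&ctx->cq_ev_fd);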


> There is a caveat, ->cq_ev_fd won't be immediately visible to already
> inflight requests, but we can say it's the responsibility of the
> userspace to wait for a grace period, i.e. for all inflight requests
> submitted before registration io_cqring_ev_posted* might or might not
> see updated ->cq_ev_fd, which works perfectly if there was no requests
> in the first place. Of course it changes the behaviour so will need
> a new register opcode.
>

--
Pavel Begunkov