2023-04-04 12:42:23

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 00/13] optimise registered buffer/file updates

The patchset optimises registered files and buffers updates / removals,
The rsrc-update-bench test showes 11x improvement (1040K -> 11468K
updates / sec). It also improves latency by eliminating rcu grace
period waiting and bouncing it to another worker, and reduces
memory footprint by removing percpu refs.

That's quite important for apps updating files/buffers with medium or
higher frequency as updates are slow and expensive, and it currently
takes quite a number of IO requests per update to make using fixed
files/buffers worthwhile.

Another upside is that it makes it simpler, patch 9 removes very
convoluted synchronisation via flush_delayed_work() from the quiesce
path.

v2: rebase, add patches 12 and 13 to remove the last pair atomics out
of the path and to limit caching.

Pavel Begunkov (13):
io_uring/rsrc: use non-pcpu refcounts for nodes
io_uring/rsrc: keep cached refs per node
io_uring: don't put nodes under spinlocks
io_uring: io_free_req() via tw
io_uring/rsrc: protect node refs with uring_lock
io_uring/rsrc: kill rsrc_ref_lock
io_uring/rsrc: rename rsrc_list
io_uring/rsrc: optimise io_rsrc_put allocation
io_uring/rsrc: don't offload node free
io_uring/rsrc: cache struct io_rsrc_node
io_uring/rsrc: add lockdep sanity checks
io_uring/rsrc: optimise io_rsrc_data refcounting
io_uring/rsrc: add custom limit for node caching

include/linux/io_uring_types.h | 8 +-
io_uring/alloc_cache.h | 6 +-
io_uring/io_uring.c | 54 ++++++----
io_uring/rsrc.c | 176 ++++++++++++---------------------
io_uring/rsrc.h | 58 +++++------
5 files changed, 136 insertions(+), 166 deletions(-)

--
2.39.1


2023-04-04 12:42:32

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 01/13] io_uring/rsrc: use non-pcpu refcounts for nodes

One problem with the current rsrc infra is that often updates will
generates lots of rsrc nodes, each carry pcpu refs. That takes quite a
lot of memory, especially if there is a stall, and takes lots of CPU
cycles. Only pcpu allocations takes >50 of CPU with a naive benchmark
updating files in a loop.

Replace pcpu refs with normal refcounting. There is already a hot path
avoiding atomics / refs, but following patches will further improve it.

Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/rsrc.c | 15 +++++----------
io_uring/rsrc.h | 6 ++++--
2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a5ed0ee7c160..1b9b7f98fb7e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -155,7 +155,7 @@ void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
- percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+ refcount_add(IO_RSRC_REF_BATCH, &ctx->rsrc_node->refs);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
@@ -220,13 +220,11 @@ void io_wait_rsrc_data(struct io_rsrc_data *data)

void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
- percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
}

-static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+__cold void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
{
- struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
unsigned long flags;
bool first_add = false;
@@ -269,11 +267,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void)
if (!ref_node)
return NULL;

- if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
- 0, GFP_KERNEL)) {
- kfree(ref_node);
- return NULL;
- }
+ refcount_set(&ref_node->refs, 1);
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->rsrc_list);
ref_node->done = false;
@@ -298,7 +292,8 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
spin_unlock_irq(&ctx->rsrc_ref_lock);

atomic_inc(&data_to_kill->refs);
- percpu_ref_kill(&rsrc_node->refs);
+ /* put master ref */
+ io_rsrc_put_node(rsrc_node, 1);
ctx->rsrc_node = NULL;
}

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f27f4975217d..1467b31843bc 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -37,7 +37,7 @@ struct io_rsrc_data {
};

struct io_rsrc_node {
- struct percpu_ref refs;
+ refcount_t refs;
struct list_head node;
struct list_head rsrc_list;
struct io_rsrc_data *rsrc_data;
@@ -54,6 +54,7 @@ struct io_mapped_ubuf {
};

void io_rsrc_put_tw(struct callback_head *cb);
+void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_put_work(struct work_struct *work);
void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
void io_wait_rsrc_data(struct io_rsrc_data *data);
@@ -109,7 +110,8 @@ int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,

static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
{
- percpu_ref_put_many(&node->refs, nr);
+ if (refcount_sub_and_test(nr, &node->refs))
+ io_rsrc_node_ref_zero(node);
}

static inline void io_req_put_rsrc(struct io_kiocb *req)
--
2.39.1

2023-04-04 12:42:41

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 03/13] io_uring: don't put nodes under spinlocks

io_req_put_rsrc() doesn't need any locking, so move it out of
a spinlock section in __io_req_complete_post() and adjust helpers.

Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/io_uring.c | 7 +++++--
io_uring/rsrc.h | 6 +++---
2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a0b64831c455..596af20cddb4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -970,6 +970,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32
static void __io_req_complete_post(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_rsrc_node *rsrc_node = NULL;

io_cq_lock(ctx);
if (!(req->flags & REQ_F_CQE_SKIP))
@@ -990,7 +991,7 @@ static void __io_req_complete_post(struct io_kiocb *req)
}
io_put_kbuf_comp(req);
io_dismantle_req(req);
- io_req_put_rsrc(req);
+ rsrc_node = req->rsrc_node;
/*
* Selected buffer deallocation in io_clean_op() assumes that
* we don't hold ->completion_lock. Clean them here to avoid
@@ -1001,6 +1002,8 @@ static void __io_req_complete_post(struct io_kiocb *req)
ctx->locked_free_nr++;
}
io_cq_unlock_post(ctx);
+
+ io_put_rsrc_node(rsrc_node);
}

void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
@@ -1117,7 +1120,7 @@ __cold void io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;

- io_req_put_rsrc(req);
+ io_put_rsrc_node(req->rsrc_node);
io_dismantle_req(req);
io_put_task_remote(req->task, 1);

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 950535e2b9f4..8164777279ba 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -115,10 +115,10 @@ static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
io_rsrc_node_ref_zero(node);
}

-static inline void io_req_put_rsrc(struct io_kiocb *req)
+static inline void io_put_rsrc_node(struct io_rsrc_node *node)
{
- if (req->rsrc_node)
- io_rsrc_put_node(req->rsrc_node, 1);
+ if (node)
+ io_rsrc_put_node(node, 1);
}

static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
--
2.39.1

2023-04-04 12:42:51

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 04/13] io_uring: io_free_req() via tw

io_free_req() is not often used but nevertheless problematic as there is
no way to know the current context, it may be used from the submission
path or even by an irq handler. Push it to a fresh context using
task_work.

Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/io_uring.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 596af20cddb4..98320f4b0bca 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1116,7 +1116,7 @@ static inline void io_dismantle_req(struct io_kiocb *req)
io_put_file(req->file);
}

-__cold void io_free_req(struct io_kiocb *req)
+static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_ring_ctx *ctx = req->ctx;

@@ -1130,6 +1130,12 @@ __cold void io_free_req(struct io_kiocb *req)
spin_unlock(&ctx->completion_lock);
}

+__cold void io_free_req(struct io_kiocb *req)
+{
+ req->io_task_work.func = io_free_req_tw;
+ io_req_task_work_add(req);
+}
+
static void __io_req_find_next_prep(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
--
2.39.1

2023-04-04 12:43:05

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 07/13] io_uring/rsrc: rename rsrc_list

We have too many "rsrc" around which makes the name of struct
io_rsrc_node::rsrc_list confusing. The field is responsible for keeping
a list of files or buffers, so call it item_list and add comments
around.

Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/rsrc.c | 6 +++---
io_uring/rsrc.h | 8 +++++++-
2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index f3493b9d2bbb..2378beecdc0a 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -146,7 +146,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
struct io_ring_ctx *ctx = rsrc_data->ctx;
struct io_rsrc_put *prsrc, *tmp;

- list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
+ list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) {
list_del(&prsrc->list);

if (prsrc->tag) {
@@ -249,7 +249,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void)

ref_node->refs = 1;
INIT_LIST_HEAD(&ref_node->node);
- INIT_LIST_HEAD(&ref_node->rsrc_list);
+ INIT_LIST_HEAD(&ref_node->item_list);
ref_node->done = false;
return ref_node;
}
@@ -737,7 +737,7 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
prsrc->tag = *tag_slot;
*tag_slot = 0;
prsrc->rsrc = rsrc;
- list_add(&prsrc->list, &node->rsrc_list);
+ list_add(&prsrc->list, &node->item_list);
return 0;
}

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index a96103095f0f..509a5ea7eabf 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -38,11 +38,17 @@ struct io_rsrc_data {

struct io_rsrc_node {
struct list_head node;
- struct list_head rsrc_list;
struct io_rsrc_data *rsrc_data;
struct llist_node llist;
int refs;
bool done;
+
+ /*
+ * Keeps a list of struct io_rsrc_put to be completed. Each entry
+ * represents one rsrc (e.g. file or buffer), but all of them should've
+ * came from the same table and so are of the same type.
+ */
+ struct list_head item_list;
};

struct io_mapped_ubuf {
--
2.39.1

2023-04-04 12:43:13

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 05/13] io_uring/rsrc: protect node refs with uring_lock

Currently, for nodes we have an atomic counter and some cached
(non-atomic) refs protected by uring_lock. Let's put all ref
manipulations under uring_lock and get rid of the atomic part.
It's free as in all cases we care about we already hold the lock.

Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/io_uring.c | 18 ++++++++++++------
io_uring/rsrc.c | 30 ++++--------------------------
io_uring/rsrc.h | 29 +++++------------------------
3 files changed, 21 insertions(+), 56 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 98320f4b0bca..36a76c7b34f0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -967,7 +967,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32
return true;
}

-static void __io_req_complete_post(struct io_kiocb *req)
+static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_rsrc_node *rsrc_node = NULL;
@@ -1003,7 +1003,11 @@ static void __io_req_complete_post(struct io_kiocb *req)
}
io_cq_unlock_post(ctx);

- io_put_rsrc_node(rsrc_node);
+ if (rsrc_node) {
+ io_ring_submit_lock(ctx, issue_flags);
+ io_put_rsrc_node(rsrc_node);
+ io_ring_submit_unlock(ctx, issue_flags);
+ }
}

void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
@@ -1013,12 +1017,12 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
io_req_task_work_add(req);
} else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
!(req->ctx->flags & IORING_SETUP_IOPOLL)) {
- __io_req_complete_post(req);
+ __io_req_complete_post(req, issue_flags);
} else {
struct io_ring_ctx *ctx = req->ctx;

mutex_lock(&ctx->uring_lock);
- __io_req_complete_post(req);
+ __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED);
mutex_unlock(&ctx->uring_lock);
}
}
@@ -1120,7 +1124,10 @@ static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_ring_ctx *ctx = req->ctx;

- io_put_rsrc_node(req->rsrc_node);
+ if (req->rsrc_node) {
+ io_tw_lock(ctx, ts);
+ io_put_rsrc_node(req->rsrc_node);
+ }
io_dismantle_req(req);
io_put_task_remote(req->task, 1);

@@ -2790,7 +2797,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
- io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
io_wait_rsrc_data(ctx->file_data);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index e9187d49d558..89e43e59b490 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -27,23 +27,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu,
struct page **last_hpage);

-#define IO_RSRC_REF_BATCH 100
-
/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)

-void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- struct io_rsrc_node *node = ctx->rsrc_node;
-
- if (node && node->cached_refs) {
- io_rsrc_put_node(node, node->cached_refs);
- node->cached_refs = 0;
- }
-}
-
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
@@ -153,13 +140,6 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
*slot = NULL;
}

-void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
- __must_hold(&ctx->uring_lock)
-{
- node->cached_refs += IO_RSRC_REF_BATCH;
- refcount_add(IO_RSRC_REF_BATCH, &node->refs);
-}
-
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
@@ -225,7 +205,8 @@ void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
kfree(ref_node);
}

-__cold void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
+void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
+ __must_hold(&node->rsrc_data->ctx->uring_lock)
{
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
unsigned long flags;
@@ -269,7 +250,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void)
if (!ref_node)
return NULL;

- refcount_set(&ref_node->refs, 1);
+ ref_node->refs = 1;
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->rsrc_list);
ref_node->done = false;
@@ -283,8 +264,6 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

- io_rsrc_refs_drop(ctx);
-
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

@@ -295,14 +274,13 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,

atomic_inc(&data_to_kill->refs);
/* put master ref */
- io_rsrc_put_node(rsrc_node, 1);
+ io_put_rsrc_node(rsrc_node);
ctx->rsrc_node = NULL;
}

if (!ctx->rsrc_node) {
ctx->rsrc_node = ctx->rsrc_backup_node;
ctx->rsrc_backup_node = NULL;
- ctx->rsrc_node->cached_refs = 0;
}
}

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 8164777279ba..a96103095f0f 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -37,13 +37,12 @@ struct io_rsrc_data {
};

struct io_rsrc_node {
- refcount_t refs;
struct list_head node;
struct list_head rsrc_list;
struct io_rsrc_data *rsrc_data;
struct llist_node llist;
+ int refs;
bool done;
- int cached_refs;
};

struct io_mapped_ubuf {
@@ -57,10 +56,8 @@ struct io_mapped_ubuf {
void io_rsrc_put_tw(struct callback_head *cb);
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_put_work(struct work_struct *work);
-void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
void io_wait_rsrc_data(struct io_rsrc_data *data);
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
-void io_rsrc_refs_drop(struct io_ring_ctx *ctx);
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc);
@@ -109,38 +106,22 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);

-static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
-{
- if (refcount_sub_and_test(nr, &node->refs))
- io_rsrc_node_ref_zero(node);
-}
-
static inline void io_put_rsrc_node(struct io_rsrc_node *node)
{
- if (node)
- io_rsrc_put_node(node, 1);
+ if (node && !--node->refs)
+ io_rsrc_node_ref_zero(node);
}

static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
{
- struct io_rsrc_node *node = req->rsrc_node;
-
- if (node) {
- if (node == ctx->rsrc_node)
- node->cached_refs++;
- else
- io_rsrc_put_node(node, 1);
- }
+ io_put_rsrc_node(req->rsrc_node);
}

static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
struct io_rsrc_node *node)
{
- node->cached_refs--;
- if (unlikely(node->cached_refs < 0))
- io_rsrc_refs_refill(ctx, node);
+ node->refs++;
}

static inline void io_req_set_rsrc_node(struct io_kiocb *req,
--
2.39.1

2023-04-04 12:43:25

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 06/13] io_uring/rsrc: kill rsrc_ref_lock

We use ->rsrc_ref_lock spinlock to protect ->rsrc_ref_list in
io_rsrc_node_ref_zero(). Now we removed pcpu refcounting, which means
io_rsrc_node_ref_zero() is not executed from the irq context as an RCU
callback anymore, and we also put it under ->uring_lock.
io_rsrc_node_switch(), which queues up nodes into the list, is also
protected by ->uring_lock, so we can safely get rid of ->rsrc_ref_lock.

Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/io_uring_types.h | 2 +-
io_uring/io_uring.c | 1 -
io_uring/rsrc.c | 5 -----
3 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index a0a5b5964d3a..9492889f00c0 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -333,8 +333,8 @@ struct io_ring_ctx {
struct delayed_work rsrc_put_work;
struct callback_head rsrc_put_tw;
struct llist_head rsrc_put_llist;
+ /* protected by ->uring_lock */
struct list_head rsrc_ref_list;
- spinlock_t rsrc_ref_lock;

struct list_head io_buffers_pages;

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 36a76c7b34f0..764df5694d73 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -325,7 +325,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
- spin_lock_init(&ctx->rsrc_ref_lock);
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 89e43e59b490..f3493b9d2bbb 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -209,11 +209,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
__must_hold(&node->rsrc_data->ctx->uring_lock)
{
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
- unsigned long flags;
bool first_add = false;
unsigned long delay = HZ;

- spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
node->done = true;

/* if we are mid-quiesce then do not delay */
@@ -229,7 +227,6 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
list_del(&node->node);
first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
}
- spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

if (!first_add)
return;
@@ -268,9 +265,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

rsrc_node->rsrc_data = data_to_kill;
- spin_lock_irq(&ctx->rsrc_ref_lock);
list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
- spin_unlock_irq(&ctx->rsrc_ref_lock);

atomic_inc(&data_to_kill->refs);
/* put master ref */
--
2.39.1

2023-04-04 12:43:29

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 08/13] io_uring/rsrc: optimise io_rsrc_put allocation

Every io_rsrc_node keeps a list of items to put, and all entries are
kmalloc()'ed. However, it's quite often to queue up only one entry per
node, so let's add an inline entry there to avoid extra allocations.

Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/rsrc.c | 51 ++++++++++++++++++++++++++++++++-----------------
io_uring/rsrc.h | 2 ++
2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 2378beecdc0a..9647c02be0dc 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -140,26 +140,34 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
*slot = NULL;
}

+static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
+ struct io_rsrc_put *prsrc)
+{
+ struct io_ring_ctx *ctx = rsrc_data->ctx;
+
+ if (prsrc->tag) {
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ mutex_lock(&ctx->uring_lock);
+ io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
+ }
+ }
+ rsrc_data->do_put(ctx, prsrc);
+}
+
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
- struct io_ring_ctx *ctx = rsrc_data->ctx;
struct io_rsrc_put *prsrc, *tmp;

+ if (ref_node->inline_items)
+ io_rsrc_put_work_one(rsrc_data, &ref_node->item);
+
list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) {
list_del(&prsrc->list);
-
- if (prsrc->tag) {
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- mutex_lock(&ctx->uring_lock);
- io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
- mutex_unlock(&ctx->uring_lock);
- } else {
- io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
- }
- }
-
- rsrc_data->do_put(ctx, prsrc);
+ io_rsrc_put_work_one(rsrc_data, prsrc);
kfree(prsrc);
}

@@ -251,6 +259,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void)
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->item_list);
ref_node->done = false;
+ ref_node->inline_items = 0;
return ref_node;
}

@@ -729,15 +738,23 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
{
u64 *tag_slot = io_get_tag_slot(data, idx);
struct io_rsrc_put *prsrc;
+ bool inline_item = true;

- prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
- if (!prsrc)
- return -ENOMEM;
+ if (!node->inline_items) {
+ prsrc = &node->item;
+ node->inline_items++;
+ } else {
+ prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
+ if (!prsrc)
+ return -ENOMEM;
+ inline_item = false;
+ }

prsrc->tag = *tag_slot;
*tag_slot = 0;
prsrc->rsrc = rsrc;
- list_add(&prsrc->list, &node->item_list);
+ if (!inline_item)
+ list_add(&prsrc->list, &node->item_list);
return 0;
}

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 509a5ea7eabf..11703082d125 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -49,6 +49,8 @@ struct io_rsrc_node {
* came from the same table and so are of the same type.
*/
struct list_head item_list;
+ struct io_rsrc_put item;
+ int inline_items;
};

struct io_mapped_ubuf {
--
2.39.1

2023-04-04 12:43:35

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 02/13] io_uring/rsrc: keep cached refs per node

We cache refs of the current node (i.e. ctx->rsrc_node) in
ctx->rsrc_cached_refs. We'll be moving away from atomics, so move the
cached refs in struct io_rsrc_node for now. It's a prep patch and
shouldn't change anything in practise.

Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/io_uring_types.h | 1 -
io_uring/rsrc.c | 15 +++++++++------
io_uring/rsrc.h | 16 +++++++++-------
3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 561fa421c453..a0a5b5964d3a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -240,7 +240,6 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
- int rsrc_cached_refs;
atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 1b9b7f98fb7e..e9187d49d558 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -36,9 +36,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- if (ctx->rsrc_cached_refs) {
- io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
- ctx->rsrc_cached_refs = 0;
+ struct io_rsrc_node *node = ctx->rsrc_node;
+
+ if (node && node->cached_refs) {
+ io_rsrc_put_node(node, node->cached_refs);
+ node->cached_refs = 0;
}
}

@@ -151,11 +153,11 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
*slot = NULL;
}

-void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
__must_hold(&ctx->uring_lock)
{
- ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
- refcount_add(IO_RSRC_REF_BATCH, &ctx->rsrc_node->refs);
+ node->cached_refs += IO_RSRC_REF_BATCH;
+ refcount_add(IO_RSRC_REF_BATCH, &node->refs);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
@@ -300,6 +302,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
if (!ctx->rsrc_node) {
ctx->rsrc_node = ctx->rsrc_backup_node;
ctx->rsrc_backup_node = NULL;
+ ctx->rsrc_node->cached_refs = 0;
}
}

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 1467b31843bc..950535e2b9f4 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -43,6 +43,7 @@ struct io_rsrc_node {
struct io_rsrc_data *rsrc_data;
struct llist_node llist;
bool done;
+ int cached_refs;
};

struct io_mapped_ubuf {
@@ -56,7 +57,7 @@ struct io_mapped_ubuf {
void io_rsrc_put_tw(struct callback_head *cb);
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_put_work(struct work_struct *work);
-void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
+void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
void io_wait_rsrc_data(struct io_rsrc_data *data);
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
void io_rsrc_refs_drop(struct io_ring_ctx *ctx);
@@ -128,17 +129,18 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req,

if (node) {
if (node == ctx->rsrc_node)
- ctx->rsrc_cached_refs++;
+ node->cached_refs++;
else
io_rsrc_put_node(node, 1);
}
}

-static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx)
+static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
+ struct io_rsrc_node *node)
{
- ctx->rsrc_cached_refs--;
- if (unlikely(ctx->rsrc_cached_refs < 0))
- io_rsrc_refs_refill(ctx);
+ node->cached_refs--;
+ if (unlikely(node->cached_refs < 0))
+ io_rsrc_refs_refill(ctx, node);
}

static inline void io_req_set_rsrc_node(struct io_kiocb *req,
@@ -151,7 +153,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
lockdep_assert_held(&ctx->uring_lock);

req->rsrc_node = ctx->rsrc_node;
- io_charge_rsrc_node(ctx);
+ io_charge_rsrc_node(ctx, ctx->rsrc_node);
io_ring_submit_unlock(ctx, issue_flags);
}
}
--
2.39.1

2023-04-04 12:44:36

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 10/13] io_uring/rsrc: cache struct io_rsrc_node

Add allocation cache for struct io_rsrc_node, it's always allocated and
put under ->uring_lock, so it doesn't need any extra synchronisation
around caches.

Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/io_uring_types.h | 1 +
io_uring/io_uring.c | 11 +++++++++--
io_uring/rsrc.c | 23 +++++++++++++++--------
io_uring/rsrc.h | 9 +++++++--
4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 47496059e13a..5d772e36e7fc 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -332,6 +332,7 @@ struct io_ring_ctx {

/* protected by ->uring_lock */
struct list_head rsrc_ref_list;
+ struct io_alloc_cache rsrc_node_cache;

struct list_head io_buffers_pages;

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d6a0025afc31..419d6f42935f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -310,6 +310,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node));
io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
init_completion(&ctx->ref_comp);
@@ -2790,6 +2791,11 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}

+static void io_rsrc_node_cache_free(struct io_cache_entry *entry)
+{
+ kfree(container_of(entry, struct io_rsrc_node, cache));
+}
+
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -2815,9 +2821,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)

/* there are no registered resources left, nobody uses it */
if (ctx->rsrc_node)
- io_rsrc_node_destroy(ctx->rsrc_node);
+ io_rsrc_node_destroy(ctx, ctx->rsrc_node);
if (ctx->rsrc_backup_node)
- io_rsrc_node_destroy(ctx->rsrc_backup_node);
+ io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node);

WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));

@@ -2829,6 +2835,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

+ io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
if (ctx->mm_account) {
mmdrop(ctx->mm_account);
ctx->mm_account = NULL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 77cb2f8cfd68..cbf563fcb053 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -164,7 +164,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
kfree(prsrc);
}

- io_rsrc_node_destroy(ref_node);
+ io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
if (atomic_dec_and_test(&rsrc_data->refs))
complete(&rsrc_data->done);
}
@@ -175,9 +175,10 @@ void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}

-void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
+void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
- kfree(ref_node);
+ if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
+ kfree(node);
}

void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
@@ -198,13 +199,19 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
}
}

-static struct io_rsrc_node *io_rsrc_node_alloc(void)
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
struct io_rsrc_node *ref_node;
+ struct io_cache_entry *entry;

- ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
- if (!ref_node)
- return NULL;
+ entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
+ if (entry) {
+ ref_node = container_of(entry, struct io_rsrc_node, cache);
+ } else {
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return NULL;
+ }

ref_node->refs = 1;
INIT_LIST_HEAD(&ref_node->node);
@@ -243,7 +250,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
if (ctx->rsrc_backup_node)
return 0;
- ctx->rsrc_backup_node = io_rsrc_node_alloc();
+ ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 11703082d125..3b9f4c57c47c 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -4,6 +4,8 @@

#include <net/af_unix.h>

+#include "alloc_cache.h"
+
#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
@@ -37,8 +39,11 @@ struct io_rsrc_data {
};

struct io_rsrc_node {
+ union {
+ struct io_cache_entry cache;
+ struct io_rsrc_data *rsrc_data;
+ };
struct list_head node;
- struct io_rsrc_data *rsrc_data;
struct llist_node llist;
int refs;
bool done;
@@ -65,7 +70,7 @@ void io_rsrc_put_tw(struct callback_head *cb);
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_put_work(struct work_struct *work);
void io_wait_rsrc_data(struct io_rsrc_data *data);
-void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
+void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc);
--
2.39.1

2023-04-04 12:44:45

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 11/13] io_uring/rsrc: add lockdep sanity checks

We should hold ->uring_lock while putting nodes with io_put_rsrc_node(),
add a lockdep check for that.

Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/io_uring.c | 4 ++--
io_uring/rsrc.c | 2 +-
io_uring/rsrc.h | 6 ++++--
3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 419d6f42935f..da36fa1eeac9 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1002,7 +1002,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)

if (rsrc_node) {
io_ring_submit_lock(ctx, issue_flags);
- io_put_rsrc_node(rsrc_node);
+ io_put_rsrc_node(ctx, rsrc_node);
io_ring_submit_unlock(ctx, issue_flags);
}
}
@@ -1123,7 +1123,7 @@ static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts)

if (req->rsrc_node) {
io_tw_lock(ctx, ts);
- io_put_rsrc_node(req->rsrc_node);
+ io_put_rsrc_node(ctx, req->rsrc_node);
}
io_dismantle_req(req);
io_put_task_remote(req->task, 1);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index cbf563fcb053..95edc5f73204 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -236,7 +236,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,

atomic_inc(&data_to_kill->refs);
/* put master ref */
- io_put_rsrc_node(rsrc_node);
+ io_put_rsrc_node(ctx, rsrc_node);
ctx->rsrc_node = NULL;
}

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 3b9f4c57c47c..cf24c3fd701f 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -119,8 +119,10 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);

-static inline void io_put_rsrc_node(struct io_rsrc_node *node)
+static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
+ lockdep_assert_held(&ctx->uring_lock);
+
if (node && !--node->refs)
io_rsrc_node_ref_zero(node);
}
@@ -128,7 +130,7 @@ static inline void io_put_rsrc_node(struct io_rsrc_node *node)
static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
struct io_ring_ctx *ctx)
{
- io_put_rsrc_node(req->rsrc_node);
+ io_put_rsrc_node(ctx, req->rsrc_node);
}

static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
--
2.39.1

2023-04-04 12:44:45

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 13/13] io_uring/rsrc: add custom limit for node caching

The number of entries in the rsrc node cache is limited to 512, which
still seems unnecessarily large. Add per cache thresholds and set to
to 32 for the rsrc node cache.

Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/io_uring_types.h | 1 +
io_uring/alloc_cache.h | 6 ++++--
io_uring/io_uring.c | 9 ++++++---
io_uring/rsrc.h | 2 ++
4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5d772e36e7fc..4a6ce03a4903 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -190,6 +190,7 @@ struct io_ev_fd {
struct io_alloc_cache {
struct io_wq_work_node list;
unsigned int nr_cached;
+ unsigned int max_cached;
size_t elem_size;
};

diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 2fbecaa3a1ba..851a527afb5e 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -13,7 +13,7 @@ struct io_cache_entry {
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
struct io_cache_entry *entry)
{
- if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
+ if (cache->nr_cached < cache->max_cached) {
cache->nr_cached++;
wq_stack_add_head(&entry->node, &cache->list);
/* KASAN poisons object */
@@ -38,10 +38,12 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c
return NULL;
}

-static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size)
+static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
+ unsigned max_nr, size_t size)
{
cache->list.next = NULL;
cache->nr_cached = 0;
+ cache->max_cached = max_nr;
cache->elem_size = size;
}

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index da36fa1eeac9..ae90d2753e0d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -310,9 +310,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
- io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node));
- io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
- io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
+ io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
+ sizeof(struct io_rsrc_node));
+ io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct async_poll));
+ io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct io_async_msghdr));
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 7ab9b2b2e757..8729f2fee256 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -6,6 +6,8 @@

#include "alloc_cache.h"

+#define IO_NODE_ALLOC_CACHE_MAX 32
+
#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
--
2.39.1

2023-04-04 12:44:59

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 09/13] io_uring/rsrc: don't offload node free

struct delayed_work rsrc_put_work was previously used to offload node
freeing because io_rsrc_node_ref_zero() was previously called by RCU in
the IRQ context. Now, as percpu refcounting is gone, we can do it
eagerly at the spot without pushing it to a worker.

Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/io_uring_types.h | 3 --
io_uring/io_uring.c | 6 ----
io_uring/rsrc.c | 59 +++-------------------------------
3 files changed, 4 insertions(+), 64 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 9492889f00c0..47496059e13a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -330,9 +330,6 @@ struct io_ring_ctx {
struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data;

- struct delayed_work rsrc_put_work;
- struct callback_head rsrc_put_tw;
- struct llist_head rsrc_put_llist;
/* protected by ->uring_lock */
struct list_head rsrc_ref_list;

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 764df5694d73..d6a0025afc31 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -326,9 +326,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
- INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
- init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw);
- init_llist_head(&ctx->rsrc_put_llist);
init_llist_head(&ctx->work_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
@@ -2821,11 +2818,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_rsrc_node_destroy(ctx->rsrc_node);
if (ctx->rsrc_backup_node)
io_rsrc_node_destroy(ctx->rsrc_backup_node);
- flush_delayed_work(&ctx->rsrc_put_work);
- flush_delayed_work(&ctx->fallback_work);

WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
- WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 9647c02be0dc..77cb2f8cfd68 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -145,15 +145,8 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data,
{
struct io_ring_ctx *ctx = rsrc_data->ctx;

- if (prsrc->tag) {
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- mutex_lock(&ctx->uring_lock);
- io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
- mutex_unlock(&ctx->uring_lock);
- } else {
- io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
- }
- }
+ if (prsrc->tag)
+ io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
rsrc_data->do_put(ctx, prsrc);
}

@@ -176,32 +169,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
complete(&rsrc_data->done);
}

-void io_rsrc_put_work(struct work_struct *work)
-{
- struct io_ring_ctx *ctx;
- struct llist_node *node;
-
- ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
- node = llist_del_all(&ctx->rsrc_put_llist);
-
- while (node) {
- struct io_rsrc_node *ref_node;
- struct llist_node *next = node->next;
-
- ref_node = llist_entry(node, struct io_rsrc_node, llist);
- __io_rsrc_put_work(ref_node);
- node = next;
- }
-}
-
-void io_rsrc_put_tw(struct callback_head *cb)
-{
- struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
- rsrc_put_tw);
-
- io_rsrc_put_work(&ctx->rsrc_put_work.work);
-}
-
void io_wait_rsrc_data(struct io_rsrc_data *data)
{
if (data && !atomic_dec_and_test(&data->refs))
@@ -217,34 +184,18 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
__must_hold(&node->rsrc_data->ctx->uring_lock)
{
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
- bool first_add = false;
- unsigned long delay = HZ;

node->done = true;
-
- /* if we are mid-quiesce then do not delay */
- if (node->rsrc_data->quiesce)
- delay = 0;
-
while (!list_empty(&ctx->rsrc_ref_list)) {
node = list_first_entry(&ctx->rsrc_ref_list,
struct io_rsrc_node, node);
/* recycle ref nodes in order */
if (!node->done)
break;
- list_del(&node->node);
- first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
- }

- if (!first_add)
- return;
-
- if (ctx->submitter_task) {
- if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw,
- ctx->notify_method))
- return;
+ list_del(&node->node);
+ __io_rsrc_put_work(node);
}
- mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

static struct io_rsrc_node *io_rsrc_node_alloc(void)
@@ -320,13 +271,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
if (ret < 0) {
atomic_inc(&data->refs);
/* wait for all works potentially completing data->done */
- flush_delayed_work(&ctx->rsrc_put_work);
reinit_completion(&data->done);
mutex_lock(&ctx->uring_lock);
break;
}

- flush_delayed_work(&ctx->rsrc_put_work);
ret = wait_for_completion_interruptible(&data->done);
if (!ret) {
mutex_lock(&ctx->uring_lock);
--
2.39.1

2023-04-04 12:45:16

by Pavel Begunkov

[permalink] [raw]
Subject: [PATCH v2 12/13] io_uring/rsrc: optimise io_rsrc_data refcounting

Every struct io_rsrc_node takes a struct io_rsrc_data reference, which
means all rsrc updates do 2 extra atomics. Replace atomics refcounting
with a int as it's all done under ->uring_lock.

Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/rsrc.c | 30 ++++++++++++++++++------------
io_uring/rsrc.h | 2 +-
2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 95edc5f73204..74e13230fa0c 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -31,6 +31,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)

+static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data)
+{
+ return !--rsrc_data->refs;
+}
+
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
@@ -165,13 +170,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
}

io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
- if (atomic_dec_and_test(&rsrc_data->refs))
+ if (io_put_rsrc_data_ref(rsrc_data))
complete(&rsrc_data->done);
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
- if (data && !atomic_dec_and_test(&data->refs))
+ if (data && !io_put_rsrc_data_ref(data))
wait_for_completion(&data->done);
}

@@ -234,7 +239,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
rsrc_node->rsrc_data = data_to_kill;
list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);

- atomic_inc(&data_to_kill->refs);
+ data_to_kill->refs++;
/* put master ref */
io_put_rsrc_node(ctx, rsrc_node);
ctx->rsrc_node = NULL;
@@ -267,8 +272,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
return ret;
io_rsrc_node_switch(ctx, data);

- /* kill initial ref, already quiesced if zero */
- if (atomic_dec_and_test(&data->refs))
+ /* kill initial ref */
+ if (io_put_rsrc_data_ref(data))
return 0;

data->quiesce = true;
@@ -276,17 +281,19 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
do {
ret = io_run_task_work_sig(ctx);
if (ret < 0) {
- atomic_inc(&data->refs);
- /* wait for all works potentially completing data->done */
- reinit_completion(&data->done);
mutex_lock(&ctx->uring_lock);
+ if (!data->refs) {
+ ret = 0;
+ } else {
+ /* restore the master reference */
+ data->refs++;
+ }
break;
}
-
ret = wait_for_completion_interruptible(&data->done);
if (!ret) {
mutex_lock(&ctx->uring_lock);
- if (atomic_read(&data->refs) <= 0)
+ if (!data->refs)
break;
/*
* it has been revived by another thread while
@@ -361,6 +368,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
data->nr = nr;
data->ctx = ctx;
data->do_put = do_put;
+ data->refs = 1;
if (utags) {
ret = -EFAULT;
for (i = 0; i < nr; i++) {
@@ -371,8 +379,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
goto fail;
}
}
-
- atomic_set(&data->refs, 1);
init_completion(&data->done);
*pdata = data;
return 0;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index cf24c3fd701f..7ab9b2b2e757 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -33,8 +33,8 @@ struct io_rsrc_data {
u64 **tags;
unsigned int nr;
rsrc_put_fn *do_put;
- atomic_t refs;
struct completion done;
+ int refs;
bool quiesce;
};

--
2.39.1

2023-04-04 15:35:39

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH v2 00/13] optimise registered buffer/file updates

On 4/4/23 6:39?AM, Pavel Begunkov wrote:
> The patchset optimises registered files and buffers updates / removals,
> The rsrc-update-bench test showes 11x improvement (1040K -> 11468K
> updates / sec). It also improves latency by eliminating rcu grace
> period waiting and bouncing it to another worker, and reduces
> memory footprint by removing percpu refs.
>
> That's quite important for apps updating files/buffers with medium or
> higher frequency as updates are slow and expensive, and it currently
> takes quite a number of IO requests per update to make using fixed
> files/buffers worthwhile.
>
> Another upside is that it makes it simpler, patch 9 removes very
> convoluted synchronisation via flush_delayed_work() from the quiesce
> path.

Ran this on the big box. Stock kernel is 6.3-rc5 + for-6.4/io_uring, and
patched is same kernel with this patchset applied.

Test Kernel Ops
---------------------------------------------------------
CPU0 rsrc-update-bench Stock 165670
CPU0 rsrc-update-bench Stock 166412
rsrc-update-bench Stock 213411
rsrc-update-bench Stock 208995

CPU0 rsrc-update-bench Patched 10890297
CPU0 rsrc-update-bench Patched 10451699
rsrc-update-bench Patched 10793148
rsrc-update-bench Patched 10934918

which is just ridicolous. It's ~64x faster pinned, and ~51x faster not
pinned.

On top of that, it's a nice cleanup too and reduction in complexity.

--
Jens Axboe

2023-04-04 15:36:35

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH v2 00/13] optimise registered buffer/file updates


On Tue, 04 Apr 2023 13:39:44 +0100, Pavel Begunkov wrote:
> The patchset optimises registered files and buffers updates / removals,
> The rsrc-update-bench test showes 11x improvement (1040K -> 11468K
> updates / sec). It also improves latency by eliminating rcu grace
> period waiting and bouncing it to another worker, and reduces
> memory footprint by removing percpu refs.
>
> That's quite important for apps updating files/buffers with medium or
> higher frequency as updates are slow and expensive, and it currently
> takes quite a number of IO requests per update to make using fixed
> files/buffers worthwhile.
>
> [...]

Applied, thanks!

[01/13] io_uring/rsrc: use non-pcpu refcounts for nodes
commit: b8fb5b4fdd67f9d18109c5d21d44a8bd4ddb608b
[02/13] io_uring/rsrc: keep cached refs per node
commit: 8e15c0e71b8ae64fb7163532860f8d608165281f
[03/13] io_uring: don't put nodes under spinlocks
commit: 2ad4c6d08018e4eec130c29992028dc356ab2181
[04/13] io_uring: io_free_req() via tw
commit: 03adabe81abb20221079b48343783b4327bd1186
[05/13] io_uring/rsrc: protect node refs with uring_lock
commit: ef8ae64ffa9578c12e44de42604004c2cc3e9c27
[06/13] io_uring/rsrc: kill rsrc_ref_lock
commit: 0a4813b1abdf06e44ce60cdebfd374cfd27c46bf
[07/13] io_uring/rsrc: rename rsrc_list
commit: c824986c113f15e2ef2c00da9a226c09ecaac74c
[08/13] io_uring/rsrc: optimise io_rsrc_put allocation
commit: ff7c75ecaa9e6b251f76c24e289d4bfe413ffe31
[09/13] io_uring/rsrc: don't offload node free
commit: 36b9818a5a84cb7c977fb723babca1c8d74f288f
[10/13] io_uring/rsrc: cache struct io_rsrc_node
commit: 9eae8655f9cd2eeed99fb7a0d2bb22816c17e497
[11/13] io_uring/rsrc: add lockdep sanity checks
commit: 1f2c8f610aa6c6a3dc3523f93eaf28c25051df6f
[12/13] io_uring/rsrc: optimise io_rsrc_data refcounting
commit: 757ef4682b6aa29fdf752ad47f0d63eb48b261cf
[13/13] io_uring/rsrc: add custom limit for node caching
commit: 69bbc6ade9d9d4e3c556cb83e77b6f3cd9ad3d18

Best regards,
--
Jens Axboe