LinuxLists.cc - [PATCH v3] io_uring: releasing CPU resources when polling

2024-05-08 01:17:48

Subject: [PATCH v3] io_uring: releasing CPU resources when polling

This patch is intended to release the CPU resources of io_uring in
polling mode. When IO is issued, the program immediately polls to
check for completion, which is a waste of CPU resources while the IO
commands are still being executed on the disk.

I add a hybrid polling feature to io_uring, which enables polling to
release a portion of CPU resources without affecting the block layer.

- Record the running time and context switching time of each
IO, and use these times to determine whether the process should
continue to schedule.

- Adaptive adjustment to different devices. Because the times are
recorded in real time and each device's IO processing speed is
different, the CPU optimization effect will vary.

- Add an interface (ctx->flags) that enables the application to choose
whether or not to use this feature.

The CPU optimization of the patch under peak workload was tested as follows:
set 8 poll queues;
the CPU utilization of original polling is 100% per CPU; after
optimization, the CPU utilization drops a lot (per CPU):

read(128k, QD64, 1Job) 37% write(128k, QD64, 1Job) 40%
randread(4k, QD64, 16Job) 52% randwrite(4k, QD64, 16Job) 12%

Compared to original polling, the performance reduction of the optimised
version under peak workload is within 1%.

read 0.29% write 0.51% randread 0.09% randwrite 0%

Signed-off-by: hexue <[email protected]>

---

changes:
v2:
- extend hybrid poll to async polled io

v1:
- initial version
---
include/linux/io_uring_types.h | 14 ++++
include/uapi/linux/io_uring.h | 1 +
io_uring/io_uring.c | 4 +-
io_uring/io_uring.h | 3 +
io_uring/rw.c | 115 ++++++++++++++++++++++++++++++++-
5 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 854ad67a5f70..3a75b9904326 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -224,6 +224,11 @@ struct io_alloc_cache {
size_t elem_size;
};

+struct iopoll_info {
+ long last_runtime;
+ long last_irqtime;
+};
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -421,6 +426,7 @@ struct io_ring_ctx {
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
+ struct xarray poll_array;
};

struct io_tw_state {
@@ -571,6 +577,12 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
)
#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)

+struct hy_poll_time {
+ int poll_state;
+ struct timespec64 iopoll_start;
+ struct timespec64 iopoll_end;
+};
+
struct io_kiocb {
union {
/*
@@ -641,6 +653,8 @@ struct io_kiocb {
u64 extra1;
u64 extra2;
} big_cqe;
+ /* for hybrid iopoll */
+ struct hy_poll_time *hy_poll;
};

struct io_overflow_cqe {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7a673b52827b..0038cdfec18f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -198,6 +198,7 @@ enum {
* Removes indirection through the SQ index array.
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
+#define IORING_SETUP_HY_POLL (1U << 17)

enum io_uring_op {
IORING_OP_NOP,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cd9a137ad6ce..2c14768bbe27 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -311,6 +311,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;

ctx->flags = p->flags;
+ xa_init(&ctx->poll_array);
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
@@ -2921,6 +2922,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
+ xa_destroy(&ctx->poll_array);
kfree(ctx);
}

@@ -4050,7 +4052,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
- IORING_SETUP_NO_SQARRAY))
+ IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
return -EINVAL;

return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index d5495710c178..72d6a4c3b46d 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -125,6 +125,9 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
__io_req_task_work_add(req, 0);
}

+/* if the sleep time is less than 1us, do not perform the schedule op */
+#define MIN_SCHETIME 1000
+
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index d5e79d9bdc71..29c7ce23ed71 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -713,6 +713,46 @@ static bool need_complete_io(struct io_kiocb *req)
S_ISBLK(file_inode(req->file)->i_mode);
}

+void init_hybrid_poll(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ /*
+ * With concurrent IO, a thread may operate on several files
+ * under different file systems, so the inode numbers may be
+ * duplicated. Each device has a different IO command processing
+ * capability, so the device number is used to record the running
+ * time of each device
+ */
+ u32 index = req->file->f_inode->i_rdev;
+ struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+ struct hy_poll_time *hpt = kmalloc(sizeof(struct hy_poll_time), GFP_KERNEL);
+
+ /* if alloc fail, go to regular poll */
+ if (!hpt) {
+ ctx->flags &= ~IORING_SETUP_HY_POLL;
+ return;
+ }
+ hpt->poll_state = 0;
+ req->hy_poll = hpt;
+
+ if (!entry) {
+ entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
+ if (!entry) {
+ ctx->flags &= ~IORING_SETUP_HY_POLL;
+ return;
+ }
+ entry->last_runtime = 0;
+ entry->last_irqtime = 0;
+ xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
+ }
+
+ /*
+ * Here we need nanosecond timestamps, some ways of reading
+ * timestamps directly are only accurate to microseconds, so
+ * there's no better alternative here for now
+ */
+ ktime_get_ts64(&hpt->iopoll_start);
+}
+
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -750,6 +790,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
+ if (ctx->flags & IORING_SETUP_HY_POLL)
+ init_hybrid_poll(ctx, req);
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -1118,6 +1160,75 @@ void io_rw_fail(struct io_kiocb *req)
io_req_set_res(req, res, req->cqe.flags);
}

+void io_delay(struct hy_poll_time *hpt, struct iopoll_info *entry)
+{
+ struct hrtimer_sleeper timer;
+ struct timespec64 tc, oldtc;
+ enum hrtimer_mode mode;
+ ktime_t kt;
+ long sleep_ti;
+
+ if (hpt->poll_state == 1)
+ return;
+
+ if (entry->last_runtime <= entry->last_irqtime)
+ return;
+
+ /*
+ * Avoid excessive scheduling time affecting performance
+ * by using only 25 per cent of the remaining time
+ */
+ sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4;
+
+ /*
+ * If the time available for sleep is too short, i.e. the
+ * total running time and the context switching loss time
+ * are very close to each other, the scheduling operation
+ * is not performed to avoid increasing latency
+ */
+ if (sleep_ti < MIN_SCHETIME)
+ return;
+
+ ktime_get_ts64(&oldtc);
+ kt = ktime_set(0, sleep_ti);
+ hpt->poll_state = 1;
+
+ mode = HRTIMER_MODE_REL;
+ hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires(&timer.timer, kt);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ hrtimer_sleeper_start_expires(&timer, mode);
+
+ if (timer.task)
+ io_schedule();
+
+ hrtimer_cancel(&timer.timer);
+ mode = HRTIMER_MODE_ABS;
+ __set_current_state(TASK_RUNNING);
+ destroy_hrtimer_on_stack(&timer.timer);
+
+ ktime_get_ts64(&tc);
+ entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti;
+}
+
+int io_uring_hybrid_poll(struct io_kiocb *req,
+ struct io_comp_batch *iob, unsigned int poll_flags)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct hy_poll_time *hpt = req->hy_poll;
+ u32 index = req->file->f_inode->i_rdev;
+ struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+ int ret;
+
+ io_delay(hpt, entry);
+ ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+
+ ktime_get_ts64(&hpt->iopoll_end);
+ entry->last_runtime = hpt->iopoll_end.tv_nsec - hpt->iopoll_start.tv_nsec;
+ return ret;
+}
+
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
struct io_wq_work_node *pos, *start, *prev;
@@ -1145,7 +1256,9 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (READ_ONCE(req->iopoll_completed))
break;

- if (req->opcode == IORING_OP_URING_CMD) {
+ if (ctx->flags & IORING_SETUP_HY_POLL) {
+ ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+ } else if (req->opcode == IORING_OP_URING_CMD) {
struct io_uring_cmd *ioucmd;

ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
--
2.40.1

2024-05-13 03:21:45

by hexue

[permalink] [raw]

Subject: Re: [PATCH v3] io_uring: releasing CPU resources when polling

On 5/7/24 17:16, hexue wrote:
>This patch is intended to release the CPU resources of io_uring in
>polling mode. When IO is issued, the program immediately polls for
>check completion, which is a waste of CPU resources when IO commands
>are executed on the disk.
>
>I add the hybrid polling feature in io_uring, enables polling to
>release a portion of CPU resources without affecting block layer.
>
>- Record the running time and context switching time of each
> IO, and use these time to determine whether a process continue
> to schedule.
>
>- Adaptive adjustment to different devices. Due to the real-time
> nature of time recording, each device's IO processing speed is
> different, so the CPU optimization effect will vary.
>
>- Set a interface (ctx->flag) enables application to choose whether
> or not to use this feature.
>
>The CPU optimization in peak workload of patch is tested as follows:
> set 8 poll queues
> all CPU utilization of original polling is 100% for per CPU, after
> optimization, the CPU utilization drop a lot (per CPU);
>
> read(128k, QD64, 1Job) 37% write(128k, QD64, 1Job) 40%
> randread(4k, QD64, 16Job) 52% randwrite(4k, QD64, 16Job) 12%
>
> Compared to original polling, the optimised performance reduction
> with peak workload within 1%.
>
> read 0.29% write 0.51% randread 0.09% randwrite 0%
>
>Signed-off-by: hexue <[email protected]>
>
>---
>
>changes:
>v2:
> - extend hybrid poll to async polled io
>
>v1:
> - initial version
>---
> include/linux/io_uring_types.h | 14 ++++
> include/uapi/linux/io_uring.h | 1 +
> io_uring/io_uring.c | 4 +-
> io_uring/io_uring.h | 3 +
> io_uring/rw.c | 115 ++++++++++++++++++++++++++++++++-
> 5 files changed, 135 insertions(+), 2 deletions(-)
>
>diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>index 854ad67a5f70..3a75b9904326 100644
>--- a/include/linux/io_uring_types.h
>+++ b/include/linux/io_uring_types.h
>@@ -224,6 +224,11 @@ struct io_alloc_cache {
> size_t elem_size;
> };
>
>+struct iopoll_info {
>+ long last_runtime;
>+ long last_irqtime;
>+};
>+
> struct io_ring_ctx {
> /* const or read-mostly hot data */
> struct {
>@@ -421,6 +426,7 @@ struct io_ring_ctx {
> unsigned short n_sqe_pages;
> struct page **ring_pages;
> struct page **sqe_pages;
>+ struct xarray poll_array;
> };
>
> struct io_tw_state {
>@@ -571,6 +577,12 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
> )
> #define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)
>
>+struct hy_poll_time {
>+ int poll_state;
>+ struct timespec64 iopoll_start;
>+ struct timespec64 iopoll_end;
>+};
>+
> struct io_kiocb {
> union {
> /*
>@@ -641,6 +653,8 @@ struct io_kiocb {
> u64 extra1;
> u64 extra2;
> } big_cqe;
>+ /* for hybrid iopoll */
>+ struct hy_poll_time *hy_poll;
> };
>
> struct io_overflow_cqe {
>diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>index 7a673b52827b..0038cdfec18f 100644
>--- a/include/uapi/linux/io_uring.h
>+++ b/include/uapi/linux/io_uring.h
>@@ -198,6 +198,7 @@ enum {
> * Removes indirection through the SQ index array.
> */
> #define IORING_SETUP_NO_SQARRAY (1U << 16)
>+#define IORING_SETUP_HY_POLL (1U << 17)
>
> enum io_uring_op {
> IORING_OP_NOP,
>diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>index cd9a137ad6ce..2c14768bbe27 100644
>--- a/io_uring/io_uring.c
>+++ b/io_uring/io_uring.c
>@@ -311,6 +311,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> goto err;
>
> ctx->flags = p->flags;
>+ xa_init(&ctx->poll_array);
> atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
> init_waitqueue_head(&ctx->sqo_sq_wait);
> INIT_LIST_HEAD(&ctx->sqd_list);
>@@ -2921,6 +2922,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
> kfree(ctx->cancel_table_locked.hbs);
> kfree(ctx->io_bl);
> xa_destroy(&ctx->io_bl_xa);
>+ xa_destroy(&ctx->poll_array);
> kfree(ctx);
> }
>
>@@ -4050,7 +4052,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
> IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
> IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
> IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
>- IORING_SETUP_NO_SQARRAY))
>+ IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
> return -EINVAL;
>
> return io_uring_create(entries, &p, params);
>diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
>index d5495710c178..72d6a4c3b46d 100644
>--- a/io_uring/io_uring.h
>+++ b/io_uring/io_uring.h
>@@ -125,6 +125,9 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
> __io_req_task_work_add(req, 0);
> }
>
>+/* if sleep time less than 1us, then do not do the schedule op */
>+#define MIN_SCHETIME 1000
>+
> #define io_for_each_link(pos, head) \
> for (pos = (head); pos; pos = pos->link)
>
>diff --git a/io_uring/rw.c b/io_uring/rw.c
>index d5e79d9bdc71..29c7ce23ed71 100644
>--- a/io_uring/rw.c
>+++ b/io_uring/rw.c
>@@ -713,6 +713,46 @@ static bool need_complete_io(struct io_kiocb *req)
> S_ISBLK(file_inode(req->file)->i_mode);
> }
>
>+void init_hybrid_poll(struct io_ring_ctx *ctx, struct io_kiocb *req)
>+{
>+ /*
>+ * In multiple concurrency, a thread may operate several files
>+ * under different file systems, the inode numbers may be
>+ * duplicated. Each device has a different IO command processing
>+ * capability, so using device number to record the running time
>+ * of device
>+ */
>+ u32 index = req->file->f_inode->i_rdev;
>+ struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
>+ struct hy_poll_time *hpt = kmalloc(sizeof(struct hy_poll_time), GFP_KERNEL);
>+
>+ /* if alloc fail, go to regular poll */
>+ if (!hpt) {
>+ ctx->flags &= ~IORING_SETUP_HY_POLL;
>+ return;
>+ }
>+ hpt->poll_state = 0;
>+ req->hy_poll = hpt;
>+
>+ if (!entry) {
>+ entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
>+ if (!entry) {
>+ ctx->flags &= ~IORING_SETUP_HY_POLL;
>+ return;
>+ }
>+ entry->last_runtime = 0;
>+ entry->last_irqtime = 0;
>+ xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
>+ }
>+
>+ /*
>+ * Here we need nanosecond timestamps, some ways of reading
>+ * timestamps directly are only accurate to microseconds, so
>+ * there's no better alternative here for now
>+ */
>+ ktime_get_ts64(&hpt->iopoll_start);
>+}
>+
> static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
> {
> struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
>@@ -750,6 +790,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
> kiocb->ki_flags |= IOCB_HIPRI;
> kiocb->ki_complete = io_complete_rw_iopoll;
> req->iopoll_completed = 0;
>+ if (ctx->flags & IORING_SETUP_HY_POLL)
>+ init_hybrid_poll(ctx, req);
> } else {
> if (kiocb->ki_flags & IOCB_HIPRI)
> return -EINVAL;
>@@ -1118,6 +1160,75 @@ void io_rw_fail(struct io_kiocb *req)
> io_req_set_res(req, res, req->cqe.flags);
> }
>
>+void io_delay(struct hy_poll_time *hpt, struct iopoll_info *entry)
>+{
>+ struct hrtimer_sleeper timer;
>+ struct timespec64 tc, oldtc;
>+ enum hrtimer_mode mode;
>+ ktime_t kt;
>+ long sleep_ti;
>+
>+ if (hpt->poll_state == 1)
>+ return;
>+
>+ if (entry->last_runtime <= entry->last_irqtime)
>+ return;
>+
>+ /*
>+ * Avoid excessive scheduling time affecting performance
>+ * by using only 25 per cent of the remaining time
>+ */
>+ sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4;
>+
>+ /*
>+ * If the time available for sleep is too short, i.e. the
>+ * totle running time and the context switching loss time
>+ * are very close to each other, the scheduling operation
>+ * is not performed to avoid increasing latency
>+ */
>+ if (sleep_ti < MIN_SCHETIME)
>+ return;
>+
>+ ktime_get_ts64(&oldtc);
>+ kt = ktime_set(0, sleep_ti);
>+ hpt->poll_state = 1;
>+
>+ mode = HRTIMER_MODE_REL;
>+ hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
>+ hrtimer_set_expires(&timer.timer, kt);
>+ set_current_state(TASK_UNINTERRUPTIBLE);
>+ hrtimer_sleeper_start_expires(&timer, mode);
>+
>+ if (timer.task)
>+ io_schedule();
>+
>+ hrtimer_cancel(&timer.timer);
>+ mode = HRTIMER_MODE_ABS;
>+ __set_current_state(TASK_RUNNING);
>+ destroy_hrtimer_on_stack(&timer.timer);
>+
>+ ktime_get_ts64(&tc);
>+ entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti;
>+}
>+
>+int io_uring_hybrid_poll(struct io_kiocb *req,
>+ struct io_comp_batch *iob, unsigned int poll_flags)
>+{
>+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
>+ struct io_ring_ctx *ctx = req->ctx;
>+ struct hy_poll_time *hpt = req->hy_poll;
>+ u32 index = req->file->f_inode->i_rdev;
>+ struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
>+ int ret;
>+
>+ io_delay(hpt, entry);
>+ ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
>+
>+ ktime_get_ts64(&hpt->iopoll_end);
>+ entry->last_runtime = hpt->iopoll_end.tv_nsec - hpt->iopoll_start.tv_nsec;
>+ return ret;
>+}
>+
> int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> {
> struct io_wq_work_node *pos, *start, *prev;
>@@ -1145,7 +1256,9 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> if (READ_ONCE(req->iopoll_completed))
> break;
>
>- if (req->opcode == IORING_OP_URING_CMD) {
>+ if (ctx->flags & IORING_SETUP_HY_POLL) {
>+ ret = io_uring_hybrid_poll(req, &iob, poll_flags);
>+ } else if (req->opcode == IORING_OP_URING_CMD) {
> struct io_uring_cmd *ioucmd;
>
> ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);

Hi, Jens
I have revised some of the code according to your suggestions,
and added comments to the parts that were not modified.
Do you have any other comments?

--

Xue He