2022-02-19 18:43:19

by Olivier Langlois

Subject: [PATCH v1] io_uring: Add support for napi_busy_poll

The sqpoll thread can be used to perform the napi busy poll in a
similar way to how it does io polling for file systems supporting direct
access that bypasses the page cache.

The other way that io_uring can be used for napi busy poll is by
calling io_uring_enter() to get events.

If the user specifies a timeout value, it is distributed between polling
and sleeping by using the system-wide setting
/proc/sys/net/core/busy_poll.
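
For illustration, a minimal userspace sketch of the non-sqpoll path
(hypothetical, liburing-based, not part of this patch) might look like the
following, assuming a socket whose NAPI ID has already been recorded by a
previous request; with /proc/sys/net/core/busy_poll set to e.g. 50, the
1 second wait below would be split into roughly 50 usec of busy polling
followed by a regular sleep for the remainder:

/* hypothetical example; error handling trimmed for brevity */
#include <liburing.h>
#include <stdio.h>

int wait_one_event(int sockfd, char *buf, unsigned int buflen)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        /* total budget; the kernel splits it per net.core.busy_poll */
        struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        int ret;

        io_uring_queue_init(8, &ring, 0);
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_recv(sqe, sockfd, buf, buflen, 0);
        io_uring_submit(&ring);

        /* io_uring_enter(GETEVENTS) path: busy poll first, then sleep */
        ret = io_uring_wait_cqe_timeout(&ring, &cqe, &ts);
        if (!ret) {
                printf("recv returned %d\n", cqe->res);
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return ret;
}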

Co-developed-by: Hao Xu <[email protected]>
Signed-off-by: Hao Xu <[email protected]>
Signed-off-by: Olivier Langlois <[email protected]>
---
fs/io_uring.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 192 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77b9c7e4793b..0ed06f024e79 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -63,6 +63,7 @@
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
+#include <net/busy_poll.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -395,6 +396,10 @@ struct io_ring_ctx {
struct list_head sqd_list;

unsigned long check_cq_overflow;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* used to track busy poll napi_id */
+ struct list_head napi_list;
+#endif

struct {
unsigned cached_cq_tail;
@@ -1464,6 +1469,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+ INIT_LIST_HEAD(&ctx->napi_list);
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -5398,6 +5404,111 @@ IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */

+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
+
+struct napi_entry {
+ struct list_head list;
+ unsigned int napi_id;
+ unsigned long timeout;
+};
+
+/*
+ * Add busy poll NAPI ID from sk.
+ */
+static void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
+{
+ unsigned int napi_id;
+ struct socket *sock;
+ struct sock *sk;
+ struct napi_entry *ne;
+
+ if (!net_busy_loop_on())
+ return;
+
+ sock = sock_from_file(file);
+ if (!sock)
+ return;
+
+ sk = sock->sk;
+ if (!sk)
+ return;
+
+ napi_id = READ_ONCE(sk->sk_napi_id);
+
+ /* Non-NAPI IDs can be rejected */
+ if (napi_id < MIN_NAPI_ID)
+ return;
+
+ list_for_each_entry(ne, &ctx->napi_list, list) {
+ if (ne->napi_id == napi_id) {
+ ne->timeout = jiffies + NAPI_TIMEOUT;
+ return;
+ }
+ }
+
+ ne = kmalloc(sizeof(*ne), GFP_KERNEL);
+ if (!ne)
+ return;
+
+ ne->napi_id = napi_id;
+ ne->timeout = jiffies + NAPI_TIMEOUT;
+ list_add_tail(&ne->list, &ctx->napi_list);
+}
+
+static inline void io_check_napi_entry_timeout(struct napi_entry *ne)
+{
+ if (time_after(jiffies, ne->timeout)) {
+ list_del(&ne->list);
+ kfree(ne);
+ }
+}
+
+/*
+ * Busy poll if globally on and supporting sockets found
+ */
+static bool io_napi_busy_loop(struct io_ring_ctx *ctx)
+{
+ struct napi_entry *ne, *n;
+
+ if (list_empty(&ctx->napi_list))
+ return false;
+
+ list_for_each_entry_safe(ne, n, &ctx->napi_list, list) {
+ napi_busy_loop(ne->napi_id, NULL, NULL, true,
+ BUSY_POLL_BUDGET);
+ io_check_napi_entry_timeout(ne);
+ }
+ return !list_empty(&ctx->napi_list);
+}
+
+static void io_free_napi_list(struct io_ring_ctx *ctx)
+{
+ while (!list_empty(&ctx->napi_list)) {
+ struct napi_entry *ne =
+ list_first_entry(&ctx->napi_list, struct napi_entry,
+ list);
+
+ list_del(&ne->list);
+ kfree(ne);
+ }
+}
+#else
+static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
+{
+}
+
+static inline bool io_napi_busy_loop(struct io_ring_ctx *ctx)
+{
+ return false;
+}
+
+static inline void io_free_napi_list(struct io_ring_ctx *ctx)
+{
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
@@ -5776,6 +5887,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
__io_poll_execute(req, mask);
return 0;
}
+ io_add_napi(req->file, req->ctx);

/*
* Release ownership. If someone tried to queue a tw while it was
@@ -7518,7 +7630,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
-
+ if (io_napi_busy_loop(ctx))
+ ++ret;
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
wake_up(&ctx->sqo_sq_wait);
if (creds)
@@ -7649,6 +7762,9 @@ struct io_wait_queue {
struct io_ring_ctx *ctx;
unsigned cq_tail;
unsigned nr_timeouts;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned busy_poll_to;
+#endif
};

static inline bool io_should_wake(struct io_wait_queue *iowq)
@@ -7709,6 +7825,67 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
return !*timeout ? -ETIME : 1;
}

+#ifdef CONFIG_NET_RX_BUSY_POLL
+static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
+ struct io_wait_queue *iowq)
+{
+ unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+ struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to);
+
+ if (timespec64_compare(ts, &pollto) > 0) {
+ *ts = timespec64_sub(*ts, pollto);
+ iowq->busy_poll_to = busy_poll_to;
+ } else {
+ iowq->busy_poll_to = timespec64_to_ns(ts) / 1000;
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+ }
+}
+
+static inline bool io_busy_loop_timeout(unsigned long start_time,
+ unsigned long bp_usec)
+{
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ }
+ return true;
+}
+
+static bool io_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct io_wait_queue *iowq = p;
+
+ return signal_pending(current) ||
+ io_should_wake(iowq) ||
+ io_busy_loop_timeout(start_time, iowq->busy_poll_to);
+}
+
+static void io_blocking_napi_busy_loop(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq)
+{
+ unsigned long start_time =
+ list_is_singular(&ctx->napi_list) ? 0 :
+ busy_loop_current_time();
+
+ do {
+ if (list_is_singular(&ctx->napi_list)) {
+ struct napi_entry *ne =
+ list_first_entry(&ctx->napi_list,
+ struct napi_entry, list);
+
+ napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq,
+ true, BUSY_POLL_BUDGET);
+ io_check_napi_entry_timeout(ne);
+ break;
+ }
+ } while (io_napi_busy_loop(ctx) &&
+ !io_busy_loop_end(iowq, start_time));
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -7729,12 +7906,20 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!io_run_task_work())
break;
} while (1);
-
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ iowq.busy_poll_to = 0;
+#endif
if (uts) {
struct timespec64 ts;

if (get_timespec64(&ts, uts))
return -EFAULT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (!(ctx->flags & IORING_SETUP_SQPOLL) &&
+ !list_empty(&ctx->napi_list)) {
+ io_adjust_busy_loop_timeout(&ts, &iowq);
+ }
+#endif
timeout = timespec64_to_jiffies(&ts);
}

@@ -7759,6 +7944,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;

trace_io_uring_cqring_wait(ctx, min_events);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (iowq.busy_poll_to)
+ io_blocking_napi_busy_loop(ctx, &iowq);
+#endif
do {
/* if we can't even flush overflow, don't wait for more */
if (!io_cqring_overflow_flush(ctx)) {
@@ -9440,6 +9629,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
+ io_free_napi_list(ctx);
mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
--
2.35.1


2022-02-20 04:20:14

by Olivier Langlois

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

One side effect that I have discovered from testing the napi_busy_poll
patch is that, despite improving the network timing of the threads
performing the busy poll, it degrades the networking performance of the
rest of the system.

I dedicate isolated CPUs to specific threads of my program. My kernel
is compiled with CONFIG_NO_HZ_FULL. One thing that I have never really
understood is why there are still kernel threads assigned to the
isolated CPUs.

$ CORENUM=2; ps -L -e -o pid,psr,cpu,cmd | grep -E
"^[[:space:]]+[[:digit:]]+[[:space:]]+${CORENUM}"
24 2 - [cpuhp/2]
25 2 - [idle_inject/2]
26 2 - [migration/2]
27 2 - [ksoftirqd/2]
28 2 - [kworker/2:0-events]
29 2 - [kworker/2:0H]
83 2 - [kworker/2:1-mm_percpu_wq]

It is very hard to keep the CPU 100% tickless if there are still tasks
assigned to isolated CPUs by the kernel.

This question isn't really answered anywhere AFAIK:
https://www.kernel.org/doc/html/latest/timers/no_hz.html
https://jeremyeder.com/2013/11/15/nohz_fullgodmode/

The threads running on their dedicated CPUs are the ones doing the
NAPI busy polling. Because of that, those CPUs' usage ramps up to 100%,
and running ping on the side now shows horrible numbers:

[2022-02-19 07:27:54] INFO SOCKPP/ping ping results for 10 loops:
0. 104.16.211.191 rtt min/avg/max/mdev = 9.926/34.987/80.048/17.016 ms
1. 104.16.212.191 rtt min/avg/max/mdev = 9.861/34.934/79.986/17.019 ms
2. 104.16.213.191 rtt min/avg/max/mdev = 9.876/34.949/79.965/16.997 ms
3. 104.16.214.191 rtt min/avg/max/mdev = 9.852/34.927/79.977/17.019 ms
4. 104.16.215.191 rtt min/avg/max/mdev = 9.869/34.943/79.958/16.997 ms

Doing this:
echo 990000 > /proc/sys/kernel/sched_rt_runtime_us

as instructed here:
https://www.kernel.org/doc/html/latest/scheduler/sched-rt-group.html

fixes the problem:

$ ping 104.16.211.191
PING 104.16.211.191 (104.16.211.191) 56(84) bytes of data.
64 bytes from 104.16.211.191: icmp_seq=1 ttl=62 time=1.05 ms
64 bytes from 104.16.211.191: icmp_seq=2 ttl=62 time=0.812 ms
64 bytes from 104.16.211.191: icmp_seq=3 ttl=62 time=0.864 ms
64 bytes from 104.16.211.191: icmp_seq=4 ttl=62 time=0.846 ms
64 bytes from 104.16.211.191: icmp_seq=5 ttl=62 time=1.23 ms
64 bytes from 104.16.211.191: icmp_seq=6 ttl=62 time=0.957 ms
64 bytes from 104.16.211.191: icmp_seq=7 ttl=62 time=1.10 ms
^C
--- 104.16.211.191 ping statistics ---
7 packets transmitted, 7 received, 0% packet loss, time 6230ms
rtt min/avg/max/mdev = 0.812/0.979/1.231/0.142 ms

If I were to guess, I would say that it is ksoftirqd on those CPUs that
is starving and not servicing the network packets, but I wish I had a
better understanding of what is really happening, and whether it would be
possible to keep those processors 100% dedicated to my tasks and have the
network softirqs handled somewhere else, so that I would not have to
tweak /proc/sys/kernel/sched_rt_runtime_us to fix the issue...

2022-02-20 19:51:40

by Jens Axboe

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On 2/19/22 2:42 PM, Olivier Langlois wrote:
> One side effect that I have discovered from testing the napi_busy_poll
> patch, despite improving the network timing of the threads performing
> the busy poll, it is the networking performance degradation that it has
> on the rest of the system.
>
> I dedicate isolated CPUS to specific threads of my program. My kernel
> is compiled with CONFIG_NO_HZ_FULL. One thing that I have never really
> understood is why there were still kernel threads assigned to the
> isolated CPUs.
>
> $ CORENUM=2; ps -L -e -o pid,psr,cpu,cmd | grep -E
> "^[[:space:]]+[[:digit:]]+[[:space:]]+${CORENUM}"
> 24 2 - [cpuhp/2]
> 25 2 - [idle_inject/2]
> 26 2 - [migration/2]
> 27 2 - [ksoftirqd/2]
> 28 2 - [kworker/2:0-events]
> 29 2 - [kworker/2:0H]
> 83 2 - [kworker/2:1-mm_percpu_wq]
>
> It is very hard to keep the CPU 100% tickless if there are still tasks
> assigned to isolated CPUs by the kernel.
>
> This question isn't really answered anywhere AFAIK:
> https://www.kernel.org/doc/html/latest/timers/no_hz.html
> https://jeremyeder.com/2013/11/15/nohz_fullgodmode/
>
> Those threads running on their dedicated CPUS are the ones doing the
> NAPI busy polling. Because of that, those CPUs usage ramp up to 100%
> and running ping on the side is now having horrible numbers:
>
> [2022-02-19 07:27:54] INFO SOCKPP/ping ping results for 10 loops:
> 0. 104.16.211.191 rtt min/avg/max/mdev = 9.926/34.987/80.048/17.016 ms
> 1. 104.16.212.191 rtt min/avg/max/mdev = 9.861/34.934/79.986/17.019 ms
> 2. 104.16.213.191 rtt min/avg/max/mdev = 9.876/34.949/79.965/16.997 ms
> 3. 104.16.214.191 rtt min/avg/max/mdev = 9.852/34.927/79.977/17.019 ms
> 4. 104.16.215.191 rtt min/avg/max/mdev = 9.869/34.943/79.958/16.997 ms
>
> Doing this:
> echo 990000 > /proc/sys/kernel/sched_rt_runtime_us
>
> as instructed here:
> https://www.kernel.org/doc/html/latest/scheduler/sched-rt-group.html
>
> fix the problem:
>
> $ ping 104.16.211.191
> PING 104.16.211.191 (104.16.211.191) 56(84) bytes of data.
> 64 bytes from 104.16.211.191: icmp_seq=1 ttl=62 time=1.05 ms
> 64 bytes from 104.16.211.191: icmp_seq=2 ttl=62 time=0.812 ms
> 64 bytes from 104.16.211.191: icmp_seq=3 ttl=62 time=0.864 ms
> 64 bytes from 104.16.211.191: icmp_seq=4 ttl=62 time=0.846 ms
> 64 bytes from 104.16.211.191: icmp_seq=5 ttl=62 time=1.23 ms
> 64 bytes from 104.16.211.191: icmp_seq=6 ttl=62 time=0.957 ms
> 64 bytes from 104.16.211.191: icmp_seq=7 ttl=62 time=1.10 ms
> ^C
> --- 104.16.211.191 ping statistics ---
> 7 packets transmitted, 7 received, 0% packet loss, time 6230ms
> rtt min/avg/max/mdev = 0.812/0.979/1.231/0.142 ms
>
> If I was to guess, I would say that it is ksoftirqd on those CPUs that
> is starving and is not servicing the network packets but I wish that I
> had a better understanding of what is really happening and know if it
> would be possible to keep 100% those processors dedicated to my tasks
> and have the network softirqs handled somewhere else to not have to
> tweak /proc/sys/kernel/sched_rt_runtime_us to fix the issue...

Outside of this, I was hoping to see some performance numbers in the
main patch. Sounds like you have them, can you share?

--
Jens Axboe

2022-02-21 03:46:32

by kernel test robot

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

Hi Olivier,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.17-rc4]
[cannot apply to next-20220217]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Olivier-Langlois/io_uring-Add-support-for-napi_busy_poll/20220220-190634
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 4f12b742eb2b3a850ac8be7dc4ed52976fc6cb0b
config: riscv-randconfig-r042-20220220 (https://download.01.org/0day-ci/archive/20220221/[email protected]/config)
compiler: riscv32-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/ad36ae938f354b0cd3b38716572385f710accdb0
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Olivier-Langlois/io_uring-Add-support-for-napi_busy_poll/20220220-190634
git checkout ad36ae938f354b0cd3b38716572385f710accdb0
# save the config file to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=riscv SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

riscv32-linux-ld: fs/io_uring.o: in function `.L0 ':
>> io_uring.c:(.text+0x8d04): undefined reference to `__divdi3'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]

2022-02-21 06:42:47

by Hao Xu

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On 2022/2/21 at 2:37 AM, Olivier Langlois wrote:
> On Sat, 2022-02-19 at 17:22 -0700, Jens Axboe wrote:
>>
>> Outside of this, I was hoping to see some performance numbers in the
>> main patch. Sounds like you have them, can you share?
>>
> Yes.
>
> It is not much. Only numbers from my application and it is far from
> being the best benchmark because the result can be influenced by
> multiple external factors.
>
> Beside addressing the race condition remaining inside io_cqring_wait()
> around napi_list for v2 patch, creating a benchmark program that
> isolate the performance of the new feature is on my todo list.
>
> I would think that creating a simple UDP ping-pong setup and measure

An echo-server may be a good choice.
> RTT with and without busy_polling should be a good enough test.
>
> In the meantime, here are the results that I have:
>
> Without io_uring busy poll:
> reaction time to an update: 17159usec
> reaction time to an update: 19068usec
> reaction time to an update: 23055usec
> reaction time to an update: 16511usec
> reaction time to an update: 17604usec
>
> With io_uring busy poll:
> reaction time to an update: 15782usec
> reaction time to an update: 15337usec
> reaction time to an update: 15379usec
> reaction time to an update: 15275usec
> reaction time to an update: 15107usec
>
> Concerning my latency issue with busy polling, I have found this that
> might help me:
> https://lwn.net/ml/netdev/[email protected]/

2022-02-21 09:03:09

by kernel test robot

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

Hi Olivier,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.17-rc4]
[cannot apply to next-20220217]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Olivier-Langlois/io_uring-Add-support-for-napi_busy_poll/20220220-190634
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 4f12b742eb2b3a850ac8be7dc4ed52976fc6cb0b
config: nds32-allnoconfig (https://download.01.org/0day-ci/archive/20220221/[email protected]/config)
compiler: nds32le-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/ad36ae938f354b0cd3b38716572385f710accdb0
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Olivier-Langlois/io_uring-Add-support-for-napi_busy_poll/20220220-190634
git checkout ad36ae938f354b0cd3b38716572385f710accdb0
# save the config file to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=nds32 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

fs/io_uring.c: In function 'io_ring_ctx_alloc':
>> fs/io_uring.c:1472:28: error: 'struct io_ring_ctx' has no member named 'napi_list'
1472 | INIT_LIST_HEAD(&ctx->napi_list);
| ^~
fs/io_uring.c: In function '__io_submit_flush_completions':
fs/io_uring.c:2529:40: warning: variable 'prev' set but not used [-Wunused-but-set-variable]
2529 | struct io_wq_work_node *node, *prev;
| ^~~~


vim +1472 fs/io_uring.c

1413
1414 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1415 {
1416 struct io_ring_ctx *ctx;
1417 int hash_bits;
1418
1419 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1420 if (!ctx)
1421 return NULL;
1422
1423 /*
1424 * Use 5 bits less than the max cq entries, that should give us around
1425 * 32 entries per hash list if totally full and uniformly spread.
1426 */
1427 hash_bits = ilog2(p->cq_entries);
1428 hash_bits -= 5;
1429 if (hash_bits <= 0)
1430 hash_bits = 1;
1431 ctx->cancel_hash_bits = hash_bits;
1432 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1433 GFP_KERNEL);
1434 if (!ctx->cancel_hash)
1435 goto err;
1436 __hash_init(ctx->cancel_hash, 1U << hash_bits);
1437
1438 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1439 if (!ctx->dummy_ubuf)
1440 goto err;
1441 /* set invalid range, so io_import_fixed() fails meeting it */
1442 ctx->dummy_ubuf->ubuf = -1UL;
1443
1444 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1445 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1446 goto err;
1447
1448 ctx->flags = p->flags;
1449 init_waitqueue_head(&ctx->sqo_sq_wait);
1450 INIT_LIST_HEAD(&ctx->sqd_list);
1451 INIT_LIST_HEAD(&ctx->cq_overflow_list);
1452 init_completion(&ctx->ref_comp);
1453 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1454 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1455 mutex_init(&ctx->uring_lock);
1456 init_waitqueue_head(&ctx->cq_wait);
1457 spin_lock_init(&ctx->completion_lock);
1458 spin_lock_init(&ctx->timeout_lock);
1459 INIT_WQ_LIST(&ctx->iopoll_list);
1460 INIT_LIST_HEAD(&ctx->defer_list);
1461 INIT_LIST_HEAD(&ctx->timeout_list);
1462 INIT_LIST_HEAD(&ctx->ltimeout_list);
1463 spin_lock_init(&ctx->rsrc_ref_lock);
1464 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1465 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1466 init_llist_head(&ctx->rsrc_put_llist);
1467 INIT_LIST_HEAD(&ctx->tctx_list);
1468 ctx->submit_state.free_list.next = NULL;
1469 INIT_WQ_LIST(&ctx->locked_free_list);
1470 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1471 INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
> 1472 INIT_LIST_HEAD(&ctx->napi_list);
1473 return ctx;
1474 err:
1475 kfree(ctx->dummy_ubuf);
1476 kfree(ctx->cancel_hash);
1477 kfree(ctx);
1478 return NULL;
1479 }
1480

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]

2022-02-21 09:39:32

by Jens Axboe

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On 2/20/22 11:37 AM, Olivier Langlois wrote:
> On Sat, 2022-02-19 at 17:22 -0700, Jens Axboe wrote:
>>
>> Outside of this, I was hoping to see some performance numbers in the
>> main patch. Sounds like you have them, can you share?
>>
> Yes.
>
> It is not much. Only numbers from my application and it is far from
> being the best benchmark because the result can be influenced by
> multiple external factors.
>
> Beside addressing the race condition remaining inside io_cqring_wait()
> around napi_list for v2 patch, creating a benchmark program that
> isolate the performance of the new feature is on my todo list.
>
> I would think that creating a simple UDP ping-pong setup and measure
> RTT with and without busy_polling should be a good enough test.

Yes, a separate targeted test like that would be very useful and
interesting indeed!

> In the meantime, here are the results that I have:
>
> Without io_uring busy poll:
> reaction time to an update: 17159usec
> reaction time to an update: 19068usec
> reaction time to an update: 23055usec
> reaction time to an update: 16511usec
> reaction time to an update: 17604usec
>
> With io_uring busy poll:
> reaction time to an update: 15782usec
> reaction time to an update: 15337usec
> reaction time to an update: 15379usec
> reaction time to an update: 15275usec
> reaction time to an update: 15107usec

OK, that's a pretty good improvement in both latency and
deviation/consistency. Is this using SQPOLL, or is it using polling off
cqring_wait from the task itself? Also something to consider for the
test benchmark app: it should be able to run both (which is usually just
setting the SETUP_SQPOLL flag or not, if done right).
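
A benchmark could toggle that with something like the following
(hypothetical liburing sketch; helper name and idle value are illustrative
only):

#include <liburing.h>

/* same ring setup for both modes, SQPOLL optional */
static int setup_ring(struct io_uring *ring, int use_sqpoll)
{
        struct io_uring_params p = { 0 };

        if (use_sqpoll) {
                p.flags |= IORING_SETUP_SQPOLL;
                p.sq_thread_idle = 2000;  /* ms before the sq thread sleeps */
        }
        return io_uring_queue_init_params(64, ring, &p);
}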

> Concerning my latency issue with busy polling, I have found this that
> might help me:
> https://lwn.net/ml/netdev/[email protected]/
>

--
Jens Axboe

2022-02-21 09:48:40

by kernel test robot

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

Hi Olivier,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.17-rc4]
[cannot apply to next-20220217]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Olivier-Langlois/io_uring-Add-support-for-napi_busy_poll/20220220-190634
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 4f12b742eb2b3a850ac8be7dc4ed52976fc6cb0b
config: arm64-randconfig-r002-20220220 (https://download.01.org/0day-ci/archive/20220221/[email protected]/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project d271fc04d5b97b12e6b797c6067d3c96a8d7470e)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install arm64 cross compiling tool for clang build
# apt-get install binutils-aarch64-linux-gnu
# https://github.com/0day-ci/linux/commit/ad36ae938f354b0cd3b38716572385f710accdb0
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Olivier-Langlois/io_uring-Add-support-for-napi_busy_poll/20220220-190634
git checkout ad36ae938f354b0cd3b38716572385f710accdb0
# save the config file to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

>> fs/io_uring.c:1472:23: error: no member named 'napi_list' in 'struct io_ring_ctx'
INIT_LIST_HEAD(&ctx->napi_list);
~~~ ^
1 error generated.


vim +1472 fs/io_uring.c

1413
1414 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1415 {
1416 struct io_ring_ctx *ctx;
1417 int hash_bits;
1418
1419 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1420 if (!ctx)
1421 return NULL;
1422
1423 /*
1424 * Use 5 bits less than the max cq entries, that should give us around
1425 * 32 entries per hash list if totally full and uniformly spread.
1426 */
1427 hash_bits = ilog2(p->cq_entries);
1428 hash_bits -= 5;
1429 if (hash_bits <= 0)
1430 hash_bits = 1;
1431 ctx->cancel_hash_bits = hash_bits;
1432 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1433 GFP_KERNEL);
1434 if (!ctx->cancel_hash)
1435 goto err;
1436 __hash_init(ctx->cancel_hash, 1U << hash_bits);
1437
1438 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1439 if (!ctx->dummy_ubuf)
1440 goto err;
1441 /* set invalid range, so io_import_fixed() fails meeting it */
1442 ctx->dummy_ubuf->ubuf = -1UL;
1443
1444 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1445 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1446 goto err;
1447
1448 ctx->flags = p->flags;
1449 init_waitqueue_head(&ctx->sqo_sq_wait);
1450 INIT_LIST_HEAD(&ctx->sqd_list);
1451 INIT_LIST_HEAD(&ctx->cq_overflow_list);
1452 init_completion(&ctx->ref_comp);
1453 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1454 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1455 mutex_init(&ctx->uring_lock);
1456 init_waitqueue_head(&ctx->cq_wait);
1457 spin_lock_init(&ctx->completion_lock);
1458 spin_lock_init(&ctx->timeout_lock);
1459 INIT_WQ_LIST(&ctx->iopoll_list);
1460 INIT_LIST_HEAD(&ctx->defer_list);
1461 INIT_LIST_HEAD(&ctx->timeout_list);
1462 INIT_LIST_HEAD(&ctx->ltimeout_list);
1463 spin_lock_init(&ctx->rsrc_ref_lock);
1464 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1465 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1466 init_llist_head(&ctx->rsrc_put_llist);
1467 INIT_LIST_HEAD(&ctx->tctx_list);
1468 ctx->submit_state.free_list.next = NULL;
1469 INIT_WQ_LIST(&ctx->locked_free_list);
1470 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1471 INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
> 1472 INIT_LIST_HEAD(&ctx->napi_list);
1473 return ctx;
1474 err:
1475 kfree(ctx->dummy_ubuf);
1476 kfree(ctx->cancel_hash);
1477 kfree(ctx);
1478 return NULL;
1479 }
1480

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]

2022-02-21 09:55:31

by Olivier Langlois

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On Sat, 2022-02-19 at 17:22 -0700, Jens Axboe wrote:
>
> Outside of this, I was hoping to see some performance numbers in the
> main patch. Sounds like you have them, can you share?
>
Yes.

It is not much. Only numbers from my application and it is far from
being the best benchmark because the result can be influenced by
multiple external factors.

Besides addressing the race condition remaining inside io_cqring_wait()
around napi_list for the v2 patch, creating a benchmark program that
isolates the performance of the new feature is on my todo list.

I would think that creating a simple UDP ping-pong setup and measuring
RTT with and without busy polling should be a good enough test.
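
The client side of such a UDP ping-pong could look roughly like the sketch
below (hypothetical, liburing-based; it assumes a connected UDP socket 'fd',
an echo server on the other end, and a ring created with or without
IORING_SETUP_SQPOLL; a timed wait is used so that the non-sqpoll busy poll
path in io_cqring_wait() gets exercised):

#include <liburing.h>
#include <stdio.h>
#include <time.h>

static void pingpong(struct io_uring *ring, int fd, int iterations)
{
        char txbuf[64] = "ping", rxbuf[64];
        struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct timespec t0, t1;
        int i, j;

        for (i = 0; i < iterations; i++) {
                clock_gettime(CLOCK_MONOTONIC, &t0);

                sqe = io_uring_get_sqe(ring);
                io_uring_prep_send(sqe, fd, txbuf, sizeof(txbuf), 0);
                sqe = io_uring_get_sqe(ring);
                io_uring_prep_recv(sqe, fd, rxbuf, sizeof(rxbuf), 0);
                io_uring_submit(ring);

                /* reap both completions; the recv wait is where the
                 * napi busy poll would kick in */
                for (j = 0; j < 2; j++) {
                        if (io_uring_wait_cqe_timeout(ring, &cqe, &ts))
                                return;
                        io_uring_cqe_seen(ring, cqe);
                }

                clock_gettime(CLOCK_MONOTONIC, &t1);
                printf("rtt: %ld ns\n",
                       (t1.tv_sec - t0.tv_sec) * 1000000000L +
                       (t1.tv_nsec - t0.tv_nsec));
        }
}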

In the meantime, here are the results that I have:

Without io_uring busy poll:
reaction time to an update: 17159usec
reaction time to an update: 19068usec
reaction time to an update: 23055usec
reaction time to an update: 16511usec
reaction time to an update: 17604usec

With io_uring busy poll:
reaction time to an update: 15782usec
reaction time to an update: 15337usec
reaction time to an update: 15379usec
reaction time to an update: 15275usec
reaction time to an update: 15107usec

Concerning my latency issue with busy polling, I have found this that
might help me:
https://lwn.net/ml/netdev/[email protected]/

2022-02-21 10:03:20

by Hao Xu

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On 2022/2/19 at 4:03 PM, Olivier Langlois wrote:
> The sqpoll thread can be used for performing the napi busy poll in a
> similar way that it does io polling for file systems supporting direct
> access bypassing the page cache.
>
> The other way that io_uring can be used for napi busy poll is by
> calling io_uring_enter() to get events.
>
> If the user specify a timeout value, it is distributed between polling
> and sleeping by using the systemwide setting
> /proc/sys/net/core/busy_poll.
>
> Co-developed-by: Hao Xu <[email protected]>
> Signed-off-by: Hao Xu <[email protected]>
> Signed-off-by: Olivier Langlois <[email protected]>
> ---
> fs/io_uring.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 192 insertions(+), 2 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 77b9c7e4793b..0ed06f024e79 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -63,6 +63,7 @@
> #include <net/sock.h>
> #include <net/af_unix.h>
> #include <net/scm.h>
> +#include <net/busy_poll.h>
> #include <linux/anon_inodes.h>
> #include <linux/sched/mm.h>
> #include <linux/uaccess.h>
> @@ -395,6 +396,10 @@ struct io_ring_ctx {
> struct list_head sqd_list;
>
> unsigned long check_cq_overflow;
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> + /* used to track busy poll napi_id */
> + struct list_head napi_list;
> +#endif
>
> struct {
> unsigned cached_cq_tail;
> @@ -1464,6 +1469,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> INIT_WQ_LIST(&ctx->locked_free_list);
> INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
> INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
> + INIT_LIST_HEAD(&ctx->napi_list);
> return ctx;
> err:
> kfree(ctx->dummy_ubuf);
> @@ -5398,6 +5404,111 @@ IO_NETOP_FN(send);
> IO_NETOP_FN(recv);
> #endif /* CONFIG_NET */
>
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> +
> +#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
> +
> +struct napi_entry {
> + struct list_head list;
> + unsigned int napi_id;
> + unsigned long timeout;
> +};
> +
> +/*
> + * Add busy poll NAPI ID from sk.
> + */
> +static void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
> +{
> + unsigned int napi_id;
> + struct socket *sock;
> + struct sock *sk;
> + struct napi_entry *ne;
> +
> + if (!net_busy_loop_on())
> + return;
> +
> + sock = sock_from_file(file);
> + if (!sock)
> + return;
> +
> + sk = sock->sk;
> + if (!sk)
> + return;
> +
> + napi_id = READ_ONCE(sk->sk_napi_id);
> +
> + /* Non-NAPI IDs can be rejected */
> + if (napi_id < MIN_NAPI_ID)
> + return;
> +
> + list_for_each_entry(ne, &ctx->napi_list, list) {
> + if (ne->napi_id == napi_id) {
> + ne->timeout = jiffies + NAPI_TIMEOUT;
> + return;
> + }
> + }
> +
> + ne = kmalloc(sizeof(*ne), GFP_KERNEL);
> + if (!ne)
> + return;
> +
> + ne->napi_id = napi_id;
> + ne->timeout = jiffies + NAPI_TIMEOUT;
> + list_add_tail(&ne->list, &ctx->napi_list);
> +}
> +
> +static inline void io_check_napi_entry_timeout(struct napi_entry *ne)
> +{
> + if (time_after(jiffies, ne->timeout)) {
> + list_del(&ne->list);
> + kfree(ne);
> + }
> +}
> +
> +/*
> + * Busy poll if globally on and supporting sockets found
> + */
> +static bool io_napi_busy_loop(struct io_ring_ctx *ctx)
> +{
> + struct napi_entry *ne, *n;
> +
> + if (list_empty(&ctx->napi_list))
> + return false;
> +
> + list_for_each_entry_safe(ne, n, &ctx->napi_list, list) {
> + napi_busy_loop(ne->napi_id, NULL, NULL, true,
> + BUSY_POLL_BUDGET);
> + io_check_napi_entry_timeout(ne);
> + }
> + return !list_empty(&ctx->napi_list);
> +}
> +
> +static void io_free_napi_list(struct io_ring_ctx *ctx)
> +{
> + while (!list_empty(&ctx->napi_list)) {
> + struct napi_entry *ne =
> + list_first_entry(&ctx->napi_list, struct napi_entry,
> + list);
> +
> + list_del(&ne->list);
> + kfree(ne);
> + }
> +}
> +#else
> +static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
> +{
> +}
> +
> +static inline bool io_napi_busy_loop(struct io_ring_ctx *ctx)
> +{
> + return false;
> +}
> +
> +static inline void io_free_napi_list(struct io_ring_ctx *ctx)
> +{
> +}
> +#endif /* CONFIG_NET_RX_BUSY_POLL */
> +
> struct io_poll_table {
> struct poll_table_struct pt;
> struct io_kiocb *req;
> @@ -5776,6 +5887,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
> __io_poll_execute(req, mask);
> return 0;
> }
> + io_add_napi(req->file, req->ctx);

I think this may not be the right place to do it. The process will be:
arm_poll sockfdA --> get invalid napi_id from sk->napi_id --> event
triggered --> arm_poll for sockfdA again --> get valid napi_id.
So why not do io_add_napi() in the event
handler (apoll_task_func/poll_task_func)?
>
> /*
> * Release ownership. If someone tried to queue a tw while it was
> @@ -7518,7 +7630,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
> !(ctx->flags & IORING_SETUP_R_DISABLED))
> ret = io_submit_sqes(ctx, to_submit);
> mutex_unlock(&ctx->uring_lock);
> -
> + if (io_napi_busy_loop(ctx))
> + ++ret;
> if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
> wake_up(&ctx->sqo_sq_wait);
> if (creds)
> @@ -7649,6 +7762,9 @@ struct io_wait_queue {
> struct io_ring_ctx *ctx;
> unsigned cq_tail;
> unsigned nr_timeouts;
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> + unsigned busy_poll_to;
> +#endif
> };
>
> static inline bool io_should_wake(struct io_wait_queue *iowq)
> @@ -7709,6 +7825,67 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
> return !*timeout ? -ETIME : 1;
> }
>
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> +static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
> + struct io_wait_queue *iowq)
> +{
> + unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
> + struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to);
> +
> + if (timespec64_compare(ts, &pollto) > 0) {
> + *ts = timespec64_sub(*ts, pollto);
> + iowq->busy_poll_to = busy_poll_to;
> + } else {
> + iowq->busy_poll_to = timespec64_to_ns(ts) / 1000;

How about timespec64_to_ns(ts) >> 10, since we don't need an accurate
number?
> + ts->tv_sec = 0;
> + ts->tv_nsec = 0;
> + }
> +}
> +
> +static inline bool io_busy_loop_timeout(unsigned long start_time,
> + unsigned long bp_usec)
> +{
> + if (bp_usec) {
> + unsigned long end_time = start_time + bp_usec;
> + unsigned long now = busy_loop_current_time();
> +
> + return time_after(now, end_time);
> + }
> + return true;
> +}
> +
> +static bool io_busy_loop_end(void *p, unsigned long start_time)
> +{
> + struct io_wait_queue *iowq = p;
> +
> + return signal_pending(current) ||
> + io_should_wake(iowq) ||
> + io_busy_loop_timeout(start_time, iowq->busy_poll_to);
> +}
> +
> +static void io_blocking_napi_busy_loop(struct io_ring_ctx *ctx,
> + struct io_wait_queue *iowq)
> +{
> + unsigned long start_time =
> + list_is_singular(&ctx->napi_list) ? 0 :
> + busy_loop_current_time();
> +
> + do {
> + if (list_is_singular(&ctx->napi_list)) {
> + struct napi_entry *ne =
> + list_first_entry(&ctx->napi_list,
> + struct napi_entry, list);
> +
> + napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq,
> + true, BUSY_POLL_BUDGET);
> + io_check_napi_entry_timeout(ne);
> + break;
> + }
> + } while (io_napi_busy_loop(ctx) &&

Why don't we set up the busy_loop_end callback for the normal (non-singular)
case? We can record the number of napi_entry entries and divide the time
frame among them.
> + !io_busy_loop_end(iowq, start_time));
> +}
> +#endif /* CONFIG_NET_RX_BUSY_POLL */
> +
> /*
> * Wait until events become available, if we don't already have some. The
> * application must reap them itself, as they reside on the shared cq ring.
> @@ -7729,12 +7906,20 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> if (!io_run_task_work())
> break;
> } while (1);
> -
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> + iowq.busy_poll_to = 0;
> +#endif
> if (uts) {
> struct timespec64 ts;
>
> if (get_timespec64(&ts, uts))
> return -EFAULT;
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> + if (!(ctx->flags & IORING_SETUP_SQPOLL) &&
> + !list_empty(&ctx->napi_list)) {
> + io_adjust_busy_loop_timeout(&ts, &iowq);
> + }
> +#endif
> timeout = timespec64_to_jiffies(&ts);
> }
>
> @@ -7759,6 +7944,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
>
> trace_io_uring_cqring_wait(ctx, min_events);
> +#ifdef CONFIG_NET_RX_BUSY_POLL
> + if (iowq.busy_poll_to)
> + io_blocking_napi_busy_loop(ctx, &iowq);

We may not need locks for the napi_list; the reason is that we don't need to
poll an accurate list, since the busy polling/NAPI itself is a kind of
speculation. So the deletion is not an emergency.
At the very least, we can probably delay the deletion to some safe place
like the original task's task work, though this may cause other problems...

Regards,
Hao
> +#endif
> do {
> /* if we can't even flush overflow, don't wait for more */
> if (!io_cqring_overflow_flush(ctx)) {
> @@ -9440,6 +9629,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
> __io_sqe_files_unregister(ctx);
> if (ctx->rings)
> __io_cqring_overflow_flush(ctx, true);
> + io_free_napi_list(ctx);
> mutex_unlock(&ctx->uring_lock);
> io_eventfd_unregister(ctx);
> io_destroy_buffers(ctx);

2022-02-22 04:34:32

by Olivier Langlois

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On Sun, 2022-02-20 at 12:38 -0700, Jens Axboe wrote:
>
> OK, that's a pretty good improvement in both latency and
> deviation/consistency. Is this using SQPOLL, or is it using polling
> off
> cqring_wait from the task itself? Also something to consider for the
> test benchmark app, should be able to run both (which is usually just
> setting the SETUP_SQPOLL flag or not, if done right).
>
>
The answer to your question is complex. This is one of the external
factors that I was referring to.

One thread is managing 49 TCP sockets. This thread's io_uring context is
configured with SQPOLL. Upon receiving a packet of interest, it will
wake up thread #2 with an eventfd installed into a private non-SQPOLL
io_uring context and will send a request to a 50th TCP socket.

Both threads are now busy polling NAPI. One from the SQPOLL code and
the other with the io_cqring_wait() code.

As if that was not enough, since I have discovered the benefits of busy
polling and that rescheduling a sleeping task takes about 5-10 usecs,
thread #1 is also busy polling io_uring instead of blocking in
io_uring_enter().

Thanks for suggesting that the benchmark be designed to test both
SQPOLL and non-SQPOLL busy polling. This is something that I already had
in mind.

I have completed 3 small improvements for the v2 patch. I need to check
the kernel test bot and Hao's comments to see if I have more to work on,
but if all is good, I only need to complete the benchmark program. I
might be able to send v2 later today.

Greetings,

2022-02-25 09:04:19

by Olivier Langlois

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On Mon, 2022-02-21 at 13:23 +0800, Hao Xu wrote:
> > @@ -5776,6 +5887,7 @@ static int __io_arm_poll_handler(struct
> > io_kiocb *req,
> >                 __io_poll_execute(req, mask);
> >                 return 0;
> >         }
> > +       io_add_napi(req->file, req->ctx);
>
> I think this may not be the right place to do it. the process will
> be:
> arm_poll sockfdA--> get invalid napi_id from sk->napi_id --> event
> triggered --> arm_poll for sockfdA again --> get valid napi_id
> then why not do io_add_napi() in event
> handler(apoll_task_func/poll_task_func).

You have a valid concern that the first time a socket is passed to
io_uring, the napi_id might not be assigned yet.

OTOH, getting it after data is available for reading does not help
either, since busy polling must be done before data is received.

For both places, the extracted napi_id will only be leveraged at the
next polling.

Your suggestion is superior because it might be the only working way
for MULTIPOLL requests.

However, I chose __io_arm_poll_handler() because, if napi_busy_poll()
is desired without a sqpoll thread, the context must be locked when
calling io_add_napi(). This is the case when __io_arm_poll_handler() is
called, AFAIK.

And I don't think that the context is locked when
apoll_task_func/poll_task_func are called.

I acknowledge that this is an issue that needs to be fixed, but right
now I am not sure how to address it, so let me share v2 of the patch
and plan a v3 for at least this pending issue.

> >
> > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > +static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
> > +                                       struct io_wait_queue *iowq)
> > +{
> > +       unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
> > +       struct timespec64 pollto = ns_to_timespec64(1000 *
> > (s64)busy_poll_to);
> > +
> > +       if (timespec64_compare(ts, &pollto) > 0) {
> > +               *ts = timespec64_sub(*ts, pollto);
> > +               iowq->busy_poll_to = busy_poll_to;
> > +       } else {
> > +               iowq->busy_poll_to = timespec64_to_ns(ts) / 1000;
>
> How about timespec64_tons(ts) >> 10, since we don't need accurate
> number.

Fantastic suggestion! The kernel test robot did also detect an issue
with that statement. I did discover do_div() in the meantime but what
you suggest is better, IMHO...

> > +static void io_blocking_napi_busy_loop(struct io_ring_ctx *ctx,
> > +                                      struct io_wait_queue *iowq)
> > +{
> > +       unsigned long start_time =
> > +               list_is_singular(&ctx->napi_list) ? 0 :
> > +               busy_loop_current_time();
> > +
> > +       do {
> > +               if (list_is_singular(&ctx->napi_list)) {
> > +                       struct napi_entry *ne =
> > +                               list_first_entry(&ctx->napi_list,
> > +                                                struct napi_entry,
> > list);
> > +
> > +                       napi_busy_loop(ne->napi_id,
> > io_busy_loop_end, iowq,
> > +                                      true, BUSY_POLL_BUDGET);
> > +                       io_check_napi_entry_timeout(ne);
> > +                       break;
> > +               }
> > +       } while (io_napi_busy_loop(ctx) &&
>
> Why don't we setup busy_loop_end callback for normal(non-singular)
> case,
> we can record the number of napi_entry, and divide the time frame to
> each entry.

This comes from the intuition that iterating through all the napi devices
in a 'sprinkler' pattern is the correct way to proceed when handling
several devices.

If you busy poll the first devices for a certain amount of time and a
packet is received on the last device, you won't know about it until you
reach that device, which will be much later than with the proposed
'sprinkler' way.

The singular case is treated differently because entering/exiting
napi_busy_loop() incurs setup overhead that you don't need for that
special case.

> > +                !io_busy_loop_end(iowq, start_time));
> > +}
> > +#endif /* CONFIG_NET_RX_BUSY_POLL */
> > +
> >  /*
> >   * Wait until events become available, if we don't already have
> > some. The
> >   * application must reap them itself, as they reside on the
> > shared cq ring.
> > @@ -7729,12 +7906,20 @@ static int io_cqring_wait(struct
> > io_ring_ctx *ctx, int min_events,
> >                 if (!io_run_task_work())
> >                         break;
> >         } while (1);
> > -
> > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > +       iowq.busy_poll_to = 0;
> > +#endif
> >         if (uts) {
> >                 struct timespec64 ts;
> >
> >                 if (get_timespec64(&ts, uts))
> >                         return -EFAULT;
> > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > +               if (!(ctx->flags & IORING_SETUP_SQPOLL) &&
> > +                   !list_empty(&ctx->napi_list)) {
> > +                       io_adjust_busy_loop_timeout(&ts, &iowq);
> > +               }
> > +#endif
> >                 timeout = timespec64_to_jiffies(&ts);
> >         }
> >
> > @@ -7759,6 +7944,10 @@ static int io_cqring_wait(struct io_ring_ctx
> > *ctx, int min_events,
> >         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
> >
> >         trace_io_uring_cqring_wait(ctx, min_events);
> > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > +       if (iowq.busy_poll_to)
> > +               io_blocking_napi_busy_loop(ctx, &iowq);
>
> We may not need locks for the napi_list, the reason is we don't need
> to
> poll an accurate list, the busy polling/NAPI itself is kind of
> speculation. So the deletion is not an emergency.
> To say the least, we can probably delay the deletion to some safe
> place
> like the original task's task work though this may cause other
> problems...

There are 2 concerns here.

1. Iterating a list while another thread modifies it is not thread-safe
unless you use a lock.

If we offer napi_busy_poll() without sqpoll with the modification in
io_cqring_wait(), this is a real possibility. A thread could call
io_uring_enter(IORING_ENTER_GETEVENTS) while another thread calls
io_uring_enter() to submit new sqes that could trigger a call to
io_add_napi().

If napi_busy_poll() is only offered through sqpoll thread, this becomes
a non-issue since the only thread accessing/modifying the napi_list
field is the sqpoll thread.

Providing the patch benchmark result with v2 could help deciding what
to do with this choice.
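
A minimal sketch of what such protection could look like (hypothetical; the
'napi_lock' field and the helper name are made up and this is not what the
v1 patch does) would be to take a spinlock around list mutation and
traversal, doing the allocation outside the lock so GFP_KERNEL stays legal:

/*
 * Sketch only: guard ctx->napi_list with a hypothetical spinlock so that
 * io_uring_enter() submitters adding entries cannot race with a waiter
 * iterating the list in io_cqring_wait().
 */
static void io_add_napi_entry(struct io_ring_ctx *ctx, unsigned int napi_id)
{
        struct napi_entry *ne, *pos;

        ne = kmalloc(sizeof(*ne), GFP_KERNEL);
        if (!ne)
                return;
        ne->napi_id = napi_id;
        ne->timeout = jiffies + NAPI_TIMEOUT;

        spin_lock(&ctx->napi_lock);             /* hypothetical field */
        list_for_each_entry(pos, &ctx->napi_list, list) {
                if (pos->napi_id == napi_id) {
                        pos->timeout = ne->timeout;
                        spin_unlock(&ctx->napi_lock);
                        kfree(ne);
                        return;
                }
        }
        list_add_tail(&ne->list, &ctx->napi_list);
        spin_unlock(&ctx->napi_lock);
}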

2. You are correct when you say that deletion is not an emergency.

However, the design guideline that I followed when writing the patch
is that napi_busy_poll support should not impact users not using this
feature. Doing the deletion where the patch is doing it fulfills this
goal.

Comparing a timeout value with the jiffies variable is very cheap and
will only be performed when napi_busy_poll is used.

The other option would be to add a refcount to each napi_entry and
decrement it if needed every time a request is discarded. Doing that
check for every request that io_uring discards on completion would, I
am very confident, negatively impact various performance
benchmarks that Jens routinely performs...

2022-02-25 20:18:29

by Olivier Langlois

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On Fri, 2022-02-25 at 00:32 -0500, Olivier Langlois wrote:
>
> > >
> > > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > > +static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
> > > +                                       struct io_wait_queue
> > > *iowq)
> > > +{
> > > +       unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
> > > +       struct timespec64 pollto = ns_to_timespec64(1000 *
> > > (s64)busy_poll_to);
> > > +
> > > +       if (timespec64_compare(ts, &pollto) > 0) {
> > > +               *ts = timespec64_sub(*ts, pollto);
> > > +               iowq->busy_poll_to = busy_poll_to;
> > > +       } else {
> > > +               iowq->busy_poll_to = timespec64_to_ns(ts) / 1000;
> >
> > How about timespec64_tons(ts) >> 10, since we don't need accurate
> > number.
>
> Fantastic suggestion! The kernel test robot did also detect an issue
> with that statement. I did discover do_div() in the meantime but what
> you suggest is better, IMHO...

After having seen Jens' patch (io_uring: don't convert to jiffies for
waiting on timeouts), I think that I'll stick with do_div().

I have a hard time considering removing timing accuracy when effort is
made to make the same function more accurate...
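
For context, the statement under discussion could be written either way,
roughly as below (an illustrative sketch only; do_div() modifies its first
argument in place and returns the remainder, and it avoids the 64-bit
division helper (__divdi3) that the test robot flagged on 32-bit builds,
while the shift trades a couple of percent of accuracy for no division at
all):

static void set_busy_poll_to(struct io_wait_queue *iowq,
                             struct timespec64 *ts)
{
        u64 to = timespec64_to_ns(ts);

        do_div(to, 1000);               /* exact microseconds */
        iowq->busy_poll_to = to;

        /* alternative suggested in review, cheaper but divides by 1024: */
        /* iowq->busy_poll_to = timespec64_to_ns(ts) >> 10; */
}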
>
>
> > > +                !io_busy_loop_end(iowq, start_time));
> > > +}
> > > +#endif /* CONFIG_NET_RX_BUSY_POLL */
> > > +
> > >  /*
> > >   * Wait until events become available, if we don't already have
> > > some. The
> > >   * application must reap them itself, as they reside on the
> > > shared cq ring.
> > > @@ -7729,12 +7906,20 @@ static int io_cqring_wait(struct
> > > io_ring_ctx *ctx, int min_events,
> > >                 if (!io_run_task_work())
> > >                         break;
> > >         } while (1);
> > > -
> > > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > > +       iowq.busy_poll_to = 0;
> > > +#endif
> > >         if (uts) {
> > >                 struct timespec64 ts;
> > >
> > >                 if (get_timespec64(&ts, uts))
> > >                         return -EFAULT;
> > > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > > +               if (!(ctx->flags & IORING_SETUP_SQPOLL) &&
> > > +                   !list_empty(&ctx->napi_list)) {
> > > +                       io_adjust_busy_loop_timeout(&ts, &iowq);
> > > +               }
> > > +#endif
> > >                 timeout = timespec64_to_jiffies(&ts);
> > >         }
> > >
> > > @@ -7759,6 +7944,10 @@ static int io_cqring_wait(struct
> > > io_ring_ctx
> > > *ctx, int min_events,
> > >         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) +
> > > min_events;
> > >
> > >         trace_io_uring_cqring_wait(ctx, min_events);
> > > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > > +       if (iowq.busy_poll_to)
> > > +               io_blocking_napi_busy_loop(ctx, &iowq);
> >
> > We may not need locks for the napi_list, the reason is we don't
> > need
> > to
> > poll an accurate list, the busy polling/NAPI itself is kind of
> > speculation. So the deletion is not an emergency.
> > To say the least, we can probably delay the deletion to some safe
> > place
> > like the original task's task work though this may cause other
> > problems...
>
> There are 2 concerns here.
>
> 1. Iterating a list while another thread modify it is not thread-safe
> unless you use a lock.
>
> If we offer napi_busy_poll() without sqpoll with the modification in
> io_cqring_wait(), this is a real possibility. A thread could call
> io_uring_enter(IORING_ENTER_GETEVENTS) while another thread calls
> io_uring_enter() to submit new sqes that could trigger a call to
> io_add_napi().
>
> If napi_busy_poll() is only offered through sqpoll thread, this
> becomes
> a non-issue since the only thread accessing/modifying the napi_list
> field is the sqpoll thread.
>
> Providing the patch benchmark result with v2 could help deciding what
> to do with this choice.
>
> 2. You are correct when you say that deletion is not an emergency.
>
> However, the design guideline that I did follow when writing the
> patch
> is that napi_busy_poll support should not impact users not using this
> feature. Doing the deletion where that patch is doing it fullfill
> this
> goal.
>
> Comparing a timeout value with the jiffies variable is very cheap and
> will only be performed when napi_busy_poll is used.
>
> The other option would be to add a refcount to each napi_entry and
> decrement it if needed everytime a request is discarded. Doing that
> that check for every requests that io_uring discards on completion, I
> am very confident that this would negatively impact various
> performance
> benchmarks that Jens routinely perform...
>
Another fact to consider is that I expect the content of napi_list
to be extremely stable. Regular entry deletion should not be a thing.

Postponing the deletion using task work is not an option either. How would
io_busy_loop_end() discern between a pending list entry deletion and
any other task work making the busy looping stop?

2022-02-28 20:13:58

by Hao Xu

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll


On 2/25/22 13:32, Olivier Langlois wrote:
> On Mon, 2022-02-21 at 13:23 +0800, Hao Xu wrote:
>>> @@ -5776,6 +5887,7 @@ static int __io_arm_poll_handler(struct
>>> io_kiocb *req,
>>>                 __io_poll_execute(req, mask);
>>>                 return 0;
>>>         }
>>> +       io_add_napi(req->file, req->ctx);
>> I think this may not be the right place to do it. the process will
>> be:
>> arm_poll sockfdA--> get invalid napi_id from sk->napi_id --> event
>> triggered --> arm_poll for sockfdA again --> get valid napi_id
>> then why not do io_add_napi() in event
>> handler(apoll_task_func/poll_task_func).
> You have a valid concern that the first time a socket is passed to
> io_uring that napi_id might not be assigned yet.
>
> OTOH, getting it after data is available for reading does not help
> neither since busy polling must be done before data is received.
>
> for both places, the extracted napi_id will only be leveraged at the
> next polling.

Hi Olivier,

I think we have some gap here. AFAIK, it's not 'might not', it is
'definitely not': the sk->napi_id won't be valid until the poll callback.

Some driver code FYR (drivers/net/ethernet/intel/e1000/e1000_main.c):

e1000_receive_skb-->napi_gro_receive-->napi_skb_finish-->gro_normal_one

and in gro_normal_one(), it does:

          if (napi->rx_count >= gro_normal_batch)
                  gro_normal_list(napi);

The gro_normal_list() delivers the info up to the specific network
protocol like tcp.

And then sk->napi_id is set; meanwhile the poll callback is triggered.

So that's why I call the napi polling technology a 'speculation'. It's
totally for future data. Correct me if I'm wrong, especially about the
poll callback triggering part.

>
> Your suggestion is superior because it might be the only working way
> for MULTIPOLL requests.
>
> However, I choose __io_arm_poll_handler() because if napi_busy_poll()
> is desired without a sqpoll thread, the context must be locked when
> calling io_add_napi(). This is the case when __io_arm_poll_handler() is
> called AFAIK.
>
> and I don't think that the context is locked when
> (apoll_task_func/poll_task_func) are called.
>
> I acknowledge that this is an issue that needs to be fixed but right
> now I am not sure how to address this so let me share v2 of the patch
> and plan a v3 for at least this pending issue.
>
>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>> +static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
>>> +                                       struct io_wait_queue *iowq)
>>> +{
>>> +       unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
>>> +       struct timespec64 pollto = ns_to_timespec64(1000 *
>>> (s64)busy_poll_to);
>>> +
>>> +       if (timespec64_compare(ts, &pollto) > 0) {
>>> +               *ts = timespec64_sub(*ts, pollto);
>>> +               iowq->busy_poll_to = busy_poll_to;
>>> +       } else {
>>> +               iowq->busy_poll_to = timespec64_to_ns(ts) / 1000;
>> How about timespec64_tons(ts) >> 10, since we don't need accurate
>> number.
> Fantastic suggestion! The kernel test robot did also detect an issue
> with that statement. I did discover do_div() in the meantime but what
> you suggest is better, IMHO...
>
>>> +static void io_blocking_napi_busy_loop(struct io_ring_ctx *ctx,
>>> +                                      struct io_wait_queue *iowq)
>>> +{
>>> +       unsigned long start_time =
>>> +               list_is_singular(&ctx->napi_list) ? 0 :
>>> +               busy_loop_current_time();
>>> +
>>> +       do {
>>> +               if (list_is_singular(&ctx->napi_list)) {
>>> +                       struct napi_entry *ne =
>>> +                               list_first_entry(&ctx->napi_list,
>>> +                                                struct napi_entry,
>>> list);
>>> +
>>> +                       napi_busy_loop(ne->napi_id,
>>> io_busy_loop_end, iowq,
>>> +                                      true, BUSY_POLL_BUDGET);
>>> +                       io_check_napi_entry_timeout(ne);
>>> +                       break;
>>> +               }
>>> +       } while (io_napi_busy_loop(ctx) &&
>> Why don't we setup busy_loop_end callback for normal(non-singular)
>> case,
>> we can record the number of napi_entry, and divide the time frame to
>> each entry.
> This is from intuition that iterating through all the napi devices in a
> 'sprinkler' pattern is the correct way to proceed when handling several
> devices.
>
> If you busy poll the first devices for a certain amount of time and a
> packet is received in the last device, you won't know until you reach
> it which will be much later than with the proposed 'sprinkler' way.
>
> singular case is treated differently because entering/exiting
> napi_busy_loop() incur setup overhead that you don't need for that
> special case.
>
>>> +                !io_busy_loop_end(iowq, start_time));
>>> +}
>>> +#endif /* CONFIG_NET_RX_BUSY_POLL */
>>> +
>>>   /*
>>>    * Wait until events become available, if we don't already have
>>> some. The
>>>    * application must reap them itself, as they reside on the
>>> shared cq ring.
>>> @@ -7729,12 +7906,20 @@ static int io_cqring_wait(struct
>>> io_ring_ctx *ctx, int min_events,
>>>                 if (!io_run_task_work())
>>>                         break;
>>>         } while (1);
>>> -
>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>> +       iowq.busy_poll_to = 0;
>>> +#endif
>>>         if (uts) {
>>>                 struct timespec64 ts;
>>>
>>>                 if (get_timespec64(&ts, uts))
>>>                         return -EFAULT;
>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>> +               if (!(ctx->flags & IORING_SETUP_SQPOLL) &&
>>> +                   !list_empty(&ctx->napi_list)) {
>>> +                       io_adjust_busy_loop_timeout(&ts, &iowq);
>>> +               }
>>> +#endif
>>>                 timeout = timespec64_to_jiffies(&ts);
>>>         }
>>>
>>> @@ -7759,6 +7944,10 @@ static int io_cqring_wait(struct io_ring_ctx
>>> *ctx, int min_events,
>>>         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
>>>
>>>         trace_io_uring_cqring_wait(ctx, min_events);
>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>> +       if (iowq.busy_poll_to)
>>> +               io_blocking_napi_busy_loop(ctx, &iowq);
>> We may not need locks for the napi_list, the reason is we don't need
>> to
>> poll an accurate list, the busy polling/NAPI itself is kind of
>> speculation. So the deletion is not an emergency.
>> To say the least, we can probably delay the deletion to some safe
>> place
>> like the original task's task work though this may cause other
>> problems...
> There are 2 concerns here.
>
> 1. Iterating a list while another thread modifies it is not thread-safe
> unless you use a lock.
>
> If we offer napi_busy_poll() without sqpoll with the modification in
> io_cqring_wait(), this is a real possibility. A thread could call
> io_uring_enter(IORING_ENTER_GETEVENTS) while another thread calls
> io_uring_enter() to submit new sqes that could trigger a call to
> io_add_napi().

Thanks, I forgot the io_add_napi() part. Yes, we have to ensure that an
entry being added really gets added... so a lock is necessary.

I knew there could be multiple threads accessing the napi_list as you
described above, but if there were only deletions, the lock might be
avoided since we just want it not to crash.

>
> If napi_busy_poll() is only offered through sqpoll thread, this becomes
> a non-issue since the only thread accessing/modifying the napi_list
> field is the sqpoll thread.
>
> Providing the patch benchmark result with v2 could help deciding what
> to do with this choice.
>
> 2. You are correct when you say that deletion is not an emergency.
>
> However, the design guideline that I did follow when writing the patch
> is that napi_busy_poll support should not impact users not using this
> feature. Doing the deletion where that patch is doing it fulfills this
> goal.
>
> Comparing a timeout value with the jiffies variable is very cheap and
> will only be performed when napi_busy_poll is used.
>
> The other option would be to add a refcount to each napi_entry and
> decrement it if needed every time a request is discarded. Doing that
> check for every request that io_uring discards on completion, I am
> very confident that this would negatively impact various performance
> benchmarks that Jens routinely performs...

2022-02-28 20:17:26

by Hao Xu

[permalink] [raw]
Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll


On 2/25/22 23:32, Olivier Langlois wrote:
> On Fri, 2022-02-25 at 00:32 -0500, Olivier Langlois wrote:
>>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>>> +static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
>>>> +                                       struct io_wait_queue
>>>> *iowq)
>>>> +{
>>>> +       unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
>>>> +       struct timespec64 pollto = ns_to_timespec64(1000 *
>>>> (s64)busy_poll_to);
>>>> +
>>>> +       if (timespec64_compare(ts, &pollto) > 0) {
>>>> +               *ts = timespec64_sub(*ts, pollto);
>>>> +               iowq->busy_poll_to = busy_poll_to;
>>>> +       } else {
>>>> +               iowq->busy_poll_to = timespec64_to_ns(ts) / 1000;
>>> How about timespec64_to_ns(ts) >> 10, since we don't need an accurate
>>> number.
>> Fantastic suggestion! The kernel test robot did also detect an issue
>> with that statement. I did discover do_div() in the meantime but what
>> you suggest is better, IMHO...
> After having seen Jens patch (io_uring: don't convert to jiffies for
> waiting on timeouts), I think that I'll stick with do_div().
>
> I have a hard time considering removing timing accuracy when effort is
> made to make the same function more accurate...


I think they are different things. Jens' patch is to resolve the problem
that jiffies possibly cannot represent times < 1ms (when HZ is 1000).

For example, a user asks for 10us and ends up with 1ms, which is a big
difference. But dividing by 1000 versus 1024 is not that different in
this case.

>>
>>>> +                !io_busy_loop_end(iowq, start_time));
>>>> +}
>>>> +#endif /* CONFIG_NET_RX_BUSY_POLL */
>>>> +
>>>>   /*
>>>>    * Wait until events become available, if we don't already have
>>>> some. The
>>>>    * application must reap them itself, as they reside on the
>>>> shared cq ring.
>>>> @@ -7729,12 +7906,20 @@ static int io_cqring_wait(struct
>>>> io_ring_ctx *ctx, int min_events,
>>>>                 if (!io_run_task_work())
>>>>                         break;
>>>>         } while (1);
>>>> -
>>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>>> +       iowq.busy_poll_to = 0;
>>>> +#endif
>>>>         if (uts) {
>>>>                 struct timespec64 ts;
>>>>
>>>>                 if (get_timespec64(&ts, uts))
>>>>                         return -EFAULT;
>>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>>> +               if (!(ctx->flags & IORING_SETUP_SQPOLL) &&
>>>> +                   !list_empty(&ctx->napi_list)) {
>>>> +                       io_adjust_busy_loop_timeout(&ts, &iowq);
>>>> +               }
>>>> +#endif
>>>>                 timeout = timespec64_to_jiffies(&ts);
>>>>         }
>>>>
>>>> @@ -7759,6 +7944,10 @@ static int io_cqring_wait(struct
>>>> io_ring_ctx
>>>> *ctx, int min_events,
>>>>         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) +
>>>> min_events;
>>>>
>>>>         trace_io_uring_cqring_wait(ctx, min_events);
>>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>>> +       if (iowq.busy_poll_to)
>>>> +               io_blocking_napi_busy_loop(ctx, &iowq);
>>> We may not need locks for the napi_list, the reason is we don't
>>> need
>>> to
>>> poll an accurate list, the busy polling/NAPI itself is kind of
>>> speculation. So the deletion is not an emergency.
>>> To say the least, we can probably delay the deletion to some safe
>>> place
>>> like the original task's task work though this may cause other
>>> problems...
>> There are 2 concerns here.
>>
>> 1. Iterating a list while another thread modifies it is not thread-safe
>> unless you use a lock.
>>
>> If we offer napi_busy_poll() without sqpoll with the modification in
>> io_cqring_wait(), this is a real possibility. A thread could call
>> io_uring_enter(IORING_ENTER_GETEVENTS) while another thread calls
>> io_uring_enter() to submit new sqes that could trigger a call to
>> io_add_napi().
>>
>> If napi_busy_poll() is only offered through sqpoll thread, this
>> becomes
>> a non-issue since the only thread accessing/modifying the napi_list
>> field is the sqpoll thread.
>>
>> Providing the patch benchmark result with v2 could help deciding what
>> to do with this choice.
>>
>> 2. You are correct when you say that deletion is not an emergency.
>>
>> However, the design guideline that I did follow when writing the
>> patch
>> is that napi_busy_poll support should not impact users not using this
>> feature. Doing the deletion where that patch is doing it fulfills
>> this
>> goal.
>>
>> Comparing a timeout value with the jiffies variable is very cheap and
>> will only be performed when napi_busy_poll is used.
>>
>> The other option would be to add a refcount to each napi_entry and
>> decrement it if needed every time a request is discarded. Doing that
>> check for every request that io_uring discards on completion, I am
>> very confident that this would negatively impact various
>> performance
>> benchmarks that Jens routinely performs...
>>
> Another fact to consider is that I expect the content of napi_list
> to be extremely stable. Regular entry deletion should not be a thing.
>
> Postponing the deletion using task work is not an option either. How would
> io_busy_loop_end() discern between a pending list entry deletion and
> any other task work making the busy looping stop?

2022-02-28 21:02:24

by Olivier Langlois

[permalink] [raw]
Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On Tue, 2022-03-01 at 02:26 +0800, Hao Xu wrote:
>
> On 2/25/22 13:32, Olivier Langlois wrote:
> > On Mon, 2022-02-21 at 13:23 +0800, Hao Xu wrote:
> > > > @@ -5776,6 +5887,7 @@ static int __io_arm_poll_handler(struct
> > > > io_kiocb *req,
> > > >                 __io_poll_execute(req, mask);
> > > >                 return 0;
> > > >         }
> > > > +       io_add_napi(req->file, req->ctx);
> > > I think this may not be the right place to do it. the process
> > > will
> > > be:
> > > arm_poll sockfdA--> get invalid napi_id from sk->napi_id -->
> > > event
> > > triggered --> arm_poll for sockfdA again --> get valid napi_id
> > > then why not do io_add_napi() in event
> > > handler(apoll_task_func/poll_task_func).
> > You have a valid concern that the first time a socket is passed to
> > io_uring that napi_id might not be assigned yet.
> >
> > OTOH, getting it after data is available for reading does not help
> > either, since busy polling must be done before data is received.
> >
> > for both places, the extracted napi_id will only be leveraged at
> > the
> > next polling.
>
> Hi Olivier,
>
> I think we have some gap here. AFAIK, it's not 'might not', it is
>
> 'definitely not', the sk->napi_id won't be valid until the poll
> callback.
>
> Some driver's code FYR:
> (drivers/net/ethernet/intel/e1000/e1000_main.c)
>
> e1000_receive_skb-->napi_gro_receive-->napi_skb_finish-->gro_normal_one
>
> and in gro_normal_one(), it does:
>
>            if (napi->rx_count >= gro_normal_batch)
>                    gro_normal_list(napi);
>
>
> The gro_normal_list() delivers the info up to the specific network
> protocol like tcp.
>
> And then sk->napi_id is set, meanwhile the poll callback is
> triggered.
>
> So that's why I call the napi polling technology a 'speculation'.
> It's
> totally for the
>
> future data. Correct me if I'm wrong especially for the poll callback
> triggering part.
>
When I said 'might not', I meant that from the io_uring point of view,
it has no idea what the previous socket usage was. If the socket has
been used outside io_uring, the napi_id could be available on the first
call.

If it is really a virgin socket, neither my chosen call site nor your
proposed sites will make napi busy poll possible for the first poll.

I feel there is not much to gain from arguing this point since I pretty
much admitted that your solution is most likely the only call site
making MULTIPOLL requests work correctly with napi busy poll, as those
requests could visit __io_arm_poll_handler() only once (correct me if
my statement is wrong).

The only issue is that I wasn't sure how using your call sites would
make the locking work.

I suppose that adding a dedicated spinlock for protecting napi_list
instead of relying on uring_lock could be a solution. Would that work?
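
To be concrete, I mean a spinlock_t napi_lock sitting next to napi_list
in io_ring_ctx, initialized in io_ring_ctx_alloc(), with the add path
becoming roughly the following (untested sketch; the io_napi_add_id()
name and the allocate-before-locking ordering are mine, not what v1
does):

        static void io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
        {
                struct napi_entry *ne, *tmp;

                /* allocate before taking the lock so GFP_KERNEL stays usable */
                tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);

                spin_lock(&ctx->napi_lock);
                list_for_each_entry(ne, &ctx->napi_list, list) {
                        if (ne->napi_id == napi_id) {
                                /* already tracked, just refresh its timeout */
                                ne->timeout = jiffies + NAPI_TIMEOUT;
                                goto out;
                        }
                }
                if (tmp) {
                        tmp->napi_id = napi_id;
                        tmp->timeout = jiffies + NAPI_TIMEOUT;
                        list_add_tail(&tmp->list, &ctx->napi_list);
                        tmp = NULL;
                }
        out:
                spin_unlock(&ctx->napi_lock);
                kfree(tmp);     /* drop the unused allocation, if any */
        }

The deletions would take the same lock. The busy-loop walk is the only
tricky part since we cannot keep a spinlock held across
napi_busy_loop(), which may reschedule.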

2022-02-28 21:29:15

by Olivier Langlois

[permalink] [raw]
Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On Tue, 2022-03-01 at 02:34 +0800, Hao Xu wrote:
>
> On 2/25/22 23:32, Olivier Langlois wrote:
> > On Fri, 2022-02-25 at 00:32 -0500, Olivier Langlois wrote:
> > > > > +#ifdef CONFIG_NET_RX_BUSY_POLL
> > > > > +static void io_adjust_busy_loop_timeout(struct timespec64
> > > > > *ts,
> > > > > +                                       struct io_wait_queue
> > > > > *iowq)
> > > > > +{
> > > > > +       unsigned busy_poll_to =
> > > > > READ_ONCE(sysctl_net_busy_poll);
> > > > > +       struct timespec64 pollto = ns_to_timespec64(1000 *
> > > > > (s64)busy_poll_to);
> > > > > +
> > > > > +       if (timespec64_compare(ts, &pollto) > 0) {
> > > > > +               *ts = timespec64_sub(*ts, pollto);
> > > > > +               iowq->busy_poll_to = busy_poll_to;
> > > > > +       } else {
> > > > > +               iowq->busy_poll_to = timespec64_to_ns(ts) /
> > > > > 1000;
> > > > How about timespec64_to_ns(ts) >> 10, since we don't need an
> > > > accurate
> > > > number.
> > > Fantastic suggestion! The kernel test robot did also detect an
> > > issue
> > > with that statement. I did discover do_div() in the meantime but
> > > what
> > > you suggest is better, IMHO...
> > After having seen Jens patch (io_uring: don't convert to jiffies
> > for
> > waiting on timeouts), I think that I'll stick with do_div().
> >
> > I have a hard time considering removing timing accuracy when effort
> > is
> > made to make the same function more accurate...
>
>
> I think they are different things. Jens' patch is to resolve the
> problem that jiffies possibly cannot represent times < 1ms (when HZ is
> 1000).
>
> For example, a user asks for 10us and ends up with 1ms, which is a big
> difference. But dividing by 1000 versus 1024 is not that different in
> this case.
>
> >
idk... For every 100uSec slice, dividing by 1024 will introduce a
~2.4uSec error. I didn't dig into the question enough to figure out
whether that error is smaller than the accuracy of the clock being used.

But even if the error is small, why let it slip in when a 100% accurate
value is possible?

Besides having to keep the painfully picky do_div() macro happy on some
platforms, I fail to understand the problem with doing a division to
get an accurate value.

Let me reverse the question: even if the bit shifting is a bit faster
than doing the division, would this code be called often enough to make
a significant difference?
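
Concretely, I mean something along these lines (sketch only; the to_us
local is just for illustration):

        u64 to_us = timespec64_to_ns(ts);

        /* exact conversion to microseconds; do_div() modifies to_us in place */
        do_div(to_us, NSEC_PER_USEC);
        iowq->busy_poll_to = to_us;

        /* versus the approximation: iowq->busy_poll_to = timespec64_to_ns(ts) >> 10; */

do_div() keeps the 64-bit division working on 32-bit builds, which I
assume is what the test robot was unhappy about.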

2022-03-01 07:06:41

by Hao Xu

[permalink] [raw]
Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On 2022/3/1 5:20 AM, Olivier Langlois wrote:
> On Tue, 2022-03-01 at 02:34 +0800, Hao Xu wrote:
>>
>> On 2/25/22 23:32, Olivier Langlois wrote:
>>> On Fri, 2022-02-25 at 00:32 -0500, Olivier Langlois wrote:
>>>>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>>>>> +static void io_adjust_busy_loop_timeout(struct timespec64
>>>>>> *ts,
>>>>>> +                                       struct io_wait_queue
>>>>>> *iowq)
>>>>>> +{
>>>>>> +       unsigned busy_poll_to =
>>>>>> READ_ONCE(sysctl_net_busy_poll);
>>>>>> +       struct timespec64 pollto = ns_to_timespec64(1000 *
>>>>>> (s64)busy_poll_to);
>>>>>> +
>>>>>> +       if (timespec64_compare(ts, &pollto) > 0) {
>>>>>> +               *ts = timespec64_sub(*ts, pollto);
>>>>>> +               iowq->busy_poll_to = busy_poll_to;
>>>>>> +       } else {
>>>>>> +               iowq->busy_poll_to = timespec64_to_ns(ts) /
>>>>>> 1000;
>>>>> How about timespec64_to_ns(ts) >> 10, since we don't need an
>>>>> accurate
>>>>> number.
>>>> Fantastic suggestion! The kernel test robot did also detect an
>>>> issue
>>>> with that statement. I did discover do_div() in the meantime but
>>>> what
>>>> you suggest is better, IMHO...
>>> After having seen Jens patch (io_uring: don't convert to jiffies
>>> for
>>> waiting on timeouts), I think that I'll stick with do_div().
>>>
>>> I have a hard time considering removing timing accuracy when effort
>>> is
>>> made to make the same function more accurate...
>>
>>
>> I think they are different things. Jens' patch is to resolve the
>> problem that jiffies possibly cannot represent times < 1ms (when HZ is
>> 1000).
>>
>> For example, a user asks for 10us and ends up with 1ms, which is a big
>> difference. But dividing by 1000 versus 1024 is not that different in
>> this case.
>>
>>>
> idk... For every 100uSec slice, dividing by 1024 will introduce a
> ~2.4uSec error. I didn't dig into the question enough to figure out
> whether that error is smaller than the accuracy of the clock being used.
>
> But even if the error is small, why let it slip in when a 100% accurate
> value is possible?
>
> Besides having to keep the painfully picky do_div() macro happy on some
> platforms, I fail to understand the problem with doing a division to
> get an accurate value.
>
> Let me reverse the question: even if the bit shifting is a bit faster
> than doing the division, would this code be called often enough to make
> a significant difference?
It's just my personal preference: when a faster way is acceptable, I
just choose that one. For this one, do_div() should be ok since that
code is not hot in most cases. But it all depends on your test results.

Regards,
Hao

2022-03-01 09:36:40

by Hao Xu

[permalink] [raw]
Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll


On 3/1/22 05:01, Olivier Langlois wrote:
> On Tue, 2022-03-01 at 02:26 +0800, Hao Xu wrote:
>> On 2/25/22 13:32, Olivier Langlois wrote:
>>> On Mon, 2022-02-21 at 13:23 +0800, Hao Xu wrote:
>>>>> @@ -5776,6 +5887,7 @@ static int __io_arm_poll_handler(struct
>>>>> io_kiocb *req,
>>>>>                  __io_poll_execute(req, mask);
>>>>>                  return 0;
>>>>>          }
>>>>> +       io_add_napi(req->file, req->ctx);
>>>> I think this may not be the right place to do it. the process
>>>> will
>>>> be:
>>>> arm_poll sockfdA--> get invalid napi_id from sk->napi_id -->
>>>> event
>>>> triggered --> arm_poll for sockfdA again --> get valid napi_id
>>>> then why not do io_add_napi() in event
>>>> handler(apoll_task_func/poll_task_func).
>>> You have a valid concern that the first time a socket is passed to
>>> io_uring that napi_id might not be assigned yet.
>>>
>>> OTOH, getting it after data is available for reading does not help
>>> either, since busy polling must be done before data is received.
>>>
>>> for both places, the extracted napi_id will only be leveraged at
>>> the
>>> next polling.
>> Hi Olivier,
>>
>> I think we have some gap here. AFAIK, it's not 'might not', it is
>>
>> 'definitely not', the sk->napi_id won't be valid until the poll
>> callback.
>>
>> Some driver's code FYR:
>> (drivers/net/ethernet/intel/e1000/e1000_main.c)
>>
>> e1000_receive_skb-->napi_gro_receive-->napi_skb_finish-->gro_normal_one
>> and in gro_normal_one(), it does:
>>
>>            if (napi->rx_count >= gro_normal_batch)
>>                    gro_normal_list(napi);
>>
>>
>> The gro_normal_list() delivers the info up to the specific network
>> protocol like tcp.
>>
>> And then sk->napi_id is set, meanwhile the poll callback is
>> triggered.
>>
>> So that's why I call the napi polling technology a 'speculation'.
>> It's
>> totally for the
>>
>> future data. Correct me if I'm wrong especially for the poll callback
>> triggering part.
>>
> When I said 'might not', I meant that from the io_uring point of view,
> it has no idea what the previous socket usage was. If the socket has
> been used outside io_uring, the napi_id could be available on the first
> call.
>
> If it is really a virgin socket, neither my chosen call site nor your
> proposed sites will make napi busy poll possible for the first poll.
>
> I feel there is not much to gain from arguing this point since I pretty
> much admitted that your solution is most likely the only call site
> making MULTIPOLL requests work correctly with napi busy poll, as those
> requests could visit __io_arm_poll_handler() only once (correct me if
> my statement is wrong).
>
> The only issue is that I wasn't sure how using your call sites would
> make the locking work.
>
> I suppose that adding a dedicated spinlock for protecting napi_list
> instead of relying on uring_lock could be a solution. Would that work?
A spinlock should be fine.