LinuxLists.cc - [PATCH v1] io_uring: Add support for napi_busy

2022-02-19 18:43:19

Subject: [PATCH v1] io_uring: Add support for napi_busy_poll

The sqpoll thread can be used for performing the napi busy poll in a
similar way that it does io polling for file systems supporting direct
access bypassing the page cache.

The other way that io_uring can be used for napi busy poll is by
calling io_uring_enter() to get events.

If the user specify a timeout value, it is distributed between polling
and sleeping by using the systemwide setting
/proc/sys/net/core/busy_poll.

Co-developed-by: Hao Xu <[email protected]>
Signed-off-by: Hao Xu <[email protected]>
Signed-off-by: Olivier Langlois <[email protected]>
---
fs/io_uring.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 192 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77b9c7e4793b..0ed06f024e79 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -63,6 +63,7 @@
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
+#include <net/busy_poll.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -395,6 +396,10 @@ struct io_ring_ctx {
struct list_head sqd_list;

unsigned long check_cq_overflow;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* used to track busy poll napi_id */
+ struct list_head napi_list;
+#endif

struct {
unsigned cached_cq_tail;
@@ -1464,6 +1469,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+ INIT_LIST_HEAD(&ctx->napi_list);
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -5398,6 +5404,111 @@ IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */

+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
+
+struct napi_entry {
+ struct list_head list;
+ unsigned int napi_id;
+ unsigned long timeout;
+};
+
+/*
+ * Add busy poll NAPI ID from sk.
+ */
+static void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
+{
+ unsigned int napi_id;
+ struct socket *sock;
+ struct sock *sk;
+ struct napi_entry *ne;
+
+ if (!net_busy_loop_on())
+ return;
+
+ sock = sock_from_file(file);
+ if (!sock)
+ return;
+
+ sk = sock->sk;
+ if (!sk)
+ return;
+
+ napi_id = READ_ONCE(sk->sk_napi_id);
+
+ /* Non-NAPI IDs can be rejected */
+ if (napi_id < MIN_NAPI_ID)
+ return;
+
+ list_for_each_entry(ne, &ctx->napi_list, list) {
+ if (ne->napi_id == napi_id) {
+ ne->timeout = jiffies + NAPI_TIMEOUT;
+ return;
+ }
+ }
+
+ ne = kmalloc(sizeof(*ne), GFP_KERNEL);
+ if (!ne)
+ return;
+
+ ne->napi_id = napi_id;
+ ne->timeout = jiffies + NAPI_TIMEOUT;
+ list_add_tail(&ne->list, &ctx->napi_list);
+}
+
+static inline void io_check_napi_entry_timeout(struct napi_entry *ne)
+{
+ if (time_after(jiffies, ne->timeout)) {
+ list_del(&ne->list);
+ kfree(ne);
+ }
+}
+
+/*
+ * Busy poll if globally on and supporting sockets found
+ */
+static bool io_napi_busy_loop(struct io_ring_ctx *ctx)
+{
+ struct napi_entry *ne, *n;
+
+ if (list_empty(&ctx->napi_list))
+ return false;
+
+ list_for_each_entry_safe(ne, n, &ctx->napi_list, list) {
+ napi_busy_loop(ne->napi_id, NULL, NULL, true,
+ BUSY_POLL_BUDGET);
+ io_check_napi_entry_timeout(ne);
+ }
+ return !list_empty(&ctx->napi_list);
+}
+
+static void io_free_napi_list(struct io_ring_ctx *ctx)
+{
+ while (!list_empty(&ctx->napi_list)) {
+ struct napi_entry *ne =
+ list_first_entry(&ctx->napi_list, struct napi_entry,
+ list);
+
+ list_del(&ne->list);
+ kfree(ne);
+ }
+}
+#else
+static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
+{
+}
+
+static inline bool io_napi_busy_loop(struct io_ring_ctx *ctx)
+{
+ return false;
+}
+
+static inline void io_free_napi_list(struct io_ring_ctx *ctx)
+{
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
@@ -5776,6 +5887,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
__io_poll_execute(req, mask);
return 0;
}
+ io_add_napi(req->file, req->ctx);

/*
* Release ownership. If someone tried to queue a tw while it was
@@ -7518,7 +7630,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
-
+ if (io_napi_busy_loop(ctx))
+ ++ret;
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
wake_up(&ctx->sqo_sq_wait);
if (creds)
@@ -7649,6 +7762,9 @@ struct io_wait_queue {
struct io_ring_ctx *ctx;
unsigned cq_tail;
unsigned nr_timeouts;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned busy_poll_to;
+#endif
};

static inline bool io_should_wake(struct io_wait_queue *iowq)
@@ -7709,6 +7825,67 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
return !*timeout ? -ETIME : 1;
}

+#ifdef CONFIG_NET_RX_BUSY_POLL
+static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
+ struct io_wait_queue *iowq)
+{
+ unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+ struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to);
+
+ if (timespec64_compare(ts, &pollto) > 0) {
+ *ts = timespec64_sub(*ts, pollto);
+ iowq->busy_poll_to = busy_poll_to;
+ } else {
+ iowq->busy_poll_to = timespec64_to_ns(ts) / 1000;
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+ }
+}
+
+static inline bool io_busy_loop_timeout(unsigned long start_time,
+ unsigned long bp_usec)
+{
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ }
+ return true;
+}
+
+static bool io_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct io_wait_queue *iowq = p;
+
+ return signal_pending(current) ||
+ io_should_wake(iowq) ||
+ io_busy_loop_timeout(start_time, iowq->busy_poll_to);
+}
+
+static void io_blocking_napi_busy_loop(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq)
+{
+ unsigned long start_time =
+ list_is_singular(&ctx->napi_list) ? 0 :
+ busy_loop_current_time();
+
+ do {
+ if (list_is_singular(&ctx->napi_list)) {
+ struct napi_entry *ne =
+ list_first_entry(&ctx->napi_list,
+ struct napi_entry, list);
+
+ napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq,
+ true, BUSY_POLL_BUDGET);
+ io_check_napi_entry_timeout(ne);
+ break;
+ }
+ } while (io_napi_busy_loop(ctx) &&
+ !io_busy_loop_end(iowq, start_time));
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -7729,12 +7906,20 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!io_run_task_work())
break;
} while (1);
-
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ iowq.busy_poll_to = 0;
+#endif
if (uts) {
struct timespec64 ts;

if (get_timespec64(&ts, uts))
return -EFAULT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (!(ctx->flags & IORING_SETUP_SQPOLL) &&
+ !list_empty(&ctx->napi_list)) {
+ io_adjust_busy_loop_timeout(&ts, &iowq);
+ }
+#endif
timeout = timespec64_to_jiffies(&ts);
}

@@ -7759,6 +7944,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;

trace_io_uring_cqring_wait(ctx, min_events);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (iowq.busy_poll_to)
+ io_blocking_napi_busy_loop(ctx, &iowq);
+#endif
do {
/* if we can't even flush overflow, don't wait for more */
if (!io_cqring_overflow_flush(ctx)) {
@@ -9440,6 +9629,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
+ io_free_napi_list(ctx);
mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
--
2.35.1

2022-02-20 04:20:14

by Olivier Langlois

[permalink] [raw]

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

One side effect that I have discovered from testing the napi_busy_poll
patch, despite improving the network timing of the threads performing
the busy poll, it is the networking performance degradation that it has
on the rest of the system.

I dedicate isolated CPUS to specific threads of my program. My kernel
is compiled with CONFIG_NO_HZ_FULL. One thing that I have never really
understood is why there were still kernel threads assigned to the
isolated CPUs.

$ CORENUM=2; ps -L -e -o pid,psr,cpu,cmd | grep -E
"^[[:space:]]+[[:digit:]]+[[:space:]]+${CORENUM}"
24 2 - [cpuhp/2]
25 2 - [idle_inject/2]
26 2 - [migration/2]
27 2 - [ksoftirqd/2]
28 2 - [kworker/2:0-events]
29 2 - [kworker/2:0H]
83 2 - [kworker/2:1-mm_percpu_wq]

It is very hard to keep the CPU 100% tickless if there are still tasks
assigned to isolated CPUs by the kernel.

This question isn't really answered anywhere AFAIK:
https://www.kernel.org/doc/html/latest/timers/no_hz.html
https://jeremyeder.com/2013/11/15/nohz_fullgodmode/

Those threads running on their dedicated CPUS are the ones doing the
NAPI busy polling. Because of that, those CPUs usage ramp up to 100%
and running ping on the side is now having horrible numbers:

[2022-02-19 07:27:54] INFO SOCKPP/ping ping results for 10 loops:
0. 104.16.211.191 rtt min/avg/max/mdev = 9.926/34.987/80.048/17.016 ms
1. 104.16.212.191 rtt min/avg/max/mdev = 9.861/34.934/79.986/17.019 ms
2. 104.16.213.191 rtt min/avg/max/mdev = 9.876/34.949/79.965/16.997 ms
3. 104.16.214.191 rtt min/avg/max/mdev = 9.852/34.927/79.977/17.019 ms
4. 104.16.215.191 rtt min/avg/max/mdev = 9.869/34.943/79.958/16.997 ms

Doing this:
echo 990000 > /proc/sys/kernel/sched_rt_runtime_us

as instructed here:
https://www.kernel.org/doc/html/latest/scheduler/sched-rt-group.html

fix the problem:

$ ping 104.16.211.191
PING 104.16.211.191 (104.16.211.191) 56(84) bytes of data.
64 bytes from 104.16.211.191: icmp_seq=1 ttl=62 time=1.05 ms
64 bytes from 104.16.211.191: icmp_seq=2 ttl=62 time=0.812 ms
64 bytes from 104.16.211.191: icmp_seq=3 ttl=62 time=0.864 ms
64 bytes from 104.16.211.191: icmp_seq=4 ttl=62 time=0.846 ms
64 bytes from 104.16.211.191: icmp_seq=5 ttl=62 time=1.23 ms
64 bytes from 104.16.211.191: icmp_seq=6 ttl=62 time=0.957 ms
64 bytes from 104.16.211.191: icmp_seq=7 ttl=62 time=1.10 ms
^C
--- 104.16.211.191 ping statistics ---
7 packets transmitted, 7 received, 0% packet loss, time 6230ms
rtt min/avg/max/mdev = 0.812/0.979/1.231/0.142 ms

If I was to guess, I would say that it is ksoftirqd on those CPUs that
is starving and is not servicing the network packets but I wish that I
had a better understanding of what is really happening and know if it
would be possible to keep 100% those processors dedicated to my tasks
and have the network softirqs handled somewhere else to not have to
tweak /proc/sys/kernel/sched_rt_runtime_us to fix the issue...

2022-02-20 19:51:40

by Jens Axboe

[permalink] [raw]

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On 2/19/22 2:42 PM, Olivier Langlois wrote:
> One side effect that I have discovered from testing the napi_busy_poll
> patch, despite improving the network timing of the threads performing
> the busy poll, it is the networking performance degradation that it has
> on the rest of the system.
>
> I dedicate isolated CPUS to specific threads of my program. My kernel
> is compiled with CONFIG_NO_HZ_FULL. One thing that I have never really
> understood is why there were still kernel threads assigned to the
> isolated CPUs.
>
> $ CORENUM=2; ps -L -e -o pid,psr,cpu,cmd | grep -E
> "^[[:space:]]+[[:digit:]]+[[:space:]]+${CORENUM}"
> 24 2 - [cpuhp/2]
> 25 2 - [idle_inject/2]
> 26 2 - [migration/2]
> 27 2 - [ksoftirqd/2]
> 28 2 - [kworker/2:0-events]
> 29 2 - [kworker/2:0H]
> 83 2 - [kworker/2:1-mm_percpu_wq]
>
> It is very hard to keep the CPU 100% tickless if there are still tasks
> assigned to isolated CPUs by the kernel.
>
> This question isn't really answered anywhere AFAIK:
> https://www.kernel.org/doc/html/latest/timers/no_hz.html
> https://jeremyeder.com/2013/11/15/nohz_fullgodmode/
>
> Those threads running on their dedicated CPUS are the ones doing the
> NAPI busy polling. Because of that, those CPUs usage ramp up to 100%
> and running ping on the side is now having horrible numbers:
>
> [2022-02-19 07:27:54] INFO SOCKPP/ping ping results for 10 loops:
> 0. 104.16.211.191 rtt min/avg/max/mdev = 9.926/34.987/80.048/17.016 ms
> 1. 104.16.212.191 rtt min/avg/max/mdev = 9.861/34.934/79.986/17.019 ms
> 2. 104.16.213.191 rtt min/avg/max/mdev = 9.876/34.949/79.965/16.997 ms
> 3. 104.16.214.191 rtt min/avg/max/mdev = 9.852/34.927/79.977/17.019 ms
> 4. 104.16.215.191 rtt min/avg/max/mdev = 9.869/34.943/79.958/16.997 ms
>
> Doing this:
> echo 990000 > /proc/sys/kernel/sched_rt_runtime_us
>
> as instructed here:
> https://www.kernel.org/doc/html/latest/scheduler/sched-rt-group.html
>
> fix the problem:
>
> $ ping 104.16.211.191
> PING 104.16.211.191 (104.16.211.191) 56(84) bytes of data.
> 64 bytes from 104.16.211.191: icmp_seq=1 ttl=62 time=1.05 ms
> 64 bytes from 104.16.211.191: icmp_seq=2 ttl=62 time=0.812 ms
> 64 bytes from 104.16.211.191: icmp_seq=3 ttl=62 time=0.864 ms
> 64 bytes from 104.16.211.191: icmp_seq=4 ttl=62 time=0.846 ms
> 64 bytes from 104.16.211.191: icmp_seq=5 ttl=62 time=1.23 ms
> 64 bytes from 104.16.211.191: icmp_seq=6 ttl=62 time=0.957 ms
> 64 bytes from 104.16.211.191: icmp_seq=7 ttl=62 time=1.10 ms
> ^C
> --- 104.16.211.191 ping statistics ---
> 7 packets transmitted, 7 received, 0% packet loss, time 6230ms
> rtt min/avg/max/mdev = 0.812/0.979/1.231/0.142 ms
>
> If I was to guess, I would say that it is ksoftirqd on those CPUs that
> is starving and is not servicing the network packets but I wish that I
> had a better understanding of what is really happening and know if it
> would be possible to keep 100% those processors dedicated to my tasks
> and have the network softirqs handled somewhere else to not have to
> tweak /proc/sys/kernel/sched_rt_runtime_us to fix the issue...

Outside of this, I was hoping to see some performance numbers in the
main patch. Sounds like you have them, can you share?

--
Jens Axboe

2022-02-21 03:46:32

by kernel test robot

[permalink] [raw]

在 2022/3/1 上午5:20, Olivier Langlois 写道:
> On Tue, 2022-03-01 at 02:34 +0800, Hao Xu wrote:
>>
>> On 2/25/22 23:32, Olivier Langlois wrote:
>>> On Fri, 2022-02-25 at 00:32 -0500, Olivier Langlois wrote:
>>>>>> +#ifdef CONFIG_NET_RX_BUSY_POLL
>>>>>> +static void io_adjust_busy_loop_timeout(struct timespec64
>>>>>> *ts,
>>>>>> +                                       struct io_wait_queue
>>>>>> *iowq)
>>>>>> +{
>>>>>> +       unsigned busy_poll_to =
>>>>>> READ_ONCE(sysctl_net_busy_poll);
>>>>>> +       struct timespec64 pollto = ns_to_timespec64(1000 *
>>>>>> (s64)busy_poll_to);
>>>>>> +
>>>>>> +       if (timespec64_compare(ts, &pollto) > 0) {
>>>>>> +               *ts = timespec64_sub(*ts, pollto);
>>>>>> +               iowq->busy_poll_to = busy_poll_to;
>>>>>> +       } else {
>>>>>> +               iowq->busy_poll_to = timespec64_to_ns(ts) /
>>>>>> 1000;
>>>>> How about timespec64_tons(ts) >> 10, since we don't need
>>>>> accurate
>>>>> number.
>>>> Fantastic suggestion! The kernel test robot did also detect an
>>>> issue
>>>> with that statement. I did discover do_div() in the meantime but
>>>> what
>>>> you suggest is better, IMHO...
>>> After having seen Jens patch (io_uring: don't convert to jiffies
>>> for
>>> waiting on timeouts), I think that I'll stick with do_div().
>>>
>>> I have a hard time considering removing timing accuracy when effort
>>> is
>>> made to make the same function more accurate...
>>
>>
>> I think they are different things. Jens' patch is to resolve the
>> problem
>>
>> that jiffies possibly can not stand for time < 1ms (when HZ is 1000).
>>
>> For example, a user assigns 10us, turn out to be 1ms, it's big
>> difference.
>>
>> But divided by 1000 or 1024 is not that quite different in this case.
>>
>>>
> idk... For every 100uSec slice, dividing by 1024 will introduce a
> ~2.4uSec error. I didn't dig enough the question to figure out if the
> error was smaller than the used clock accuracy.
>
> but even if the error is small, why letting it slip in when 100%
> accurate value is possible?
>
> Beside, making the painfully picky do_div() macro for some platforms
> happy, I fail to understand the problem with doing a division to get an
> accurate value.
>
> let me reverse the question. Even if the bit shifting is a bit faster
> than doing the division, would the code be called often enough to make
> a significant difference?
It's just my personal preference: when a faster way is acceptable, I
just choose that one. For this one, do_div() should be ok since that
code is not hot in most case. But all depends to your test results.

Regards,
Hao

2022-03-01 09:36:40

by Hao Xu

[permalink] [raw]

Subject: Re: [PATCH v1] io_uring: Add support for napi_busy_poll

On 3/1/22 05:01, Olivier Langlois wrote:
> On Tue, 2022-03-01 at 02:26 +0800, Hao Xu wrote:
>> On 2/25/22 13:32, Olivier Langlois wrote:
>>> On Mon, 2022-02-21 at 13:23 +0800, Hao Xu wrote:
>>>>> @@ -5776,6 +5887,7 @@ static int __io_arm_poll_handler(struct
>>>>> io_kiocb *req,
>>>>>                  __io_poll_execute(req, mask);
>>>>>                  return 0;
>>>>>          }
>>>>> +       io_add_napi(req->file, req->ctx);
>>>> I think this may not be the right place to do it. the process
>>>> will
>>>> be:
>>>> arm_poll sockfdA--> get invalid napi_id from sk->napi_id -->
>>>> event
>>>> triggered --> arm_poll for sockfdA again --> get valid napi_id
>>>> then why not do io_add_napi() in event
>>>> handler(apoll_task_func/poll_task_func).
>>> You have a valid concern that the first time a socket is passed to
>>> io_uring that napi_id might not be assigned yet.
>>>
>>> OTOH, getting it after data is available for reading does not help
>>> neither since busy polling must be done before data is received.
>>>
>>> for both places, the extracted napi_id will only be leveraged at
>>> the
>>> next polling.
>> Hi Olivier,
>>
>> I think we have some gap here. AFAIK, it's not 'might not', it is
>>
>> 'definitely not', the sk->napi_id won't be valid until the poll
>> callback.
>>
>> Some driver's code FYR:
>> (drivers/net/ethernet/intel/e1000/e1000_main.c)
>>
>> e1000_receive_skb-->napi_gro_receive-->napi_skb_finish--
>>> gro_normal_one
>> and in gro_normal_one(), it does:
>>
>>            if (napi->rx_count >= gro_normal_batch)
>>                    gro_normal_list(napi);
>>
>>
>> The gro_normal_list() delivers the info up to the specifical network
>> protocol like tcp.
>>
>> And then sk->napi_id is set, meanwhile the poll callback is
>> triggered.
>>
>> So that's why I call the napi polling technology a 'speculation'.
>> It's
>> totally for the
>>
>> future data. Correct me if I'm wrong especially for the poll callback
>> triggering part.
>>
> When I said 'might not', I was meaning that from the io_uring point of
> view, it has no idea what is the previous socket usage. If it has been
> used outside io_uring, the napi_id could available on the first call.
>
> If it is really read virgin socket, neither my choosen call site or
> your proposed sites will make the napi busy poll possible for the first
> poll.
>
> I feel like there is not much to gain to argue on this point since I
> pretty much admitted that your solution was most likely the only call
> site making MULTIPOLL requests work correctly with napi busy poll as
> those requests could visit __io_arm_poll_handler only once (Correct me
> if my statement is wrong).
>
> The only issue was that I wasn't sure is how using your calling sites
> would make locking work.
>
> I suppose that adding a dedicated spinlock for protecting napi_list
> instead of relying on uring_lock could be a solution. Would that work?
spinlock should be fine.