2020-11-16 08:15:50

by Xuan Zhuo

Subject: [PATCH] xsk: add cq event

When we have written all cq items to tx, we have to wait for a new poll
event to indicate that tx is writable again. But writability is currently
triggered based on whether tx is full or not; in fact, even when tx is not
full, the user may not be able to get any cq item, because it may still be
occupied by the network card. In this case, we need to know when cq becomes
available, so this patch adds a socket option. When the user enables this
option using setsockopt, a readable event is generated for every xsk bound
to this umem whenever cq becomes available.

I couldn't find a better description for this event. I think it can also be
'readable', although it is indeed different from the 'readable' that signals
new data. But the overhead of xsk checking whether cq or rx is readable is
small.
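
For illustration, a minimal user-space sketch of how this option would be
used (a sketch only, assuming this patch is applied; error handling omitted):

```
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

/* Enable the proposed cq event on an already-created AF_XDP socket fd.
 * XDP_CQ_EVENT is the option added by this patch.
 */
static void enable_cq_event(int xsk_fd)
{
	int on = 1;

	setsockopt(xsk_fd, SOL_XDP, XDP_CQ_EVENT, &on, sizeof(on));
}

/* After this, poll()/epoll_wait() will also report EPOLLIN when new
 * completions appear in the umem completion ring, not only when rx
 * has new packets.
 */
```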

Signed-off-by: Xuan Zhuo <[email protected]>
---
include/net/xdp_sock.h | 1 +
include/uapi/linux/if_xdp.h | 1 +
net/xdp/xsk.c | 28 ++++++++++++++++++++++++++++
3 files changed, 30 insertions(+)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 1a9559c..faf5b1a 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -49,6 +49,7 @@ struct xdp_sock {
struct xsk_buff_pool *pool;
u16 queue_id;
bool zc;
+ bool cq_event;
enum {
XSK_READY = 0,
XSK_BOUND,
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index a78a809..2dba3cb 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -63,6 +63,7 @@ struct xdp_mmap_offsets {
#define XDP_UMEM_COMPLETION_RING 6
#define XDP_STATISTICS 7
#define XDP_OPTIONS 8
+#define XDP_CQ_EVENT 9

struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cfbec39..0c53403 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -285,7 +285,16 @@ void __xsk_map_flush(void)

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
+ struct xdp_sock *xs;
+
xskq_prod_submit_n(pool->cq, nb_entries);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+ if (xs->cq_event)
+ sock_def_readable(&xs->sk);
+ }
+ rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_completed);

@@ -495,6 +504,9 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
__xsk_sendmsg(sk);
}

+ if (xs->cq_event && pool->cq && !xskq_prod_is_empty(pool->cq))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
if (xs->rx && !xskq_prod_is_empty(xs->rx))
mask |= EPOLLIN | EPOLLRDNORM;
if (xs->tx && !xskq_cons_is_full(xs->tx))
@@ -882,6 +894,22 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
mutex_unlock(&xs->mutex);
return err;
}
+ case XDP_CQ_EVENT:
+ {
+ int cq_event;
+
+ if (optlen < sizeof(cq_event))
+ return -EINVAL;
+ if (copy_from_sockptr(&cq_event, optval, sizeof(cq_event)))
+ return -EFAULT;
+
+ if (cq_event)
+ xs->cq_event = true;
+ else
+ xs->cq_event = false;
+
+ return 0;
+ }
default:
break;
}
--
1.8.3.1


2020-11-16 09:16:09

by Denis Kirjanov

Subject: Re: [PATCH] xsk: add cq event

On 11/16/20, Xuan Zhuo <[email protected]> wrote:
> When we write all cq items to tx, we have to wait for a new event based
> on poll to indicate that it is writable. But the current writability is
> triggered based on whether tx is full or not, and In fact, when tx is
> dissatisfied, the user of cq's item may not necessarily get it, because it
> may still be occupied by the network card. In this case, we need to know
> when cq is available, so this patch adds a socket option, When the user
> configures this option using setsockopt, when cq is available, a
> readable event is generated for all xsk bound to this umem.
>
> I can't find a better description of this event,
> I think it can also be 'readable', although it is indeed different from
> the 'readable' of the new data. But the overhead of xsk checking whether
> cq or rx is readable is small.
>
> Signed-off-by: Xuan Zhuo <[email protected]>
> ---
> include/net/xdp_sock.h | 1 +
> include/uapi/linux/if_xdp.h | 1 +
> net/xdp/xsk.c | 28 ++++++++++++++++++++++++++++
> 3 files changed, 30 insertions(+)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 1a9559c..faf5b1a 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -49,6 +49,7 @@ struct xdp_sock {
> struct xsk_buff_pool *pool;
> u16 queue_id;
> bool zc;
> + bool cq_event;
> enum {
> XSK_READY = 0,
> XSK_BOUND,
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index a78a809..2dba3cb 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -63,6 +63,7 @@ struct xdp_mmap_offsets {
> #define XDP_UMEM_COMPLETION_RING 6
> #define XDP_STATISTICS 7
> #define XDP_OPTIONS 8
> +#define XDP_CQ_EVENT 9
>
> struct xdp_umem_reg {
> __u64 addr; /* Start of packet data area */
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index cfbec39..0c53403 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -285,7 +285,16 @@ void __xsk_map_flush(void)
>
> void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
> {
> + struct xdp_sock *xs;
> +
> xskq_prod_submit_n(pool->cq, nb_entries);
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
> + if (xs->cq_event)
> + sock_def_readable(&xs->sk);
> + }
> + rcu_read_unlock();
> }
> EXPORT_SYMBOL(xsk_tx_completed);
>
> @@ -495,6 +504,9 @@ static __poll_t xsk_poll(struct file *file, struct
> socket *sock,
> __xsk_sendmsg(sk);
> }
>
> + if (xs->cq_event && pool->cq && !xskq_prod_is_empty(pool->cq))
> + mask |= EPOLLIN | EPOLLRDNORM;
> +
> if (xs->rx && !xskq_prod_is_empty(xs->rx))
> mask |= EPOLLIN | EPOLLRDNORM;
> if (xs->tx && !xskq_cons_is_full(xs->tx))
> @@ -882,6 +894,22 @@ static int xsk_setsockopt(struct socket *sock, int
> level, int optname,
> mutex_unlock(&xs->mutex);
> return err;
> }
> + case XDP_CQ_EVENT:
> + {
> + int cq_event;
> +
> + if (optlen < sizeof(cq_event))
> + return -EINVAL;
> + if (copy_from_sockptr(&cq_event, optval, sizeof(cq_event)))
> + return -EFAULT;
> +
> + if (cq_event)
> + xs->cq_event = true;
> + else
> + xs->cq_event = false;

It's false by default, isn't it?

> +
> + return 0;
> + }
> default:
> break;
> }
> --
> 1.8.3.1
>
>

2020-11-16 10:18:40

by Denis Kirjanov

Subject: Re: [PATCH] xsk: add cq event

On 11/16/20, Xuan Zhuo <[email protected]> wrote:
> On Mon, 16 Nov 2020 12:13:21 +0300, Denis Kirjanov <[email protected]>
> wrote:
>> On 11/16/20, Xuan Zhuo <[email protected]> wrote:
>> > When we write all cq items to tx, we have to wait for a new event based
>> > on poll to indicate that it is writable. But the current writability is
>> > triggered based on whether tx is full or not, and In fact, when tx is
>> > dissatisfied, the user of cq's item may not necessarily get it, because
>> > it
>> > may still be occupied by the network card. In this case, we need to
>> > know
>> > when cq is available, so this patch adds a socket option, When the user
>> > configures this option using setsockopt, when cq is available, a
>> > readable event is generated for all xsk bound to this umem.
>> >
>> > I can't find a better description of this event,
>> > I think it can also be 'readable', although it is indeed different from
>> > the 'readable' of the new data. But the overhead of xsk checking
>> > whether
>> > cq or rx is readable is small.
>> >
>> > Signed-off-by: Xuan Zhuo <[email protected]>
>> > ---
>> > include/net/xdp_sock.h | 1 +
>> > include/uapi/linux/if_xdp.h | 1 +
>> > net/xdp/xsk.c | 28 ++++++++++++++++++++++++++++
>> > 3 files changed, 30 insertions(+)
>> >
>> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
>> > index 1a9559c..faf5b1a 100644
>> > --- a/include/net/xdp_sock.h
>> > +++ b/include/net/xdp_sock.h
>> > @@ -49,6 +49,7 @@ struct xdp_sock {
>> > struct xsk_buff_pool *pool;
>> > u16 queue_id;
>> > bool zc;
>> > + bool cq_event;
>> > enum {
>> > XSK_READY = 0,
>> > XSK_BOUND,
>> > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
>> > index a78a809..2dba3cb 100644
>> > --- a/include/uapi/linux/if_xdp.h
>> > +++ b/include/uapi/linux/if_xdp.h
>> > @@ -63,6 +63,7 @@ struct xdp_mmap_offsets {
>> > #define XDP_UMEM_COMPLETION_RING 6
>> > #define XDP_STATISTICS 7
>> > #define XDP_OPTIONS 8
>> > +#define XDP_CQ_EVENT 9
>> >
>> > struct xdp_umem_reg {
>> > __u64 addr; /* Start of packet data area */
>> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>> > index cfbec39..0c53403 100644
>> > --- a/net/xdp/xsk.c
>> > +++ b/net/xdp/xsk.c
>> > @@ -285,7 +285,16 @@ void __xsk_map_flush(void)
>> >
>> > void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
>> > {
>> > + struct xdp_sock *xs;
>> > +
>> > xskq_prod_submit_n(pool->cq, nb_entries);
>> > +
>> > + rcu_read_lock();
>> > + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
>> > + if (xs->cq_event)
>> > + sock_def_readable(&xs->sk);
>> > + }
>> > + rcu_read_unlock();
>> > }
>> > EXPORT_SYMBOL(xsk_tx_completed);
>> >
>> > @@ -495,6 +504,9 @@ static __poll_t xsk_poll(struct file *file, struct
>> > socket *sock,
>> > __xsk_sendmsg(sk);
>> > }
>> >
>> > + if (xs->cq_event && pool->cq && !xskq_prod_is_empty(pool->cq))
>> > + mask |= EPOLLIN | EPOLLRDNORM;
>> > +
>> > if (xs->rx && !xskq_prod_is_empty(xs->rx))
>> > mask |= EPOLLIN | EPOLLRDNORM;
>> > if (xs->tx && !xskq_cons_is_full(xs->tx))
>> > @@ -882,6 +894,22 @@ static int xsk_setsockopt(struct socket *sock, int
>> > level, int optname,
>> > mutex_unlock(&xs->mutex);
>> > return err;
>> > }
>> > + case XDP_CQ_EVENT:
>> > + {
>> > + int cq_event;
>> > +
>> > + if (optlen < sizeof(cq_event))
>> > + return -EINVAL;
>> > + if (copy_from_sockptr(&cq_event, optval, sizeof(cq_event)))
>> > + return -EFAULT;
>> > +
>> > + if (cq_event)
>> > + xs->cq_event = true;
>> > + else
>> > + xs->cq_event = false;
>>
>> It's false by default, isn't it?
>
> I add cq_event inside "xdp_sock", that is got by sk_alloc, this call
> sk_prot_alloc by __GFP_ZERO. So I think it is false.

Right, I meant: what's the point of setting it explicitly to 'false'?

>
> Thanks.
>
>>
>> > +
>> > + return 0;
>> > + }
>> > default:
>> > break;
>> > }
>> > --
>> > 1.8.3.1
>> >
>> >
>

2020-11-16 10:25:25

by Denis Kirjanov

Subject: Re: [PATCH] xsk: add cq event

On 11/16/20, Denis Kirjanov <[email protected]> wrote:
> On 11/16/20, Xuan Zhuo <[email protected]> wrote:
>> On Mon, 16 Nov 2020 12:13:21 +0300, Denis Kirjanov
>> <[email protected]>
>> wrote:
>>> On 11/16/20, Xuan Zhuo <[email protected]> wrote:
>>> > When we write all cq items to tx, we have to wait for a new event
>>> > based
>>> > on poll to indicate that it is writable. But the current writability
>>> > is
>>> > triggered based on whether tx is full or not, and In fact, when tx is
>>> > dissatisfied, the user of cq's item may not necessarily get it,
>>> > because
>>> > it
>>> > may still be occupied by the network card. In this case, we need to
>>> > know
>>> > when cq is available, so this patch adds a socket option, When the
>>> > user
>>> > configures this option using setsockopt, when cq is available, a
>>> > readable event is generated for all xsk bound to this umem.
>>> >
>>> > I can't find a better description of this event,
>>> > I think it can also be 'readable', although it is indeed different
>>> > from
>>> > the 'readable' of the new data. But the overhead of xsk checking
>>> > whether
>>> > cq or rx is readable is small.
>>> >
>>> > Signed-off-by: Xuan Zhuo <[email protected]>
>>> > ---
>>> > include/net/xdp_sock.h | 1 +
>>> > include/uapi/linux/if_xdp.h | 1 +
>>> > net/xdp/xsk.c | 28 ++++++++++++++++++++++++++++
>>> > 3 files changed, 30 insertions(+)
>>> >
>>> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
>>> > index 1a9559c..faf5b1a 100644
>>> > --- a/include/net/xdp_sock.h
>>> > +++ b/include/net/xdp_sock.h
>>> > @@ -49,6 +49,7 @@ struct xdp_sock {
>>> > struct xsk_buff_pool *pool;
>>> > u16 queue_id;
>>> > bool zc;
>>> > + bool cq_event;
>>> > enum {
>>> > XSK_READY = 0,
>>> > XSK_BOUND,
>>> > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
>>> > index a78a809..2dba3cb 100644
>>> > --- a/include/uapi/linux/if_xdp.h
>>> > +++ b/include/uapi/linux/if_xdp.h
>>> > @@ -63,6 +63,7 @@ struct xdp_mmap_offsets {
>>> > #define XDP_UMEM_COMPLETION_RING 6
>>> > #define XDP_STATISTICS 7
>>> > #define XDP_OPTIONS 8
>>> > +#define XDP_CQ_EVENT 9
>>> >
>>> > struct xdp_umem_reg {
>>> > __u64 addr; /* Start of packet data area */
>>> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>>> > index cfbec39..0c53403 100644
>>> > --- a/net/xdp/xsk.c
>>> > +++ b/net/xdp/xsk.c
>>> > @@ -285,7 +285,16 @@ void __xsk_map_flush(void)
>>> >
>>> > void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
>>> > {
>>> > + struct xdp_sock *xs;
>>> > +
>>> > xskq_prod_submit_n(pool->cq, nb_entries);
>>> > +
>>> > + rcu_read_lock();
>>> > + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
>>> > + if (xs->cq_event)
>>> > + sock_def_readable(&xs->sk);
>>> > + }
>>> > + rcu_read_unlock();
>>> > }
>>> > EXPORT_SYMBOL(xsk_tx_completed);
>>> >
>>> > @@ -495,6 +504,9 @@ static __poll_t xsk_poll(struct file *file, struct
>>> > socket *sock,
>>> > __xsk_sendmsg(sk);
>>> > }
>>> >
>>> > + if (xs->cq_event && pool->cq && !xskq_prod_is_empty(pool->cq))
>>> > + mask |= EPOLLIN | EPOLLRDNORM;
>>> > +
>>> > if (xs->rx && !xskq_prod_is_empty(xs->rx))
>>> > mask |= EPOLLIN | EPOLLRDNORM;
>>> > if (xs->tx && !xskq_cons_is_full(xs->tx))
>>> > @@ -882,6 +894,22 @@ static int xsk_setsockopt(struct socket *sock,
>>> > int
>>> > level, int optname,
>>> > mutex_unlock(&xs->mutex);
>>> > return err;
>>> > }
>>> > + case XDP_CQ_EVENT:
>>> > + {
>>> > + int cq_event;
>>> > +
>>> > + if (optlen < sizeof(cq_event))
>>> > + return -EINVAL;
>>> > + if (copy_from_sockptr(&cq_event, optval, sizeof(cq_event)))
>>> > + return -EFAULT;
>>> > +
>>> > + if (cq_event)
>>> > + xs->cq_event = true;
>>> > + else
>>> > + xs->cq_event = false;
>>>
>>> It's false by default, isn't it?
>>
>> I add cq_event inside "xdp_sock", that is got by sk_alloc, this call
>> sk_prot_alloc by __GFP_ZERO. So I think it is false.
>
> Right, I meant that what's the point to set it explicitly to 'false'?

Never mind, it's okay.

>
>>
>> Thanks.
>>
>>>
>>> > +
>>> > + return 0;
>>> > + }
>>> > default:
>>> > break;
>>> > }
>>> > --
>>> > 1.8.3.1
>>> >
>>> >
>>
>

2020-11-16 14:34:20

by Björn Töpel

Subject: Re: [PATCH] xsk: add cq event

On 2020-11-16 09:10, Xuan Zhuo wrote:
> When we write all cq items to tx, we have to wait for a new event based
> on poll to indicate that it is writable. But the current writability is
> triggered based on whether tx is full or not, and In fact, when tx is
> dissatisfied, the user of cq's item may not necessarily get it, because it
> may still be occupied by the network card. In this case, we need to know
> when cq is available, so this patch adds a socket option, When the user
> configures this option using setsockopt, when cq is available, a
> readable event is generated for all xsk bound to this umem.
>
> I can't find a better description of this event,
> I think it can also be 'readable', although it is indeed different from
> the 'readable' of the new data. But the overhead of xsk checking whether
> cq or rx is readable is small.
>
> Signed-off-by: Xuan Zhuo <[email protected]>

Thanks for the patch!

I'm not a fan of having two different "readable" events (both Rx and cq).
Could you explain a bit what the use case is, so I get a better
understanding.

The Tx queue has a back-pressure mechanism, determined by the number of
elements in cq. Is it related to that?
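
(The back-pressure being referred to: in the copy-mode Tx path, a completion
ring slot is reserved before each descriptor is sent, and transmission stops
when that reservation fails. Roughly, paraphrased rather than quoted exactly
from net/xdp/xsk.c:)

```
		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it.
		 */
		if (xskq_prod_reserve(xs->pool->cq)) {
			kfree_skb(skb);
			goto out;
		}
```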

Please explain a bit more what you're trying to solve, and maybe we can
figure out a better way forward!


Thanks!
Björn

2020-11-17 10:03:59

by Björn Töpel

Subject: Re: [PATCH] xsk: add cq event

On 2020-11-16 17:12, Xuan Zhuo wrote:
> On Mon, 16 Nov 2020 15:31:20 +0100, Björn Töpel <[email protected]> wrote:
>> On 2020-11-16 09:10, Xuan Zhuo wrote:
>>> When we write all cq items to tx, we have to wait for a new event based
>>> on poll to indicate that it is writable. But the current writability is
>>> triggered based on whether tx is full or not, and In fact, when tx is
>>> dissatisfied, the user of cq's item may not necessarily get it, because it
>>> may still be occupied by the network card. In this case, we need to know
>>> when cq is available, so this patch adds a socket option, When the user
>>> configures this option using setsockopt, when cq is available, a
>>> readable event is generated for all xsk bound to this umem.
>>>
>>> I can't find a better description of this event,
>>> I think it can also be 'readable', although it is indeed different from
>>> the 'readable' of the new data. But the overhead of xsk checking whether
>>> cq or rx is readable is small.
>>>
>>> Signed-off-by: Xuan Zhuo <[email protected]>
>>
>> Thanks for the patch!
>>
>> I'm not a fan of having two different "readable" event (both Rx and cq).
>> Could you explain a bit what the use case is, so I get a better
>> understanding.
>>
>> The Tx queues has a back-pressure mechanism, determined of the number of
>> elements in cq. Is it related to that?
>>
>> Please explain a bit more what you're trying to solve, and maybe we can
>> figure out a better way forward!
>>
>> Thanks!
>> Björn
>
> I want to implement a tool for mass sending. For example, the size of cq is
> 1024, and I set the size of tx also to 1024, so that I will put all cq in tx at
> once, and then I have to wait for an event, come Indicates that there is new
> write space or new cq is available.
>
> At present, we can only monitor the event of write able. This indicates whether
> tx is full, but in fact, tx is basically not full, because the full state is
> very short, and those tx items are being used by the network card. And
> epoll_wait will be awakened directly every time, without waiting, but I cannot
> get the cq item, so I still cannot successfully send the package again.
>
> Of course, I don't like the "readable" event very much. This is a suitable
> one I found in the existing epoll event. ^_^

More questions! By "Mass sending" do you mean maximum throughput, or
does that mean "in very large batches"?

For the latter to do 1k batches, you could increase the Tx/cq buffer
size to say 4k.

For maximum throughput it's better to use smaller batches (e.g. what the
txpush scenario in samples/xdpsock does).

You're right that even if there's space in the Tx ring, it won't be sent
unless there's sufficient space in the cq ring. Maybe it would make
sense to be more restrictive when triggering the "writable" socket
event? E.g. only trigger it when there's space in Tx *and* sufficient cq
space?
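
If that direction is taken, one possible shape of the check in xsk_poll()
might be something like this (just a sketch, not from any posted patch; it
reuses the helpers already visible in the diffs above):

```
	/* Only report writable when there is room in the Tx ring *and*
	 * the completion ring still has free slots, so that user space
	 * can actually reclaim buffers after the send.
	 */
	if (xs->tx && !xskq_cons_is_full(xs->tx) &&
	    pool->cq && !xskq_prod_is_full(pool->cq))
		mask |= EPOLLOUT | EPOLLWRNORM;
```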


Björn

>
> Thanks.
>

2020-11-18 08:26:52

by Xuan Zhuo

Subject: [PATCH 0/3] xsk: fix for xsk_poll writeable

I tried to combine cq available and tx writeable, but I found it very difficult.
Sometimes we care about the "available" status of both, but sometimes we may
only care about one, such as tx writeable, because we can use an item from fq
to write to tx. This kind of requirement may keep changing, and it may be
necessary to set it every time before entering xsk_poll, so setsockopt is not
very convenient. I feel even more that using a new event may be a better
solution, such as EPOLLPRI; I think it can be used here, after all, xsk should
not have OOB data ^_^.

However, two other problems were discovered during the test:

* The mask returned by datagram_poll always contains EPOLLOUT
* It is not particularly reasonable to return EPOLLOUT based on tx not full

After fixing these two problems, I found that when the process is woken up by
EPOLLOUT, it can always get items from cq.

Because the number of packets that the network card can send at a time is
actually limited, suppose this value is "nic_num". Once the number of
consumed items in the tx queue is greater than nic_num, there must also be
newly recycled items in the cq queue from the nic.

In this way, as long as the tx configured by the user is large enough, we won't
hit the situation where tx is already in the writeable state but no item can be
obtained from cq.
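
To make this argument concrete, a worked example with purely illustrative
numbers (not from any measurement):

```
tx ring  : 1024 entries
nic_num  :   64 descriptors held in flight by the nic, at most (assumed)

With patch #2, EPOLLOUT is only reported once at least half of the tx ring
(512 entries) is free, i.e. at least 512 descriptors have been consumed by
the kernel since the ring was last filled. At most nic_num (64) of them can
still be held by the nic, so at least 512 - 64 = 448 completions must already
be sitting in cq when the process wakes up.
```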

Xuan Zhuo (3):
xsk: replace datagram_poll by sock_poll_wait
xsk: change the tx writeable condition
xsk: set tx/rx the min entries

include/uapi/linux/if_xdp.h | 2 ++
net/xdp/xsk.c | 26 ++++++++++++++++++++++----
net/xdp/xsk_queue.h | 6 ++++++
3 files changed, 30 insertions(+), 4 deletions(-)

--
1.8.3.1

2020-11-18 08:26:55

by Xuan Zhuo

Subject: [PATCH 1/3] xsk: replace datagram_poll by sock_poll_wait

datagram_poll determines the current socket status (EPOLLIN, EPOLLOUT)
based on traditional socket state (e.g. sk_wmem_alloc), but this does not
apply to xsk. So this patch uses sock_poll_wait instead of datagram_poll,
and the mask is calculated by xsk_poll itself.

Signed-off-by: Xuan Zhuo <[email protected]>
---
net/xdp/xsk.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cfbec39..7f0353e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -477,11 +477,13 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
static __poll_t xsk_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
- __poll_t mask = datagram_poll(file, sock, wait);
+ __poll_t mask = 0;
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
struct xsk_buff_pool *pool;

+ sock_poll_wait(file, sock, wait);
+
if (unlikely(!xsk_is_bound(xs)))
return mask;

--
1.8.3.1

2020-11-18 08:28:12

by Xuan Zhuo

Subject: [PATCH 3/3] xsk: set tx/rx the min entries

We expect the number of tx entries to be greater than twice the number of
packets that the network card can send at a time, so that when the number of
entries remaining in the tx queue is less than half of the queue, it can be
guaranteed that there are recycled items in the cq that can be used.

At the same time, rx will not drop packets because it cannot hold all of the
packets delivered by the network card at one time.

Of course, 1024 here is only an estimate; the number of packets sent at a
time may differ between network cards.

Signed-off-by: Xuan Zhuo <[email protected]>
---
include/uapi/linux/if_xdp.h | 2 ++
net/xdp/xsk.c | 2 ++
2 files changed, 4 insertions(+)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index a78a809..d55ba79 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -64,6 +64,8 @@ struct xdp_mmap_offsets {
#define XDP_STATISTICS 7
#define XDP_OPTIONS 8

+#define XDP_RXTX_RING_MIN_ENTRIES 1024
+
struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
__u64 len; /* Length of packet data area */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index bc3d4ece..e62c795 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -831,6 +831,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EINVAL;
if (copy_from_sockptr(&entries, optval, sizeof(entries)))
return -EFAULT;
+ if (entries < XDP_RXTX_RING_MIN_ENTRIES)
+ return -EINVAL;

mutex_lock(&xs->mutex);
if (xs->state != XSK_READY) {
--
1.8.3.1

2020-11-18 08:28:55

by Xuan Zhuo

Subject: [PATCH 2/3] xsk: change the tx writeable condition

Modify the tx writeable condition from "the tx queue is not full" to "the
number of entries pending in the tx queue is no more than half of the total
number of entries". Because "tx queue not full" holds only for a very short
time, the old condition causes a large number of EPOLLOUT events, and hence a
large number of process wakeups.

Signed-off-by: Xuan Zhuo <[email protected]>
---
net/xdp/xsk.c | 20 +++++++++++++++++---
net/xdp/xsk_queue.h | 6 ++++++
2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7f0353e..bc3d4ece 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -211,6 +211,17 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
return 0;
}

+static bool xsk_writeable(struct xdp_sock *xs)
+{
+ if (!xs->tx)
+ return false;
+
+ if (xskq_cons_left(xs->tx) > xs->tx->nentries / 2)
+ return false;
+
+ return true;
+}
+
static bool xsk_is_bound(struct xdp_sock *xs)
{
if (READ_ONCE(xs->state) == XSK_BOUND) {
@@ -296,7 +307,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool)
rcu_read_lock();
list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
__xskq_cons_release(xs->tx);
- xs->sk.sk_write_space(&xs->sk);
+ if (xsk_writeable(xs))
+ xs->sk.sk_write_space(&xs->sk);
}
rcu_read_unlock();
}
@@ -442,7 +454,8 @@ static int xsk_generic_xmit(struct sock *sk)

out:
if (sent_frame)
- sk->sk_write_space(sk);
+ if (xsk_writeable(xs))
+ sk->sk_write_space(sk);

mutex_unlock(&xs->mutex);
return err;
@@ -499,7 +512,8 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,

if (xs->rx && !xskq_prod_is_empty(xs->rx))
mask |= EPOLLIN | EPOLLRDNORM;
- if (xs->tx && !xskq_cons_is_full(xs->tx))
+
+ if (xsk_writeable(xs))
mask |= EPOLLOUT | EPOLLWRNORM;

return mask;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index cdb9cf3..82a5228 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -264,6 +264,12 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)
q->nentries;
}

+static inline __u64 xskq_cons_left(struct xsk_queue *q)
+{
+ /* No barriers needed since data is not accessed */
+ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
+}
+
/* Functions for producers */

static inline bool xskq_prod_is_full(struct xsk_queue *q)
--
1.8.3.1

2020-11-23 14:02:52

by Magnus Karlsson

Subject: Re: [PATCH 0/3] xsk: fix for xsk_poll writeable

On Wed, Nov 18, 2020 at 9:25 AM Xuan Zhuo <[email protected]> wrote:
>
> I tried to combine cq available and tx writeable, but I found it very difficult.
> Sometimes we pay attention to the status of "available" for both, but sometimes,
> we may only pay attention to one, such as tx writeable, because we can use the
> item of fq to write to tx. And this kind of demand may be constantly changing,
> and it may be necessary to set it every time before entering xsk_poll, so
> setsockopt is not very convenient. I feel even more that using a new event may
> be a better solution, such as EPOLLPRI, I think it can be used here, after all,
> xsk should not have OOB data ^_^.
>
> However, two other problems were discovered during the test:
>
> * The mask returned by datagram_poll always contains EPOLLOUT
> * It is not particularly reasonable to return EPOLLOUT based on tx not full
>
> After fixing these two problems, I found that when the process is awakened by
> EPOLLOUT, the process can always get the item from cq.
>
> Because the number of packets that the network card can send at a time is
> actually limited, suppose this value is "nic_num". Once the number of
> consumed items in the tx queue is greater than nic_num, this means that there
> must also be new recycled items in the cq queue from nic.
>
> In this way, as long as the tx configured by the user is larger, we won't have
> the situation that tx is already in the writeable state but cannot get the item
> from cq.

I think the overall approach of tying this into poll() instead of
setsockopt() is the right way to go. But we need a more robust
solution. Your patch #3 also breaks backwards compatibility and that
is not allowed. Could you please post some simple code example of what
it is you would like to do in user space? So you would like to wake up
when there are entries in the cq that can be retrieved and the reason
you would like to do this is that you then know you can put some more
entries into the Tx ring and they will get sent as there now are free
slots in the cq. Correct me if wrong. Would an event that wakes you up
when there is both space in the Tx ring and space in the cq work? Is
there a case in which we would like to be woken up when only the Tx
ring is non-full? Maybe there is, as it might be beneficial to fill
the Tx ring and, while doing that, some entries in the cq have been completed
and away the packets go. But it would be great if you could post some
simple example code, does not need to compile or anything. Can be
pseudo code.

It would also be good to know if your goal is max throughput, max
burst size, or something else.

Thanks: Magnus


> Xuan Zhuo (3):
> xsk: replace datagram_poll by sock_poll_wait
> xsk: change the tx writeable condition
> xsk: set tx/rx the min entries
>
> include/uapi/linux/if_xdp.h | 2 ++
> net/xdp/xsk.c | 26 ++++++++++++++++++++++----
> net/xdp/xsk_queue.h | 6 ++++++
> 3 files changed, 30 insertions(+), 4 deletions(-)
>
> --
> 1.8.3.1
>

2020-11-23 14:13:50

by Magnus Karlsson

Subject: Re: [PATCH 1/3] xsk: replace datagram_poll by sock_poll_wait

On Wed, Nov 18, 2020 at 9:26 AM Xuan Zhuo <[email protected]> wrote:
>
> datagram_poll will judge the current socket status (EPOLLIN, EPOLLOUT)
> based on the traditional socket information (eg: sk_wmem_alloc), but
> this does not apply to xsk. So this patch uses sock_poll_wait instead of
> datagram_poll, and the mask is calculated by xsk_poll.
>
> Signed-off-by: Xuan Zhuo <[email protected]>
> ---
> net/xdp/xsk.c | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index cfbec39..7f0353e 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -477,11 +477,13 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> static __poll_t xsk_poll(struct file *file, struct socket *sock,
> struct poll_table_struct *wait)
> {
> - __poll_t mask = datagram_poll(file, sock, wait);
> + __poll_t mask = 0;

It would indeed be nice to not execute a number of tests in
datagram_poll that will never be triggered. It will speed up things
for sure. But we need to make sure that removing those flags that
datagram_poll sets does not have any bad effects in the code above this.
But let us tentatively keep this patch for the next version of the
patch set. Just need to figure out how to solve your problem in a nice
way first. See discussion in patch 0/3.

> struct sock *sk = sock->sk;
> struct xdp_sock *xs = xdp_sk(sk);
> struct xsk_buff_pool *pool;
>
> + sock_poll_wait(file, sock, wait);
> +
> if (unlikely(!xsk_is_bound(xs)))
> return mask;
>
> --
> 1.8.3.1
>

2020-11-24 09:33:25

by Magnus Karlsson

Subject: Re: [PATCH 2/3] xsk: change the tx writeable condition

On Wed, Nov 18, 2020 at 9:25 AM Xuan Zhuo <[email protected]> wrote:
>
> Modify the tx writeable condition from the queue is not full to the
> number of remaining tx queues is less than the half of the total number
> of queues. Because the tx queue not full is a very short time, this will
> cause a large number of EPOLLOUT events, and cause a large number of
> process wake up.
>
> Signed-off-by: Xuan Zhuo <[email protected]>
> ---
> net/xdp/xsk.c | 20 +++++++++++++++++---
> net/xdp/xsk_queue.h | 6 ++++++
> 2 files changed, 23 insertions(+), 3 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7f0353e..bc3d4ece 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -211,6 +211,17 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
> return 0;
> }
>
> +static bool xsk_writeable(struct xdp_sock *xs)

Not clear what this function does from the name. How about
xsk_tx_half_free() or maybe xsk_tx_writeable()?

> +{
> + if (!xs->tx)
> + return false;

Skip this test as it will slow down the code. It is only needed in one
place below.

> + if (xskq_cons_left(xs->tx) > xs->tx->nentries / 2)
> + return false;
> +
> + return true;
> +}
> +
> static bool xsk_is_bound(struct xdp_sock *xs)
> {
> if (READ_ONCE(xs->state) == XSK_BOUND) {
> @@ -296,7 +307,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool)
> rcu_read_lock();
> list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
> __xskq_cons_release(xs->tx);
> - xs->sk.sk_write_space(&xs->sk);
> + if (xsk_writeable(xs))
> + xs->sk.sk_write_space(&xs->sk);
> }
> rcu_read_unlock();
> }
> @@ -442,7 +454,8 @@ static int xsk_generic_xmit(struct sock *sk)
>
> out:
> if (sent_frame)
> - sk->sk_write_space(sk);
> + if (xsk_writeable(xs))
> + sk->sk_write_space(sk);
>
> mutex_unlock(&xs->mutex);
> return err;
> @@ -499,7 +512,8 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
>
> if (xs->rx && !xskq_prod_is_empty(xs->rx))
> mask |= EPOLLIN | EPOLLRDNORM;
> - if (xs->tx && !xskq_cons_is_full(xs->tx))
> +

No reason to introduce a newline here.

> + if (xsk_writeable(xs))

Add an explicit "xs->tx &&" in the if statement here as we removed the
test in xsk_writeable.

> mask |= EPOLLOUT | EPOLLWRNORM;
>
> return mask;
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index cdb9cf3..82a5228 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -264,6 +264,12 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)
> q->nentries;
> }
>
> +static inline __u64 xskq_cons_left(struct xsk_queue *q)

Let us call this xskq_cons_entries_present() or
xskq_cons_filled_entries(). The word "left" has the connotation that I
still have stuff left to do. While this is kind of true for this case,
it might not be for other cases that can use your function. The
function provides how many (filled) entries are present in the ring. Can
ring. Can you come up with a better name as I am not super fond of my
suggestions? It would have been nice to call it xskq_cons_nb_entries()
but there is already such a function that is lazy in nature and that
allows access to the entries.

> +{
> + /* No barriers needed since data is not accessed */
> + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
> +}
> +
> /* Functions for producers */
>
> static inline bool xskq_prod_is_full(struct xsk_queue *q)
> --
> 1.8.3.1
>

2020-11-24 11:40:42

by Magnus Karlsson

Subject: Re: [PATCH 1/3] xsk: replace datagram_poll by sock_poll_wait

On Mon, Nov 23, 2020 at 3:11 PM Magnus Karlsson
<[email protected]> wrote:
>
> On Wed, Nov 18, 2020 at 9:26 AM Xuan Zhuo <[email protected]> wrote:
> >
> > datagram_poll will judge the current socket status (EPOLLIN, EPOLLOUT)
> > based on the traditional socket information (eg: sk_wmem_alloc), but
> > this does not apply to xsk. So this patch uses sock_poll_wait instead of
> > datagram_poll, and the mask is calculated by xsk_poll.
> >
> > Signed-off-by: Xuan Zhuo <[email protected]>
> > ---
> > net/xdp/xsk.c | 4 +++-
> > 1 file changed, 3 insertions(+), 1 deletion(-)
> >
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index cfbec39..7f0353e 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -477,11 +477,13 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> > static __poll_t xsk_poll(struct file *file, struct socket *sock,
> > struct poll_table_struct *wait)
> > {
> > - __poll_t mask = datagram_poll(file, sock, wait);
> > + __poll_t mask = 0;
>
> It would indeed be nice to not execute a number of tests in
> datagram_poll that will never be triggered. It will speed up things
> for sure. But we need to make sure that removing those flags that
> datagram_poll sets do not have any bad effects in the code above this.
> But let us tentatively keep this patch for the next version of the
> patch set. Just need to figure out how to solve your problem in a nice
> way first. See discussion in patch 0/3.
>
> > struct sock *sk = sock->sk;
> > struct xdp_sock *xs = xdp_sk(sk);
> > struct xsk_buff_pool *pool;
> >
> > + sock_poll_wait(file, sock, wait);
> > +
> > if (unlikely(!xsk_is_bound(xs)))
> > return mask;
> >
> > --
> > 1.8.3.1
> >

The fix looks correct and it will speed things up too as a bonus.
Please include this patch in the v2 as outlined in my answer to 0/3.

Thanks!

2020-11-25 01:58:51

by Magnus Karlsson

Subject: Re: [PATCH 0/3] xsk: fix for xsk_poll writeable

On Mon, Nov 23, 2020 at 4:21 PM Xuan Zhuo <[email protected]> wrote:
>
> On Mon, 23 Nov 2020 15:00:48 +0100, Magnus Karlsson <[email protected]> wrote:
> > On Wed, Nov 18, 2020 at 9:25 AM Xuan Zhuo <[email protected]> wrote:
> > >
> > > I tried to combine cq available and tx writeable, but I found it very difficult.
> > > Sometimes we pay attention to the status of "available" for both, but sometimes,
> > > we may only pay attention to one, such as tx writeable, because we can use the
> > > item of fq to write to tx. And this kind of demand may be constantly changing,
> > > and it may be necessary to set it every time before entering xsk_poll, so
> > > setsockopt is not very convenient. I feel even more that using a new event may
> > > be a better solution, such as EPOLLPRI, I think it can be used here, after all,
> > > xsk should not have OOB data ^_^.
> > >
> > > However, two other problems were discovered during the test:
> > >
> > > * The mask returned by datagram_poll always contains EPOLLOUT
> > > * It is not particularly reasonable to return EPOLLOUT based on tx not full
> > >
> > > After fixing these two problems, I found that when the process is awakened by
> > > EPOLLOUT, the process can always get the item from cq.
> > >
> > > Because the number of packets that the network card can send at a time is
> > > actually limited, suppose this value is "nic_num". Once the number of
> > > consumed items in the tx queue is greater than nic_num, this means that there
> > > must also be new recycled items in the cq queue from nic.
> > >
> > > In this way, as long as the tx configured by the user is larger, we won't have
> > > the situation that tx is already in the writeable state but cannot get the item
> > > from cq.
> >
> > I think the overall approach of tying this into poll() instead of
> > setsockopt() is the right way to go. But we need a more robust
> > solution. Your patch #3 also breaks backwards compatibility and that
> > is not allowed. Could you please post some simple code example of what
> > it is you would like to do in user space? So you would like to wake up
> > when there are entries in the cq that can be retrieved and the reason
> > you would like to do this is that you then know you can put some more
> > entries into the Tx ring and they will get sent as there now are free
> > slots in the cq. Correct me if wrong. Would an event that wakes you up
> > when there is both space in the Tx ring and space in the cq work? Is
> > there a case in which we would like to be woken up when only the Tx
> > ring is non-full? Maybe there are as it might be beneficial to fill
> > the Tx and while doing that some entries in the cq has been completed
> > and away the packets go. But it would be great if you could post some
> > simple example code, does not need to compile or anything. Can be
> > pseudo code.
> >
> > It would also be good to know if your goal is max throughput, max
> > burst size, or something else.
> >
> > Thanks: Magnus
> >
>
> My goal is max pps, If possible, increase the size of buf appropriately to
> improve throughput. like pktgen.
>
> The code like this: (tx and umem cq also is 1024, and that works with zero
> copy.)
>
> ```
> void send_handler(xsk)
> {
>     char buf[22];
>
>     while (true) {
>         while (true) {
>             if (send_buf_to_tx_ring(xsk, buf, sizeof(buf)))
>                 break; // break when there is no cq item or tx is full
>         }
>
>         if (sendto(xsk->fd))
>             break;
>     }
> }
>
>
> static int loop(int efd, xsk)
> {
>     struct epoll_event e[1024];
>     struct epoll_event ee;
>     int n, i;
>
>     ee.events = EPOLLOUT;
>     ee.data.ptr = NULL;
>
>     epoll_ctl(efd, EPOLL_CTL_ADD, xsk->fd, &ee);
>
>     while (1) {
>         n = epoll_wait(efd, e, sizeof(e)/sizeof(e[0]), -1);
>
>         if (n == 0)
>             continue;
>
>         if (n < 0) {
>             continue;
>         }
>
>         for (i = 0; i < n; ++i) {
>             send_handler(xsk);
>         }
>     }
> }
> ```
>
> 1. Now, since datagram_poll(that determine whether it is write able based on
> sock_writeable function) will return EPOLLOUT every time, epoll_wait will
> always return directly(this results in cpu 100%).

We should keep patch #1. Just need to make sure we do not break
anything as I am not familiar with the path after xsk_poll returns.

> 2. After removing datagram_poll, since tx full is a very short moment, so every
> time tx is not full is always true, epoll_wait will still return directly
> 3. After epoll_wait returns, app will try to get cq and writes it to tx again,
> but this time basically it will fail when getting cq. My analysis is that
> cq item has not returned from the network card at this time.
>
>
> Under normal circumstances, the judgment preparation for this event that can be
> written is not whether the queue or buffer is full. The judgment criterion of
> tcp is whether the free space is more than half.
> This is the origin of my #2 patch, and I found that after adding this patch, my
> above problems no longer appear.
> 1. epoll_wait no longer exits directly
> 2. Every time you receive EPOLLOUT, you can always get cq

Got it. Makes sense. And good that there is some precedent that you
are not supposed to wake up when there is one free slot. Instead you
should wake up when a lot of them are free so you can insert a batch.
So let us also keep patch #2, though I might have some comments on it,
but I will reply to that patch in that case.

But patch #3 needs to go. How about you instead make the Tx ring
double the size of the completion ring? Let us assume patch #1 and #2
are in place. You will get woken up when at least half the entries in
the Tx ring are available. At this point fill the Tx ring completely
and after that start cleaning the completion ring. Hopefully by this
time, there will be a number of entries in there that can be cleaned
up. Then you call sendto(). It might even be a good idea to do cq, Tx,
cq in that order.
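
A rough sketch of that ordering (pseudocode in the same spirit as the example
earlier in the thread; clean_cq() and fill_tx() are placeholder helpers, not
real APIs):

```
	/* woken up by EPOLLOUT: at least half of the Tx ring is free */
	clean_cq(xsk);      /* 1. reclaim whatever has already completed     */
	fill_tx(xsk);       /* 2. fill the Tx ring as far as possible        */
	clean_cq(xsk);      /* 3. reclaim completions that arrived meanwhile */
	sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);  /* kick the kernel  */
```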

I consider #1 and #2 bug fixes so please base them on the bpf tree and
note this in your mail header like this: "[PATCH bpf 0/3] xsk: fix for
xsk_poll writeable".

>
> In addition:
> What is the goal of TX_BATCH_SIZE and why this "restriction" should be added,
> which causes a lot of trouble in programming without using zero copy

You are right, this is likely too low. I never thought of this as
something that would be used as a "fast-path". It was only a generic
fallback. But it need not be. Please produce a patch #3 that sets
this to a higher value. We do need the limit though. How about 512?

If you are interested in improving the performance of the Tx SKB path,
then there might be other avenues to try if you are interested. Here
are some examples:

* Batch dev_direct_xmit. Maybe skb lists can be used.
* Do not unlock and lock for every single packet in dev_direct_xmit().
Can be combined with the above.
* Use fragments instead of copying packets into the skb itself
* Can the bool 'more' in netdev_start_xmit be used to increase performance?

>
> Thanks.
>
> >
> > > Xuan Zhuo (3):
> > > xsk: replace datagram_poll by sock_poll_wait
> > > xsk: change the tx writeable condition
> > > xsk: set tx/rx the min entries
> > >
> > > include/uapi/linux/if_xdp.h | 2 ++
> > > net/xdp/xsk.c | 26 ++++++++++++++++++++++----
> > > net/xdp/xsk_queue.h | 6 ++++++
> > > 3 files changed, 30 insertions(+), 4 deletions(-)
> > >
> > > --
> > > 1.8.3.1
> > >

2020-11-25 02:01:44

by Magnus Karlsson

Subject: Re: [PATCH 0/3] xsk: fix for xsk_poll writeable

On Tue, Nov 24, 2020 at 10:01 AM Magnus Karlsson
<[email protected]> wrote:
>
> On Mon, Nov 23, 2020 at 4:21 PM Xuan Zhuo <[email protected]> wrote:
> >
> > On Mon, 23 Nov 2020 15:00:48 +0100, Magnus Karlsson <[email protected]> wrote:
> > > On Wed, Nov 18, 2020 at 9:25 AM Xuan Zhuo <[email protected]> wrote:
> > > >
> > > > I tried to combine cq available and tx writeable, but I found it very difficult.
> > > > Sometimes we pay attention to the status of "available" for both, but sometimes,
> > > > we may only pay attention to one, such as tx writeable, because we can use the
> > > > item of fq to write to tx. And this kind of demand may be constantly changing,
> > > > and it may be necessary to set it every time before entering xsk_poll, so
> > > > setsockopt is not very convenient. I feel even more that using a new event may
> > > > be a better solution, such as EPOLLPRI, I think it can be used here, after all,
> > > > xsk should not have OOB data ^_^.
> > > >
> > > > However, two other problems were discovered during the test:
> > > >
> > > > * The mask returned by datagram_poll always contains EPOLLOUT
> > > > * It is not particularly reasonable to return EPOLLOUT based on tx not full
> > > >
> > > > After fixing these two problems, I found that when the process is awakened by
> > > > EPOLLOUT, the process can always get the item from cq.
> > > >
> > > > Because the number of packets that the network card can send at a time is
> > > > actually limited, suppose this value is "nic_num". Once the number of
> > > > consumed items in the tx queue is greater than nic_num, this means that there
> > > > must also be new recycled items in the cq queue from nic.
> > > >
> > > > In this way, as long as the tx configured by the user is larger, we won't have
> > > > the situation that tx is already in the writeable state but cannot get the item
> > > > from cq.
> > >
> > > I think the overall approach of tying this into poll() instead of
> > > setsockopt() is the right way to go. But we need a more robust
> > > solution. Your patch #3 also breaks backwards compatibility and that
> > > is not allowed. Could you please post some simple code example of what
> > > it is you would like to do in user space? So you would like to wake up
> > > when there are entries in the cq that can be retrieved and the reason
> > > you would like to do this is that you then know you can put some more
> > > entries into the Tx ring and they will get sent as there now are free
> > > slots in the cq. Correct me if wrong. Would an event that wakes you up
> > > when there is both space in the Tx ring and space in the cq work? Is
> > > there a case in which we would like to be woken up when only the Tx
> > > ring is non-full? Maybe there are as it might be beneficial to fill
> > > the Tx and while doing that some entries in the cq has been completed
> > > and away the packets go. But it would be great if you could post some
> > > simple example code, does not need to compile or anything. Can be
> > > pseudo code.
> > >
> > > It would also be good to know if your goal is max throughput, max
> > > burst size, or something else.
> > >
> > > Thanks: Magnus
> > >
> >
> > My goal is max pps, If possible, increase the size of buf appropriately to
> > improve throughput. like pktgen.
> >
> > The code like this: (tx and umem cq also is 1024, and that works with zero
> > copy.)
> >
> > ```
> > void send_handler(xsk)
> > {
> > char buf[22];
> >
> > while (true) {
> > while (true){
> > if (send_buf_to_tx_ring(xsk, buf, sizeof(buf)))
> > break; // break this when no cq or tx is full
> > }
> >
> > if (sendto(xsk->fd))
> > break;
> > }
> > }
> > }
> >
> >
> > static int loop(int efd, xsk)
> > {
> > struct epoll_event e[1024];
> > struct epoll_event ee;
> > int n, i;
> >
> > ee.events = EPOLLOUT;
> > ee.data.ptr = NULL;
> >
> > epoll_ctl(efd, EPOLL_CTL_ADD, xsk->fd, &e);
> >
> > while (1) {
> > n = epoll_wait(efd, e, sizeof(e)/sizeof(e[0]), -1);
> >
> > if (n == 0)
> > continue;
> >
> > if (n < 0) {
> > continue;
> > }
> >
> > for (i = 0; i < n; ++i) {
> > send_handler(xsk);
> > }
> > }
> > }
> > ```
> >
> > 1. Now, since datagram_poll(that determine whether it is write able based on
> > sock_writeable function) will return EPOLLOUT every time, epoll_wait will
> > always return directly(this results in cpu 100%).
>
> We should keep patch #1. Just need to make sure we do not break
> anything as I am not familiar with the path after xsk_poll returns.
>
> > 2. After removing datagram_poll, since tx full is a very short moment, so every
> > time tx is not full is always true, epoll_wait will still return directly
> > 3. After epoll_wait returns, app will try to get cq and writes it to tx again,
> > but this time basically it will fail when getting cq. My analysis is that
> > cq item has not returned from the network card at this time.
> >
> >
> > Under normal circumstances, the judgment preparation for this event that can be
> > written is not whether the queue or buffer is full. The judgment criterion of
> > tcp is whether the free space is more than half.
> > This is the origin of my #2 patch, and I found that after adding this patch, my
> > above problems no longer appear.
> > 1. epoll_wait no longer exits directly
> > 2. Every time you receive EPOLLOUT, you can always get cq
>
> Got it. Make sense. And good that there is some precedence that you
> are not supposed to wake up when there is one free slot. Instead you
> should wake up when a lot of them are free so you can insert a batch.
> So let us also keep patch #2, though I might have some comments on it,
> but I will reply to that patch in that case.
>
> But patch #3 needs to go. How about you instead make the Tx ring
> double the size of the completion ring? Let us assume patch #1 and #2
> are in place. You will get woken up when at least half the entries in
> the Tx ring are available. At this point fill the Tx ring completely
> and after that start cleaning the completion ring. Hopefully by this
> time, there will be a number of entries in there that can be cleaned
> up. Then you call sendto(). It might even be a good idea to do cq, Tx,
> cq in that order.
>
> I consider #1 and #2 bug fixes so please base them on the bpf tree and
> note this in your mail header like this: "[PATCH bpf 0/3] xsk: fix for
> xsk_poll writeable".
>
> >
> > In addition:
> > What is the goal of TX_BATCH_SIZE and why this "restriction" should be added,
> > which causes a lot of trouble in programming without using zero copy
>
> You are right, this is likely too low. I never thought of this as
> something that would be used as a "fast-path". It was only a generic
> fall back. But it need not be. Please produce a patch #3 that sets
> this to a higher value. We do need the limit though. How about 512?

Please produce one patch set with #1 and #2 based on the bpf tree and
keep the TX_BATCH_SIZE patch separate. It is only a performance
optimization and should be based on bpf-next and sent as a sole patch
on its own.

Thanks!

> If you are interested in improving the performance of the Tx SKB path,
> then there might be other avenues to try if you are interested. Here
> are some examples:
>
> * Batch dev_direct_xmit. Maybe skb lists can be used.
> * Do not unlock and lock for every single packet in dev_direct_xmit().
> Can be combined with the above.
> * Use fragments instead of copying packets into the skb itself
> * Can the bool more in netdev_start_xmit be used to increase performance
>
> >
> > Thanks.
> >
> > >
> > > > Xuan Zhuo (3):
> > > > xsk: replace datagram_poll by sock_poll_wait
> > > > xsk: change the tx writeable condition
> > > > xsk: set tx/rx the min entries
> > > >
> > > > include/uapi/linux/if_xdp.h | 2 ++
> > > > net/xdp/xsk.c | 26 ++++++++++++++++++++++----
> > > > net/xdp/xsk_queue.h | 6 ++++++
> > > > 3 files changed, 30 insertions(+), 4 deletions(-)
> > > >
> > > > --
> > > > 1.8.3.1
> > > >

2020-11-25 06:51:18

by Xuan Zhuo

Subject: [PATCH bpf v2 0/2] xsk: fix for xsk_poll writeable

Thanks very much to Magnus.

V2:
Patch #2 was changed following Magnus' suggestions.

Xuan Zhuo (2):
xsk: replace datagram_poll by sock_poll_wait
xsk: change the tx writeable condition

net/xdp/xsk.c | 20 ++++++++++++++++----
net/xdp/xsk_queue.h | 6 ++++++
2 files changed, 22 insertions(+), 4 deletions(-)

--
1.8.3.1

2020-11-25 06:52:28

by Xuan Zhuo

Subject: [PATCH bpf v2 2/2] xsk: change the tx writeable condition

Modify the tx writeable condition from "the tx queue is not full" to "the
number of entries present in the tx queue is no more than half of the total
number of entries". Because "tx queue not full" holds only for a very short
time, the old condition causes a large number of EPOLLOUT events, and hence a
large number of process wakeups.

Signed-off-by: Xuan Zhuo <[email protected]>
---
net/xdp/xsk.c | 16 +++++++++++++---
net/xdp/xsk_queue.h | 6 ++++++
2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 0df8651..22e35e9 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -211,6 +211,14 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
return 0;
}

+static bool xsk_tx_writeable(struct xdp_sock *xs)
+{
+ if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
+ return false;
+
+ return true;
+}
+
static bool xsk_is_bound(struct xdp_sock *xs)
{
if (READ_ONCE(xs->state) == XSK_BOUND) {
@@ -296,7 +304,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool)
rcu_read_lock();
list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
__xskq_cons_release(xs->tx);
- xs->sk.sk_write_space(&xs->sk);
+ if (xsk_tx_writeable(xs))
+ xs->sk.sk_write_space(&xs->sk);
}
rcu_read_unlock();
}
@@ -499,7 +508,8 @@ static int xsk_generic_xmit(struct sock *sk)

out:
if (sent_frame)
- sk->sk_write_space(sk);
+ if (xsk_tx_writeable(xs))
+ sk->sk_write_space(sk);

mutex_unlock(&xs->mutex);
return err;
@@ -556,7 +566,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,

if (xs->rx && !xskq_prod_is_empty(xs->rx))
mask |= EPOLLIN | EPOLLRDNORM;
- if (xs->tx && !xskq_cons_is_full(xs->tx))
+ if (xs->tx && xsk_tx_writeable(xs))
mask |= EPOLLOUT | EPOLLWRNORM;

return mask;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index b936c46..b655004 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -307,6 +307,12 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)
q->nentries;
}

+static inline __u64 xskq_cons_present_entries(struct xsk_queue *q)
+{
+ /* No barriers needed since data is not accessed */
+ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
+}
+
/* Functions for producers */

static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
--
1.8.3.1

2020-11-25 06:53:00

by Xuan Zhuo

Subject: [PATCH bpf v2 1/2] xsk: replace datagram_poll by sock_poll_wait

datagram_poll determines the current socket status (EPOLLIN, EPOLLOUT)
based on traditional socket state (e.g. sk_wmem_alloc), but this does not
apply to xsk. So this patch uses sock_poll_wait instead of datagram_poll,
and the mask is calculated by xsk_poll itself.

Signed-off-by: Xuan Zhuo <[email protected]>
---
net/xdp/xsk.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b014197..0df8651 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -534,11 +534,13 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
static __poll_t xsk_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
- __poll_t mask = datagram_poll(file, sock, wait);
+ __poll_t mask = 0;
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
struct xsk_buff_pool *pool;

+ sock_poll_wait(file, sock, wait);
+
if (unlikely(!xsk_is_bound(xs)))
return mask;

--
1.8.3.1

2020-11-27 08:53:35

by Magnus Karlsson

Subject: Re: [PATCH bpf v2 1/2] xsk: replace datagram_poll by sock_poll_wait

On Wed, Nov 25, 2020 at 7:49 AM Xuan Zhuo <[email protected]> wrote:
>
> datagram_poll will judge the current socket status (EPOLLIN, EPOLLOUT)
> based on the traditional socket information (eg: sk_wmem_alloc), but
> this does not apply to xsk. So this patch uses sock_poll_wait instead of
> datagram_poll, and the mask is calculated by xsk_poll.
>
> Signed-off-by: Xuan Zhuo <[email protected]>
> ---
> net/xdp/xsk.c | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)

Acked-by: Magnus Karlsson <[email protected]>

> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index b014197..0df8651 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -534,11 +534,13 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> static __poll_t xsk_poll(struct file *file, struct socket *sock,
> struct poll_table_struct *wait)
> {
> - __poll_t mask = datagram_poll(file, sock, wait);
> + __poll_t mask = 0;
> struct sock *sk = sock->sk;
> struct xdp_sock *xs = xdp_sk(sk);
> struct xsk_buff_pool *pool;
>
> + sock_poll_wait(file, sock, wait);
> +
> if (unlikely(!xsk_is_bound(xs)))
> return mask;
>
> --
> 1.8.3.1
>

2020-11-27 08:54:13

by Magnus Karlsson

Subject: Re: [PATCH bpf v2 2/2] xsk: change the tx writeable condition

On Wed, Nov 25, 2020 at 7:49 AM Xuan Zhuo <[email protected]> wrote:
>
> Modify the tx writeable condition from the queue is not full to the
> number of present tx queues is less than the half of the total number
> of queues. Because the tx queue not full is a very short time, this will
> cause a large number of EPOLLOUT events, and cause a large number of
> process wake up.
>
> Signed-off-by: Xuan Zhuo <[email protected]>

Thank you Xuan!

Acked-by: Magnus Karlsson <[email protected]>

> ---
> net/xdp/xsk.c | 16 +++++++++++++---
> net/xdp/xsk_queue.h | 6 ++++++
> 2 files changed, 19 insertions(+), 3 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 0df8651..22e35e9 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -211,6 +211,14 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
> return 0;
> }
>
> +static bool xsk_tx_writeable(struct xdp_sock *xs)
> +{
> + if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
> + return false;
> +
> + return true;
> +}
> +
> static bool xsk_is_bound(struct xdp_sock *xs)
> {
> if (READ_ONCE(xs->state) == XSK_BOUND) {
> @@ -296,7 +304,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool)
> rcu_read_lock();
> list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
> __xskq_cons_release(xs->tx);
> - xs->sk.sk_write_space(&xs->sk);
> + if (xsk_tx_writeable(xs))
> + xs->sk.sk_write_space(&xs->sk);
> }
> rcu_read_unlock();
> }
> @@ -499,7 +508,8 @@ static int xsk_generic_xmit(struct sock *sk)
>
> out:
> if (sent_frame)
> - sk->sk_write_space(sk);
> + if (xsk_tx_writeable(xs))
> + sk->sk_write_space(sk);
>
> mutex_unlock(&xs->mutex);
> return err;
> @@ -556,7 +566,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
>
> if (xs->rx && !xskq_prod_is_empty(xs->rx))
> mask |= EPOLLIN | EPOLLRDNORM;
> - if (xs->tx && !xskq_cons_is_full(xs->tx))
> + if (xs->tx && xsk_tx_writeable(xs))
> mask |= EPOLLOUT | EPOLLWRNORM;
>
> return mask;
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index b936c46..b655004 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -307,6 +307,12 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)
> q->nentries;
> }
>
> +static inline __u64 xskq_cons_present_entries(struct xsk_queue *q)
> +{
> + /* No barriers needed since data is not accessed */
> + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
> +}
> +
> /* Functions for producers */
>
> static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
> --
> 1.8.3.1
>

2020-11-27 21:33:45

by Daniel Borkmann

Subject: Re: [PATCH bpf v2 2/2] xsk: change the tx writeable condition

On 11/25/20 7:48 AM, Xuan Zhuo wrote:
> Modify the tx writeable condition from the queue is not full to the
> number of present tx queues is less than the half of the total number
> of queues. Because the tx queue not full is a very short time, this will
> cause a large number of EPOLLOUT events, and cause a large number of
> process wake up.
>
> Signed-off-by: Xuan Zhuo <[email protected]>

This one doesn't apply cleanly against bpf tree, please rebase. Small comment
inline while looking over the patch:

> ---
> net/xdp/xsk.c | 16 +++++++++++++---
> net/xdp/xsk_queue.h | 6 ++++++
> 2 files changed, 19 insertions(+), 3 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 0df8651..22e35e9 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -211,6 +211,14 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
> return 0;
> }
>
> +static bool xsk_tx_writeable(struct xdp_sock *xs)
> +{
> + if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
> + return false;
> +
> + return true;
> +}
> +
> static bool xsk_is_bound(struct xdp_sock *xs)
> {
> if (READ_ONCE(xs->state) == XSK_BOUND) {
> @@ -296,7 +304,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool)
> rcu_read_lock();
> list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
> __xskq_cons_release(xs->tx);
> - xs->sk.sk_write_space(&xs->sk);
> + if (xsk_tx_writeable(xs))
> + xs->sk.sk_write_space(&xs->sk);
> }
> rcu_read_unlock();
> }
> @@ -499,7 +508,8 @@ static int xsk_generic_xmit(struct sock *sk)
>
> out:
> if (sent_frame)
> - sk->sk_write_space(sk);
> + if (xsk_tx_writeable(xs))
> + sk->sk_write_space(sk);
>
> mutex_unlock(&xs->mutex);
> return err;
> @@ -556,7 +566,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
>
> if (xs->rx && !xskq_prod_is_empty(xs->rx))
> mask |= EPOLLIN | EPOLLRDNORM;
> - if (xs->tx && !xskq_cons_is_full(xs->tx))
> + if (xs->tx && xsk_tx_writeable(xs))
> mask |= EPOLLOUT | EPOLLWRNORM;
>
> return mask;
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index b936c46..b655004 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -307,6 +307,12 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)
> q->nentries;
> }
>
> +static inline __u64 xskq_cons_present_entries(struct xsk_queue *q)

Types prefixed with __ are mainly for user-space facing things like uapi headers,
so in-kernel this should be u64. Is there a reason this is not done as u32 (and
thus the same type as producer and consumer)?

> +{
> + /* No barriers needed since data is not accessed */
> + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
> +}
> +
> /* Functions for producers */
>
> static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
>
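
For reference, the helper as it might look with that suggestion applied (a
sketch only, not a posted revision):

```
static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
{
	/* No barriers needed since data is not accessed */
	return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
}
```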