2021-09-27 20:35:19

by Johannes Lundberg

[permalink] [raw]
Subject: [PATCH] fs: eventpoll: add empty event

The EPOLLEMPTY event will trigger when the TCP write buffer becomes
empty, i.e., when all outgoing data have been ACKed.

The need for this functionality comes from a business requirement
of measuring with higher precision how much time is spent
transmitting data to a client. For reference, similar functionality
was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.

Signed-off-by: Johannes Lundberg <[email protected]>
---
include/net/sock.h | 11 +++++++++++
include/uapi/linux/eventpoll.h | 1 +
net/core/sock.c | 5 +++++
net/core/stream.c | 14 ++++++++++++++
net/ipv4/tcp.c | 5 +++++
5 files changed, 36 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index c005c3c750e8..9047a9e225a9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -516,6 +516,7 @@ struct sock {
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk);
void (*sk_write_space)(struct sock *sk);
+ void (*sk_empty)(struct sock *sk);
void (*sk_error_report)(struct sock *sk);
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
@@ -965,6 +966,7 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val)
WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}

+void sk_stream_empty(struct sock *sk);
void sk_stream_write_space(struct sock *sk);

/* OOB backlog add */
@@ -1288,6 +1290,11 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)

INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));

+static inline bool sk_stream_is_empty(const struct sock *sk)
+{
+ return (sk->sk_wmem_queued == 0);
+}
+
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1559,6 +1566,10 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
sk_wmem_queued_add(sk, -skb->truesize);
+
+ if (sk_stream_is_empty(sk))
+ sk->sk_empty(sk);
+
sk_mem_uncharge(sk, skb->truesize);
if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
!sk->sk_tx_skb_cache && !skb_cloned(skb)) {
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..aab9f1f624d0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -39,6 +39,7 @@
#define EPOLLWRNORM (__force __poll_t)0x00000100
#define EPOLLWRBAND (__force __poll_t)0x00000200
#define EPOLLMSG (__force __poll_t)0x00000400
+#define EPOLLEMPTY (__force __poll_t)0x00000800
#define EPOLLRDHUP (__force __poll_t)0x00002000

/* Set exclusive wakeup mode for the target file descriptor */
diff --git a/net/core/sock.c b/net/core/sock.c
index 512e629f9780..f917791d8149 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3062,6 +3062,10 @@ static void sock_def_write_space(struct sock *sk)
rcu_read_unlock();
}

+static void sock_def_empty(struct sock *sk)
+{
+}
+
static void sock_def_destruct(struct sock *sk)
{
}
@@ -3136,6 +3140,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
+ sk->sk_empty = sock_def_empty;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;

diff --git a/net/core/stream.c b/net/core/stream.c
index 4f1d4aa5fb38..c7e4135542a2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -21,6 +21,20 @@
#include <linux/wait.h>
#include <net/sock.h>

+void sk_stream_empty(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct socket_wq *wq;
+
+ if (sk_stream_is_empty(sk) && sock) {
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, EPOLLEMPTY);
+ rcu_read_unlock();
+ }
+}
+
/**
* sk_stream_write_space - stream socket write_space callback.
* @sk: socket
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..550bae79af06 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -453,6 +453,8 @@ void tcp_init_sock(struct sock *sk)
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;

+ sk->sk_empty = sk_stream_empty;
+
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

@@ -561,6 +563,9 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
tp->urg_data)
target++;

+ if (sk_stream_is_empty(sk))
+ mask |= EPOLLEMPTY;
+
if (tcp_stream_is_readable(sk, target))
mask |= EPOLLIN | EPOLLRDNORM;

--
2.17.1


2021-09-27 20:51:49

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] fs: eventpoll: add empty event

On Mon, Sep 27, 2021 at 1:30 PM Johannes Lundberg <[email protected]> wrote:
>
> The EPOLLEMPTY event will trigger when the TCP write buffer becomes
> empty, i.e., when all outgoing data have been ACKed.
>
> The need for this functionality comes from a business requirement
> of measuring with higher precision how much time is spent
> transmitting data to a client. For reference, similar functionality
> was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.


Adding yet another indirect call [1] in TCP fast path, for something
(measuring with higher precision..)
which is already implemented differently in TCP stack [2] is not desirable.

Our timestamping infrastructure should be ported to FreeBSD instead :)

[1] CONFIG_RETPOLINE=y

[2] Refs :
commit e1c8a607b28190cd09a271508aa3025d3c2f312e
net-timestamp: ACK timestamp for bytestreams
tools/testing/selftests/net/txtimestamp.c

2021-09-27 21:19:26

by Johannes Lundberg

[permalink] [raw]
Subject: Re: [PATCH] fs: eventpoll: add empty event


On 9/27/21 1:47 PM, Eric Dumazet wrote:
> On Mon, Sep 27, 2021 at 1:30 PM Johannes Lundberg <[email protected]> wrote:
>> The EPOLLEMPTY event will trigger when the TCP write buffer becomes
>> empty, i.e., when all outgoing data have been ACKed.
>>
>> The need for this functionality comes from a business requirement
>> of measuring with higher precision how much time is spent
>> transmitting data to a client. For reference, similar functionality
>> was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.
>
> Adding yet another indirect call [1] in TCP fast path, for something
> (measuring with higher precision..)
> which is already implemented differently in TCP stack [2] is not desirable.
>
> Our timestamping infrastructure should be ported to FreeBSD instead :)
>
> [1] CONFIG_RETPOLINE=y
>
> [2] Refs :
> commit e1c8a607b28190cd09a271508aa3025d3c2f312e
> net-timestamp: ACK timestamp for bytestreams
> tools/testing/selftests/net/txtimestamp.c

Hi Eric

Thanks for the feedback! If there's a way to achieve the same thing with
current Linux I'm all for it. I'll look into how to use timestamps for this.

2021-09-27 21:38:02

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] fs: eventpoll: add empty event



On 9/27/21 2:17 PM, Johannes Lundberg wrote:
>
> On 9/27/21 1:47 PM, Eric Dumazet wrote:
>> On Mon, Sep 27, 2021 at 1:30 PM Johannes Lundberg <[email protected]> wrote:
>>> The EPOLLEMPTY event will trigger when the TCP write buffer becomes
>>> empty, i.e., when all outgoing data have been ACKed.
>>>
>>> The need for this functionality comes from a business requirement
>>> of measuring with higher precision how much time is spent
>>> transmitting data to a client. For reference, similar functionality
>>> was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.
>>
>> Adding yet another indirect call [1] in TCP fast path, for something
>> (measuring with higher precision..)
>> which is already implemented differently in TCP stack [2] is not desirable.
>>
>> Our timestamping infrastructure should be ported to FreeBSD instead :)
>>
>> [1] CONFIG_RETPOLINE=y
>>
>> [2] Refs :
>>     commit e1c8a607b28190cd09a271508aa3025d3c2f312e
>>        net-timestamp: ACK timestamp for bytestreams
>>      tools/testing/selftests/net/txtimestamp.c
>
> Hi Eric
>
> Thanks for the feedback! If there's a way to achieve the same thing with current Linux I'm all for it. I'll look into how to use timestamps for this.
>

You are welcome !

Note that timestamping allows to trigger many events, even if write queue is not empty.

This is particularly useful when an application does not want a write queue to be drained,
since this would add transmit stalls.

Also, since the events are time stamped exactly when the relevant ACK are processed,
they are more accurate than something based on epoll, since I guess you would
get timestamps after a thread wakeup.