We've got some issues when sending a compressed ack is deferred to the
release phase because the socket is owned by another user:
1. a compressed ack would not be sent because the ICSK_ACK_TIMER flag
is not set.
2. the tp->compressed_ack counter should be decremented by 1.
3. we cannot pass the timeout check and reset the delack timer in
tcp_delack_timer_handler().
4. we are not supposed to increment the LINUX_MIB_DELAYEDACKS counter.
...
The reason why this can happen is that we previously reused the delayed
ack logic when handling sack compression. With this patch applied, the
sack compression logic goes through the same function
(tcp_compack_timer_handler()) whether we defer sending the ack or not.
Therefore, the issues above can be easily solved.
Here are more details on the old logic:
When sack compression is triggered in tcp_compressed_ack_kick(), if the
sock is owned by the user, it sets TCP_DELACK_TIMER_DEFERRED and defers
the work to the release callback phase. Later, once the user releases
the sock, tcp_delack_timer_handler() should send an ack as expected,
which, however, cannot happen because the ICSK_ACK_TIMER flag is not
set. Therefore, the receiver would not send an ack until the sender's
retransmission timeout fires, which adds unnecessary latency.
This issue happens rarely in our production environment. I used kprobes
to hook some key functions like tcp_compressed_ack_kick, tcp_release_cb
and tcp_delack_timer_handler, and found that when tcp_delack_timer_handler
was called, the value of icsk_ack.pending was 1, which means only the
ICSK_ACK_SCHED flag was set, not ICSK_ACK_TIMER. That was against our
expectations.
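For reference, icsk_ack.pending is a bitmask built from
enum inet_csk_ack_state_t in include/net/inet_connection_sock.h; the
values below are from a recent tree and worth double-checking:

	ICSK_ACK_SCHED	 = 1,
	ICSK_ACK_TIMER	 = 2,
	ICSK_ACK_PUSHED	 = 4,
	ICSK_ACK_PUSHED2 = 8,
	ICSK_ACK_NOW	 = 16,	/* Send the next ACK immediately (once) */

so a pending value of 1 is exactly ICSK_ACK_SCHED alone.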
In conclusion, we chose to separate the sack compression logic from the
delayed ack logic to solve issues that only happen when the processing
is deferred.
Fixes: 5d9f4262b7ea ("tcp: add SACK compression")
Signed-off-by: fuyuanli <[email protected]>
Signed-off-by: Jason Xing <[email protected]>
---
include/linux/tcp.h | 2 ++
include/net/tcp.h | 1 +
net/ipv4/tcp_output.c | 4 ++++
net/ipv4/tcp_timer.c | 28 +++++++++++++++++++---------
4 files changed, 26 insertions(+), 9 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b4c08ac86983..cd15a9972c48 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -461,6 +461,7 @@ enum tsq_enum {
TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
* tcp_v{4|6}_mtu_reduced()
*/
+ TCP_COMPACK_TIMER_DEFERRED, /* tcp_compressed_ack_kick() found socket was owned */
};
enum tsq_flags {
@@ -470,6 +471,7 @@ enum tsq_flags {
TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
+	TCPF_COMPACK_TIMER_DEFERRED = (1UL << TCP_COMPACK_TIMER_DEFERRED),
};
#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 18a038d16434..e310d7bf400c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -342,6 +342,7 @@ void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk);
+void tcp_compack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cfe128b81a01..1703caab6632 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1110,6 +1110,10 @@ void tcp_release_cb(struct sock *sk)
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
+ if (flags & TCPF_COMPACK_TIMER_DEFERRED) {
+ tcp_compack_timer_handler(sk);
+ __sock_put(sk);
+ }
if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b839c2f91292..069f6442069b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -318,6 +318,23 @@ void tcp_delack_timer_handler(struct sock *sk)
}
}
+/* Called with BH disabled */
+void tcp_compack_timer_handler(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ return;
+
+ if (tp->compressed_ack) {
+ /* Since we have to send one ack finally,
+ * subtract one from tp->compressed_ack to keep
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
+ */
+ tp->compressed_ack--;
+ tcp_send_ack(sk);
+ }
+}
/**
* tcp_delack_timer() - The TCP delayed ACK timeout handler
@@ -757,16 +774,9 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
- if (tp->compressed_ack) {
- /* Since we have to send one ack finally,
- * subtract one from tp->compressed_ack to keep
- * LINUX_MIB_TCPACKCOMPRESSED accurate.
- */
- tp->compressed_ack--;
- tcp_send_ack(sk);
- }
+ tcp_compack_timer_handler(sk);
} else {
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
+ if (!test_and_set_bit(TCP_COMPACK_TIMER_DEFERRED,
&sk->sk_tsq_flags))
sock_hold(sk);
}
--
2.17.1
On Mon, May 29, 2023 at 7:38 PM fuyuanli <[email protected]> wrote:
>
> We've got some issues when sending a compressed ack is deferred to
> release phrase due to the socket owned by another user:
> 1. a compressed ack would not be sent because of lack of ICSK_ACK_TIMER
> flag.
> 2. the tp->compressed_ack counter should be decremented by 1.
> 3. we cannot pass timeout check and reset the delack timer in
> tcp_delack_timer_handler().
> 4. we are not supposed to increment the LINUX_MIB_DELAYEDACKS counter.
> ...
>
> The reason why it could happen is that we previously reuse the delayed
> ack logic when handling the sack compression. With this patch applied,
> the sack compression logic would go into the same function
> (tcp_compack_timer_handler()) whether we defer sending ack or not.
> Therefore, those two issued could be easily solved.
>
> Here are more details in the old logic:
> When sack compression is triggered in the tcp_compressed_ack_kick(),
> if the sock is owned by user, it will set TCP_DELACK_TIMER_DEFERRED and
> then defer to the release cb phrase. Later once user releases the sock,
> tcp_delack_timer_handler() should send a ack as expected, which, however,
> cannot happen due to lack of ICSK_ACK_TIMER flag. Therefore, the receiver
> would not sent an ack until the sender's retransmission timeout. It
> definitely increases unnecessary latency.
>
> This issue happens rarely in the production environment. I used kprobe
> to hook some key functions like tcp_compressed_ack_kick, tcp_release_cb,
> tcp_delack_timer_handler and then found that when tcp_delack_timer_handler
> was called, value of icsk_ack.pending was 1, which means we only had
> flag ICSK_ACK_SCHED set, not including ICSK_ACK_TIMER. It was against
> our expectations.
>
> In conclusion, we chose to separate the sack compression from delayed
> ack logic to solve issues only happening when the process is deferred.
>
> Fixes: 5d9f4262b7ea ("tcp: add SACK compression")
> Signed-off-by: fuyuanli <[email protected]>
> Signed-off-by: Jason Xing <[email protected]>
> ---
> include/linux/tcp.h | 2 ++
> include/net/tcp.h | 1 +
> net/ipv4/tcp_output.c | 4 ++++
> net/ipv4/tcp_timer.c | 28 +++++++++++++++++++---------
> 4 files changed, 26 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index b4c08ac86983..cd15a9972c48 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -461,6 +461,7 @@ enum tsq_enum {
> TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
> * tcp_v{4|6}_mtu_reduced()
> */
> + TCP_COMPACK_TIMER_DEFERRED, /* tcp_compressed_ack_kick() found socket was owned */
> };
>
> enum tsq_flags {
> @@ -470,6 +471,7 @@ enum tsq_flags {
> TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
> TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
> TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
> + TCPF_COMPACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
> };
>
> #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 18a038d16434..e310d7bf400c 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -342,6 +342,7 @@ void tcp_release_cb(struct sock *sk);
> void tcp_wfree(struct sk_buff *skb);
> void tcp_write_timer_handler(struct sock *sk);
> void tcp_delack_timer_handler(struct sock *sk);
> +void tcp_compack_timer_handler(struct sock *sk);
> int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
> int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
> void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index cfe128b81a01..1703caab6632 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1110,6 +1110,10 @@ void tcp_release_cb(struct sock *sk)
> tcp_delack_timer_handler(sk);
> __sock_put(sk);
> }
> + if (flags & TCPF_COMPACK_TIMER_DEFERRED) {
> + tcp_compack_timer_handler(sk);
> + __sock_put(sk);
> + }
> if (flags & TCPF_MTU_REDUCED_DEFERRED) {
> inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
> __sock_put(sk);
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index b839c2f91292..069f6442069b 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -318,6 +318,23 @@ void tcp_delack_timer_handler(struct sock *sk)
> }
> }
>
> +/* Called with BH disabled */
> +void tcp_compack_timer_handler(struct sock *sk)
> +{
> + struct tcp_sock *tp = tcp_sk(sk);
> +
> + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
> + return;
> +
> + if (tp->compressed_ack) {
> + /* Since we have to send one ack finally,
> + * subtract one from tp->compressed_ack to keep
> + * LINUX_MIB_TCPACKCOMPRESSED accurate.
> + */
> + tp->compressed_ack--;
> + tcp_send_ack(sk);
> + }
> +}
>
> /**
> * tcp_delack_timer() - The TCP delayed ACK timeout handler
> @@ -757,16 +774,9 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
>
> bh_lock_sock(sk);
> if (!sock_owned_by_user(sk)) {
> - if (tp->compressed_ack) {
> - /* Since we have to send one ack finally,
> - * subtract one from tp->compressed_ack to keep
> - * LINUX_MIB_TCPACKCOMPRESSED accurate.
> - */
> - tp->compressed_ack--;
> - tcp_send_ack(sk);
> - }
> + tcp_compack_timer_handler(sk);
> } else {
> - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
> + if (!test_and_set_bit(TCP_COMPACK_TIMER_DEFERRED,
> &sk->sk_tsq_flags))
> sock_hold(sk);
> }
> --
> 2.17.1
>
I checked patchwork and found that it warned me about not CCing two
other maintainers: ycheng and toke. What made me curious is that I
cannot find them in the MAINTAINERS file, so I'm CCing them here and
waiting for more suggestions on this patch before fuyuanli submits a
v2 with all the maintainers CC'ed.
CC: [email protected] and [email protected]
Thanks,
Jason
On Mon, May 29, 2023 at 1:38 PM fuyuanli <[email protected]> wrote:
>
> We've got some issues when sending a compressed ack is deferred to
> release phrase due to the socket owned by another user:
> 1. a compressed ack would not be sent because of lack of ICSK_ACK_TIMER
> flag.
Are you sure? Just add it then, and your patch will be a one-liner
instead of a complex one adding more code to a fast path.
> 2. the tp->compressed_ack counter should be decremented by 1.
> 3. we cannot pass timeout check and reset the delack timer in
> tcp_delack_timer_handler().
> 4. we are not supposed to increment the LINUX_MIB_DELAYEDACKS counter.
> ...
>
> The reason why it could happen is that we previously reuse the delayed
> ack logic when handling the sack compression. With this patch applied,
> the sack compression logic would go into the same function
> (tcp_compack_timer_handler()) whether we defer sending ack or not.
> Therefore, those two issued could be easily solved.
>
> Here are more details in the old logic:
> When sack compression is triggered in the tcp_compressed_ack_kick(),
> if the sock is owned by user, it will set TCP_DELACK_TIMER_DEFERRED and
> then defer to the release cb phrase. Later once user releases the sock,
> tcp_delack_timer_handler() should send a ack as expected, which, however,
> cannot happen due to lack of ICSK_ACK_TIMER flag. Therefore, the receiver
> would not sent an ack until the sender's retransmission timeout. It
> definitely increases unnecessary latency.
>
> This issue happens rarely in the production environment. I used kprobe
> to hook some key functions like tcp_compressed_ack_kick, tcp_release_cb,
> tcp_delack_timer_handler and then found that when tcp_delack_timer_handler
> was called, value of icsk_ack.pending was 1, which means we only had
> flag ICSK_ACK_SCHED set, not including ICSK_ACK_TIMER. It was against
> our expectations.
>
> In conclusion, we chose to separate the sack compression from delayed
> ack logic to solve issues only happening when the process is deferred.
>
> Fixes: 5d9f4262b7ea ("tcp: add SACK compression")
> Signed-off-by: fuyuanli <[email protected]>
> Signed-off-by: Jason Xing <[email protected]>
> ---
> include/linux/tcp.h | 2 ++
> include/net/tcp.h | 1 +
> net/ipv4/tcp_output.c | 4 ++++
> net/ipv4/tcp_timer.c | 28 +++++++++++++++++++---------
> 4 files changed, 26 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index b4c08ac86983..cd15a9972c48 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -461,6 +461,7 @@ enum tsq_enum {
> TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
> * tcp_v{4|6}_mtu_reduced()
> */
> + TCP_COMPACK_TIMER_DEFERRED, /* tcp_compressed_ack_kick() found socket was owned */
> };
>
> enum tsq_flags {
> @@ -470,6 +471,7 @@ enum tsq_flags {
> TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
> TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
> TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
> + TCPF_COMPACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
> };
>
> #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 18a038d16434..e310d7bf400c 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -342,6 +342,7 @@ void tcp_release_cb(struct sock *sk);
> void tcp_wfree(struct sk_buff *skb);
> void tcp_write_timer_handler(struct sock *sk);
> void tcp_delack_timer_handler(struct sock *sk);
> +void tcp_compack_timer_handler(struct sock *sk);
> int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
> int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
> void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index cfe128b81a01..1703caab6632 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1110,6 +1110,10 @@ void tcp_release_cb(struct sock *sk)
> tcp_delack_timer_handler(sk);
> __sock_put(sk);
> }
> + if (flags & TCPF_COMPACK_TIMER_DEFERRED) {
> + tcp_compack_timer_handler(sk);
> + __sock_put(sk);
> + }
Please do not add another test in the fast path.
Just make sure tcp_delack_timer_handler() handles the case (this
certainly was my intent).
> if (flags & TCPF_MTU_REDUCED_DEFERRED) {
> inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
> __sock_put(sk);
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index b839c2f91292..069f6442069b 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -318,6 +318,23 @@ void tcp_delack_timer_handler(struct sock *sk)
> }
> }
>
> +/* Called with BH disabled */
> +void tcp_compack_timer_handler(struct sock *sk)
> +{
> + struct tcp_sock *tp = tcp_sk(sk);
> +
> + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
> + return;
> +
> + if (tp->compressed_ack) {
> + /* Since we have to send one ack finally,
> + * subtract one from tp->compressed_ack to keep
> + * LINUX_MIB_TCPACKCOMPRESSED accurate.
> + */
> + tp->compressed_ack--;
> + tcp_send_ack(sk);
> + }
> +}
>
> /**
> * tcp_delack_timer() - The TCP delayed ACK timeout handler
> @@ -757,16 +774,9 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
>
> bh_lock_sock(sk);
> if (!sock_owned_by_user(sk)) {
> - if (tp->compressed_ack) {
> - /* Since we have to send one ack finally,
> - * subtract one from tp->compressed_ack to keep
> - * LINUX_MIB_TCPACKCOMPRESSED accurate.
> - */
> - tp->compressed_ack--;
> - tcp_send_ack(sk);
> - }
> + tcp_compack_timer_handler(sk);
> } else {
> - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
See, I was clearly intending to let tcp_delack_timer_handler() deal with this.
> + if (!test_and_set_bit(TCP_COMPACK_TIMER_DEFERRED,
> &sk->sk_tsq_flags))
> sock_hold(sk);
> }
> --
> 2.17.1
>
On Tue, May 30, 2023 at 3:45 PM Jason Xing <[email protected]> wrote:
>
> On Tue, May 30, 2023 at 3:12 PM Eric Dumazet <[email protected]> wrote:
> >
> > On Mon, May 29, 2023 at 1:38 PM fuyuanli <[email protected]> wrote:
> > >
> > > We've got some issues when sending a compressed ack is deferred to
> > > release phrase due to the socket owned by another user:
> > > 1. a compressed ack would not be sent because of lack of ICSK_ACK_TIMER
> > > flag.
> >
> > Are you sure ? Just add it then, your patch will be a one-liner
> > instead of a complex one adding
> > more code in a fast path.
>
> Honestly, at the very beginning, we just added one line[1] to fix
> this. After I digged more into this part, I started to doubt if we
> should reuse the delayed ack logic.
>
> Because in the sack compression logic there is no need to do more
> things as delayed ack does in the tcp_delack_timer_handler() function.
>
> Besides, here are some things extra to be done if we defer to send an
> ack in sack compression:
> 1) decrease tp->compressed_ack. The same as "tp->compressed_ack--;" in
> tcp_compressed_ack_kick().
> 2) initialize icsk->icsk_ack.timeout. Actually we don't need to do
> this because we don't modify the expiration time in the sack
> compression hrtimer.
> 3) don't need to count the LINUX_MIB_DELAYEDACKS counter.
> 4) I wonder even if those checks about the ack schedule or ping pong
> mode in tcp_delack_timer_handler() for sack compression? I'm not sure
> about it.
>
> So one line cannot solve it perfectly. That's the reason why we
> introduce a new logic which can be clearer.
>
> I'm wondering if adding one check in the fast path is really that
> unacceptable (it may hurt performance?) because a new logic would be
> clearer for the whole sack compression.
>
> [1]
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index cc072d2cfcd8..d9e76d761cc6 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -5568,6 +5568,7 @@ static void __tcp_ack_snd_check(struct sock *sk,
> int ofo_possible)
>
> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
> rtt * (NSEC_PER_USEC >> 3)/20);
> sock_hold(sk);
> + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_TIMER;
Oh, one more thing I forgot to say: adding this flag here would also
require us to handle when and where to clear it, without messing up
the delayed ack logic.
Thanks,
Jason
> hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
>
> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
> HRTIMER_MODE_REL_PINNED_SOFT);
>
> >
> > > 2. the tp->compressed_ack counter should be decremented by 1.
> > > 3. we cannot pass timeout check and reset the delack timer in
> > > tcp_delack_timer_handler().
> > > 4. we are not supposed to increment the LINUX_MIB_DELAYEDACKS counter.
> > > ...
> > >
> > > The reason why it could happen is that we previously reuse the delayed
> > > ack logic when handling the sack compression. With this patch applied,
> > > the sack compression logic would go into the same function
> > > (tcp_compack_timer_handler()) whether we defer sending ack or not.
> > > Therefore, those two issued could be easily solved.
> > >
> > > Here are more details in the old logic:
> > > When sack compression is triggered in the tcp_compressed_ack_kick(),
> > > if the sock is owned by user, it will set TCP_DELACK_TIMER_DEFERRED and
> > > then defer to the release cb phrase. Later once user releases the sock,
> > > tcp_delack_timer_handler() should send a ack as expected, which, however,
> > > cannot happen due to lack of ICSK_ACK_TIMER flag. Therefore, the receiver
> > > would not sent an ack until the sender's retransmission timeout. It
> > > definitely increases unnecessary latency.
> > >
> > > This issue happens rarely in the production environment. I used kprobe
> > > to hook some key functions like tcp_compressed_ack_kick, tcp_release_cb,
> > > tcp_delack_timer_handler and then found that when tcp_delack_timer_handler
> > > was called, value of icsk_ack.pending was 1, which means we only had
> > > flag ICSK_ACK_SCHED set, not including ICSK_ACK_TIMER. It was against
> > > our expectations.
> > >
> > > In conclusion, we chose to separate the sack compression from delayed
> > > ack logic to solve issues only happening when the process is deferred.
> > >
> > > Fixes: 5d9f4262b7ea ("tcp: add SACK compression")
> > > Signed-off-by: fuyuanli <[email protected]>
> > > Signed-off-by: Jason Xing <[email protected]>
> > > ---
> > > include/linux/tcp.h | 2 ++
> > > include/net/tcp.h | 1 +
> > > net/ipv4/tcp_output.c | 4 ++++
> > > net/ipv4/tcp_timer.c | 28 +++++++++++++++++++---------
> > > 4 files changed, 26 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> > > index b4c08ac86983..cd15a9972c48 100644
> > > --- a/include/linux/tcp.h
> > > +++ b/include/linux/tcp.h
> > > @@ -461,6 +461,7 @@ enum tsq_enum {
> > > TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
> > > * tcp_v{4|6}_mtu_reduced()
> > > */
> > > + TCP_COMPACK_TIMER_DEFERRED, /* tcp_compressed_ack_kick() found socket was owned */
> > > };
> > >
> > > enum tsq_flags {
> > > @@ -470,6 +471,7 @@ enum tsq_flags {
> > > TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
> > > TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
> > > TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
> > > + TCPF_COMPACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
> > > };
> > >
> > > #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
> > > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > > index 18a038d16434..e310d7bf400c 100644
> > > --- a/include/net/tcp.h
> > > +++ b/include/net/tcp.h
> > > @@ -342,6 +342,7 @@ void tcp_release_cb(struct sock *sk);
> > > void tcp_wfree(struct sk_buff *skb);
> > > void tcp_write_timer_handler(struct sock *sk);
> > > void tcp_delack_timer_handler(struct sock *sk);
> > > +void tcp_compack_timer_handler(struct sock *sk);
> > > int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
> > > int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
> > > void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
> > > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> > > index cfe128b81a01..1703caab6632 100644
> > > --- a/net/ipv4/tcp_output.c
> > > +++ b/net/ipv4/tcp_output.c
> > > @@ -1110,6 +1110,10 @@ void tcp_release_cb(struct sock *sk)
> > > tcp_delack_timer_handler(sk);
> > > __sock_put(sk);
> > > }
> > > + if (flags & TCPF_COMPACK_TIMER_DEFERRED) {
> > > + tcp_compack_timer_handler(sk);
> > > + __sock_put(sk);
> > > + }
> >
> > Please do not add another test in the fast path.
> >
> > Just make sure tcp_delack_timer_handler() handles the case (this
> > certainly was my intent)
> >
> >
> > > if (flags & TCPF_MTU_REDUCED_DEFERRED) {
> > > inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
> > > __sock_put(sk);
> > > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> > > index b839c2f91292..069f6442069b 100644
> > > --- a/net/ipv4/tcp_timer.c
> > > +++ b/net/ipv4/tcp_timer.c
> > > @@ -318,6 +318,23 @@ void tcp_delack_timer_handler(struct sock *sk)
> > > }
> > > }
> > >
> > > +/* Called with BH disabled */
> > > +void tcp_compack_timer_handler(struct sock *sk)
> > > +{
> > > + struct tcp_sock *tp = tcp_sk(sk);
> > > +
> > > + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
> > > + return;
> > > +
> > > + if (tp->compressed_ack) {
> > > + /* Since we have to send one ack finally,
> > > + * subtract one from tp->compressed_ack to keep
> > > + * LINUX_MIB_TCPACKCOMPRESSED accurate.
> > > + */
> > > + tp->compressed_ack--;
> > > + tcp_send_ack(sk);
> > > + }
> > > +}
> > >
> > > /**
> > > * tcp_delack_timer() - The TCP delayed ACK timeout handler
> > > @@ -757,16 +774,9 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
> > >
> > > bh_lock_sock(sk);
> > > if (!sock_owned_by_user(sk)) {
> > > - if (tp->compressed_ack) {
> > > - /* Since we have to send one ack finally,
> > > - * subtract one from tp->compressed_ack to keep
> > > - * LINUX_MIB_TCPACKCOMPRESSED accurate.
> > > - */
> > > - tp->compressed_ack--;
> > > - tcp_send_ack(sk);
> > > - }
> > > + tcp_compack_timer_handler(sk);
> > > } else {
> > > - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
> >
> > See, I was clearly intending to let tcp_delack_timer_handler() deal with this.
>
> I knew that :)
>
> Thanks,
> Jason
>
> >
> > > + if (!test_and_set_bit(TCP_COMPACK_TIMER_DEFERRED,
> > > &sk->sk_tsq_flags))
> > > sock_hold(sk);
> > > }
> > > --
> > > 2.17.1
> > >
On Tue, May 30, 2023 at 3:12 PM Eric Dumazet <[email protected]> wrote:
>
> On Mon, May 29, 2023 at 1:38 PM fuyuanli <[email protected]> wrote:
> >
> > We've got some issues when sending a compressed ack is deferred to
> > release phrase due to the socket owned by another user:
> > 1. a compressed ack would not be sent because of lack of ICSK_ACK_TIMER
> > flag.
>
> Are you sure ? Just add it then, your patch will be a one-liner
> instead of a complex one adding
> more code in a fast path.
Honestly, at the very beginning we just added one line[1] to fix this.
After I dug more into this part, I started to doubt whether we should
reuse the delayed ack logic at all.
In the sack compression case there is no need to do as much as the
delayed ack path does in tcp_delack_timer_handler().
Besides, here are the extra things to handle if we defer sending an
ack for sack compression:
1) decrease tp->compressed_ack, the same as "tp->compressed_ack--;" in
tcp_compressed_ack_kick().
2) initialize icsk->icsk_ack.timeout. Actually we don't need to do
this because we don't modify the expiration time in the sack
compression hrtimer.
3) don't count the LINUX_MIB_DELAYEDACKS counter.
4) I also wonder whether the checks about the ack schedule or ping-pong
mode in tcp_delack_timer_handler() even apply to sack compression? I'm
not sure about it.
So one line cannot solve it perfectly. That's why we introduced new
logic, which can be clearer.
I'm wondering whether adding one check in the fast path is really that
unacceptable (does it hurt performance?), since separate logic would be
clearer for the whole sack compression path.
[1]
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cc072d2cfcd8..d9e76d761cc6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5568,6 +5568,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
 			      rtt * (NSEC_PER_USEC >> 3)/20);
 		sock_hold(sk);
+		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_TIMER;
 		hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
 				       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
 				       HRTIMER_MODE_REL_PINNED_SOFT);
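
For context, the early return this one-liner is meant to satisfy sits
at the top of tcp_delack_timer_handler() (these are the lines Eric's
suggestion later in the thread reworks); without ICSK_ACK_TIMER set,
the deferred path currently bails out here:

	if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
	    !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
		return;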
>
> > 2. the tp->compressed_ack counter should be decremented by 1.
> > 3. we cannot pass timeout check and reset the delack timer in
> > tcp_delack_timer_handler().
> > 4. we are not supposed to increment the LINUX_MIB_DELAYEDACKS counter.
> > ...
> >
> > The reason why it could happen is that we previously reuse the delayed
> > ack logic when handling the sack compression. With this patch applied,
> > the sack compression logic would go into the same function
> > (tcp_compack_timer_handler()) whether we defer sending ack or not.
> > Therefore, those two issued could be easily solved.
> >
> > Here are more details in the old logic:
> > When sack compression is triggered in the tcp_compressed_ack_kick(),
> > if the sock is owned by user, it will set TCP_DELACK_TIMER_DEFERRED and
> > then defer to the release cb phrase. Later once user releases the sock,
> > tcp_delack_timer_handler() should send a ack as expected, which, however,
> > cannot happen due to lack of ICSK_ACK_TIMER flag. Therefore, the receiver
> > would not sent an ack until the sender's retransmission timeout. It
> > definitely increases unnecessary latency.
> >
> > This issue happens rarely in the production environment. I used kprobe
> > to hook some key functions like tcp_compressed_ack_kick, tcp_release_cb,
> > tcp_delack_timer_handler and then found that when tcp_delack_timer_handler
> > was called, value of icsk_ack.pending was 1, which means we only had
> > flag ICSK_ACK_SCHED set, not including ICSK_ACK_TIMER. It was against
> > our expectations.
> >
> > In conclusion, we chose to separate the sack compression from delayed
> > ack logic to solve issues only happening when the process is deferred.
> >
> > Fixes: 5d9f4262b7ea ("tcp: add SACK compression")
> > Signed-off-by: fuyuanli <[email protected]>
> > Signed-off-by: Jason Xing <[email protected]>
> > ---
> > include/linux/tcp.h | 2 ++
> > include/net/tcp.h | 1 +
> > net/ipv4/tcp_output.c | 4 ++++
> > net/ipv4/tcp_timer.c | 28 +++++++++++++++++++---------
> > 4 files changed, 26 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> > index b4c08ac86983..cd15a9972c48 100644
> > --- a/include/linux/tcp.h
> > +++ b/include/linux/tcp.h
> > @@ -461,6 +461,7 @@ enum tsq_enum {
> > TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
> > * tcp_v{4|6}_mtu_reduced()
> > */
> > + TCP_COMPACK_TIMER_DEFERRED, /* tcp_compressed_ack_kick() found socket was owned */
> > };
> >
> > enum tsq_flags {
> > @@ -470,6 +471,7 @@ enum tsq_flags {
> > TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
> > TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
> > TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
> > + TCPF_COMPACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
> > };
> >
> > #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 18a038d16434..e310d7bf400c 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -342,6 +342,7 @@ void tcp_release_cb(struct sock *sk);
> > void tcp_wfree(struct sk_buff *skb);
> > void tcp_write_timer_handler(struct sock *sk);
> > void tcp_delack_timer_handler(struct sock *sk);
> > +void tcp_compack_timer_handler(struct sock *sk);
> > int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
> > int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
> > void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
> > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> > index cfe128b81a01..1703caab6632 100644
> > --- a/net/ipv4/tcp_output.c
> > +++ b/net/ipv4/tcp_output.c
> > @@ -1110,6 +1110,10 @@ void tcp_release_cb(struct sock *sk)
> > tcp_delack_timer_handler(sk);
> > __sock_put(sk);
> > }
> > + if (flags & TCPF_COMPACK_TIMER_DEFERRED) {
> > + tcp_compack_timer_handler(sk);
> > + __sock_put(sk);
> > + }
>
> Please do not add another test in the fast path.
>
> Just make sure tcp_delack_timer_handler() handles the case (this
> certainly was my intent)
>
>
> > if (flags & TCPF_MTU_REDUCED_DEFERRED) {
> > inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
> > __sock_put(sk);
> > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> > index b839c2f91292..069f6442069b 100644
> > --- a/net/ipv4/tcp_timer.c
> > +++ b/net/ipv4/tcp_timer.c
> > @@ -318,6 +318,23 @@ void tcp_delack_timer_handler(struct sock *sk)
> > }
> > }
> >
> > +/* Called with BH disabled */
> > +void tcp_compack_timer_handler(struct sock *sk)
> > +{
> > + struct tcp_sock *tp = tcp_sk(sk);
> > +
> > + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
> > + return;
> > +
> > + if (tp->compressed_ack) {
> > + /* Since we have to send one ack finally,
> > + * subtract one from tp->compressed_ack to keep
> > + * LINUX_MIB_TCPACKCOMPRESSED accurate.
> > + */
> > + tp->compressed_ack--;
> > + tcp_send_ack(sk);
> > + }
> > +}
> >
> > /**
> > * tcp_delack_timer() - The TCP delayed ACK timeout handler
> > @@ -757,16 +774,9 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
> >
> > bh_lock_sock(sk);
> > if (!sock_owned_by_user(sk)) {
> > - if (tp->compressed_ack) {
> > - /* Since we have to send one ack finally,
> > - * subtract one from tp->compressed_ack to keep
> > - * LINUX_MIB_TCPACKCOMPRESSED accurate.
> > - */
> > - tp->compressed_ack--;
> > - tcp_send_ack(sk);
> > - }
> > + tcp_compack_timer_handler(sk);
> > } else {
> > - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
>
> See, I was clearly intending to let tcp_delack_timer_handler() deal with this.
I knew that :)
Thanks,
Jason
>
> > + if (!test_and_set_bit(TCP_COMPACK_TIMER_DEFERRED,
> > &sk->sk_tsq_flags))
> > sock_hold(sk);
> > }
> > --
> > 2.17.1
> >
On Tue, May 30, 2023 at 9:45 AM Jason Xing <[email protected]> wrote:
>
> On Tue, May 30, 2023 at 3:12 PM Eric Dumazet <[email protected]> wrote:
> >
> > On Mon, May 29, 2023 at 1:38 PM fuyuanli <[email protected]> wrote:
> > >
> > > We've got some issues when sending a compressed ack is deferred to
> > > release phrase due to the socket owned by another user:
> > > 1. a compressed ack would not be sent because of lack of ICSK_ACK_TIMER
> > > flag.
> >
> > Are you sure ? Just add it then, your patch will be a one-liner
> > instead of a complex one adding
> > more code in a fast path.
>
> Honestly, at the very beginning, we just added one line[1] to fix
> this. After I digged more into this part, I started to doubt if we
> should reuse the delayed ack logic.
>
> Because in the sack compression logic there is no need to do more
> things as delayed ack does in the tcp_delack_timer_handler() function.
>
> Besides, here are some things extra to be done if we defer to send an
> ack in sack compression:
> 1) decrease tp->compressed_ack. The same as "tp->compressed_ack--;" in
> tcp_compressed_ack_kick().
> 2) initialize icsk->icsk_ack.timeout. Actually we don't need to do
> this because we don't modify the expiration time in the sack
> compression hrtimer.
Yes, we do not need this, see my following comment.
> 3) don't need to count the LINUX_MIB_DELAYEDACKS counter.
> 4) I wonder even if those checks about the ack schedule or ping pong
> mode in tcp_delack_timer_handler() for sack compression? I'm not sure
> about it.
>
> So one line cannot solve it perfectly. That's the reason why we
> introduce a new logic which can be clearer.
>
> I'm wondering if adding one check in the fast path is really that
> unacceptable (it may hurt performance?) because a new logic would be
> clearer for the whole sack compression.
We definitely can solve the minor issue by not polluting the fast path.
We also want simple/localised fixes, not something invasive (and risky).
>
> [1]
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index cc072d2cfcd8..d9e76d761cc6 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -5568,6 +5568,7 @@ static void __tcp_ack_snd_check(struct sock *sk,
> int ofo_possible)
>
> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
> rtt * (NSEC_PER_USEC >> 3)/20);
> sock_hold(sk);
> + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_TIMER;
Why not simply use existing storage/variables (tp->compressed_ack)
instead of trying to reuse something else or adding another bit, and
then complaining that this does not work well?
Again, just fix tcp_delack_timer_handler(); it can already fetch the
existing state.
As a bonus, there is no need to send one patch for net and another in
net-next, trying to 'fix' issues that should have been fixed cleanly
in a single patch.
Something like:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b839c2f91292f7346f33d6dcbf597594473a5aca..16bc4cedceb8a5e88f61f9abc2c0a8cc9322676a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -290,10 +290,20 @@ static int tcp_write_timeout(struct sock *sk)
void tcp_delack_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
- if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
- !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+ return;
+
+ if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER) &&
+ !tp->compressed_ack)
+ return;
+
+ if (tp->compressed_ack) {
+ tcp_mstamp_refresh(tp);
+ tcp_sack_compress_send_ack(sk);
return;
+ }
if (time_after(icsk->icsk_ack.timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_delack_timer,
icsk->icsk_ack.timeout);
@@ -312,7 +322,7 @@ void tcp_delack_timer_handler(struct sock *sk)
inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
- tcp_mstamp_refresh(tcp_sk(sk));
+ tcp_mstamp_refresh(tp);
tcp_send_ack(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
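
For reference, tcp_sack_compress_send_ack() in net/ipv4/tcp_input.c,
which the hunk above calls, looks roughly like the sketch below; this
is reproduced from memory, so please double-check the exact body
against your tree:

static void tcp_sack_compress_send_ack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->compressed_ack)
		return;

	if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
		__sock_put(sk);

	/* Since we have to send one ack finally,
	 * subtract one from tp->compressed_ack to keep
	 * LINUX_MIB_TCPACKCOMPRESSED accurate.
	 */
	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
		      tp->compressed_ack - 1);

	tp->compressed_ack = 0;
	tcp_send_ack(sk);
}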
On Tue, May 30, 2023 at 6:01 PM Eric Dumazet <[email protected]> wrote:
>
> On Tue, May 30, 2023 at 9:45 AM Jason Xing <[email protected]> wrote:
> >
> > On Tue, May 30, 2023 at 3:12 PM Eric Dumazet <[email protected]> wrote:
> > >
> > > On Mon, May 29, 2023 at 1:38 PM fuyuanli <[email protected]> wrote:
> > > >
> > > > We've got some issues when sending a compressed ack is deferred to
> > > > release phrase due to the socket owned by another user:
> > > > 1. a compressed ack would not be sent because of lack of ICSK_ACK_TIMER
> > > > flag.
> > >
> > > Are you sure ? Just add it then, your patch will be a one-liner
> > > instead of a complex one adding
> > > more code in a fast path.
> >
> > Honestly, at the very beginning, we just added one line[1] to fix
> > this. After I digged more into this part, I started to doubt if we
> > should reuse the delayed ack logic.
> >
> > Because in the sack compression logic there is no need to do more
> > things as delayed ack does in the tcp_delack_timer_handler() function.
> >
> > Besides, here are some things extra to be done if we defer to send an
> > ack in sack compression:
> > 1) decrease tp->compressed_ack. The same as "tp->compressed_ack--;" in
> > tcp_compressed_ack_kick().
> > 2) initialize icsk->icsk_ack.timeout. Actually we don't need to do
> > this because we don't modify the expiration time in the sack
> > compression hrtimer.
>
> Yes, we do not need this, see my following comment.
>
> > 3) don't need to count the LINUX_MIB_DELAYEDACKS counter.
> > 4) I wonder even if those checks about the ack schedule or ping pong
> > mode in tcp_delack_timer_handler() for sack compression? I'm not sure
> > about it.
> >
> > So one line cannot solve it perfectly. That's the reason why we
> > introduce a new logic which can be clearer.
> >
> > I'm wondering if adding one check in the fast path is really that
> > unacceptable (it may hurt performance?) because a new logic would be
> > clearer for the whole sack compression.
>
> We definitely can solve the minor issue by not polluting the fast path.
>
> We also want simple/localised fixes, not something invasive (and risky).
Now I got it :)
>
> >
> > [1]
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index cc072d2cfcd8..d9e76d761cc6 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -5568,6 +5568,7 @@ static void __tcp_ack_snd_check(struct sock *sk,
> > int ofo_possible)
> >
> > READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
> > rtt * (NSEC_PER_USEC >> 3)/20);
> > sock_hold(sk);
> > + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_TIMER;
>
> Why not simply use existing storage/variables (tp->compressed_ack),
> instead of trying
> to reuse something else or add another bit, then complain that this
> does not work well ?
>
> Again, just fix tcp_delack_timer_handler(), it already can fetch existing state.
>
> As a bonus, no need to send one patch for net, and another in net-next,
> trying to 'fix' issues that should have been fixed cleanly in a single patch.
>
> Something like:
>
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index b839c2f91292f7346f33d6dcbf597594473a5aca..16bc4cedceb8a5e88f61f9abc2c0a8cc9322676a
> 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -290,10 +290,20 @@ static int tcp_write_timeout(struct sock *sk)
> void tcp_delack_timer_handler(struct sock *sk)
> {
> struct inet_connection_sock *icsk = inet_csk(sk);
> + struct tcp_sock *tp = tcp_sk(sk);
>
> - if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
> - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
> + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> + return;
> +
> + if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER) &&
> + !tp->compressed_ack)
> + return;
> +
> + if (tp->compressed_ack) {
> + tcp_mstamp_refresh(tp);
> + tcp_sack_compress_send_ack(sk);
I wonder if we could use the combination below instead, since the above
function bumps the SNMP counter and clears @compressed_ack, which
differs from what we normally do in tcp_compressed_ack_kick() when the
socket is not owned?
"
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
        __sock_put(sk);
tcp_send_ack(sk);
"
The above is extracted from tcp_sack_compress_send_ack().
> return;
> + }
>
> if (time_after(icsk->icsk_ack.timeout, jiffies)) {
> sk_reset_timer(sk, &icsk->icsk_delack_timer,
> icsk->icsk_ack.timeout);
> @@ -312,7 +322,7 @@ void tcp_delack_timer_handler(struct sock *sk)
> inet_csk_exit_pingpong_mode(sk);
> icsk->icsk_ack.ato = TCP_ATO_MIN;
> }
> - tcp_mstamp_refresh(tcp_sk(sk));
> + tcp_mstamp_refresh(tp);
> tcp_send_ack(sk);
> __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
> }
Thank you so much, Eric.
I will let fuyuanli submit a v2 patch based on that change, without
introducing a new logic/flag.
Thanks again,
Jason
On Tue, May 30, 2023 at 7:10 PM Jason Xing <[email protected]> wrote:
>
> On Tue, May 30, 2023 at 6:01 PM Eric Dumazet <[email protected]> wrote:
> >
> > On Tue, May 30, 2023 at 9:45 AM Jason Xing <[email protected]> wrote:
> > >
> > > On Tue, May 30, 2023 at 3:12 PM Eric Dumazet <[email protected]> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 1:38 PM fuyuanli <[email protected]> wrote:
> > > > >
> > > > > We've got some issues when sending a compressed ack is deferred to
> > > > > release phrase due to the socket owned by another user:
> > > > > 1. a compressed ack would not be sent because of lack of ICSK_ACK_TIMER
> > > > > flag.
> > > >
> > > > Are you sure ? Just add it then, your patch will be a one-liner
> > > > instead of a complex one adding
> > > > more code in a fast path.
> > >
> > > Honestly, at the very beginning, we just added one line[1] to fix
> > > this. After I digged more into this part, I started to doubt if we
> > > should reuse the delayed ack logic.
> > >
> > > Because in the sack compression logic there is no need to do more
> > > things as delayed ack does in the tcp_delack_timer_handler() function.
> > >
> > > Besides, here are some things extra to be done if we defer to send an
> > > ack in sack compression:
> > > 1) decrease tp->compressed_ack. The same as "tp->compressed_ack--;" in
> > > tcp_compressed_ack_kick().
> > > 2) initialize icsk->icsk_ack.timeout. Actually we don't need to do
> > > this because we don't modify the expiration time in the sack
> > > compression hrtimer.
> >
> > Yes, we do not need this, see my following comment.
> >
> > > 3) don't need to count the LINUX_MIB_DELAYEDACKS counter.
> > > 4) I wonder even if those checks about the ack schedule or ping pong
> > > mode in tcp_delack_timer_handler() for sack compression? I'm not sure
> > > about it.
> > >
> > > So one line cannot solve it perfectly. That's the reason why we
> > > introduce a new logic which can be clearer.
> > >
> > > I'm wondering if adding one check in the fast path is really that
> > > unacceptable (it may hurt performance?) because a new logic would be
> > > clearer for the whole sack compression.
> >
> > We definitely can solve the minor issue by not polluting the fast path.
> >
> > We also want simple/localised fixes, not something invasive (and risky).
>
> Now I got it :)
>
> >
> > >
> > > [1]
> > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > index cc072d2cfcd8..d9e76d761cc6 100644
> > > --- a/net/ipv4/tcp_input.c
> > > +++ b/net/ipv4/tcp_input.c
> > > @@ -5568,6 +5568,7 @@ static void __tcp_ack_snd_check(struct sock *sk,
> > > int ofo_possible)
> > >
> > > READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
> > > rtt * (NSEC_PER_USEC >> 3)/20);
> > > sock_hold(sk);
> > > + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_TIMER;
> >
> > Why not simply use existing storage/variables (tp->compressed_ack),
> > instead of trying
> > to reuse something else or add another bit, then complain that this
> > does not work well ?
> >
> > Again, just fix tcp_delack_timer_handler(), it already can fetch existing state.
> >
> > As a bonus, no need to send one patch for net, and another in net-next,
> > trying to 'fix' issues that should have been fixed cleanly in a single patch.
> >
> > Something like:
> >
> > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> > index b839c2f91292f7346f33d6dcbf597594473a5aca..16bc4cedceb8a5e88f61f9abc2c0a8cc9322676a
> > 100644
> > --- a/net/ipv4/tcp_timer.c
> > +++ b/net/ipv4/tcp_timer.c
> > @@ -290,10 +290,20 @@ static int tcp_write_timeout(struct sock *sk)
> > void tcp_delack_timer_handler(struct sock *sk)
> > {
> > struct inet_connection_sock *icsk = inet_csk(sk);
> > + struct tcp_sock *tp = tcp_sk(sk);
> >
> > - if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
> > - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
> > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> > + return;
> > +
> > + if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER) &&
> > + !tp->compressed_ack)
> > + return;
> > +
> > + if (tp->compressed_ack) {
> > + tcp_mstamp_refresh(tp);
> > + tcp_sack_compress_send_ack(sk);
>
> I wonder if we could use this combination as below instead since the
> above function counts the snmp counter and clears the @compressed_ack,
> which is against what we normally do in tcp_compressed_ack_kick() if
> the socket is not owned?
I take it back. After considering it for a while, I think it works,
because @compressed_ack and the SNMP counter are a pair, which means we
can handle both of them together.
It doesn't have any real impact even though it looks a bit different
from before.
Thanks,
Jason
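
A quick sanity check on that accounting (assuming tcp_event_ack_sent()
still adds tp->compressed_ack to LINUX_MIB_TCPACKCOMPRESSED and then
zeroes it; worth double-checking): say tp->compressed_ack == 5 when the
deferred ack finally goes out.
- old kick path: compressed_ack-- (now 4), then tcp_send_ack() ->
  tcp_event_ack_sent() credits 4 to the MIB and clears the counter.
- tcp_sack_compress_send_ack(): credits compressed_ack - 1 == 4 to the
  MIB, zeroes the counter, then sends the ack.
Either way 4 compressed acks are counted, so the MIB stays accurate.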
>
> "
> if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
> __sock_put(sk);
> tcp_send_ack(sk);
> "
>
> Above is extracted from tcp_sack_compress_send_ack()
>
> > return;
> > + }
> >
> > if (time_after(icsk->icsk_ack.timeout, jiffies)) {
> > sk_reset_timer(sk, &icsk->icsk_delack_timer,
> > icsk->icsk_ack.timeout);
> > @@ -312,7 +322,7 @@ void tcp_delack_timer_handler(struct sock *sk)
> > inet_csk_exit_pingpong_mode(sk);
> > icsk->icsk_ack.ato = TCP_ATO_MIN;
> > }
> > - tcp_mstamp_refresh(tcp_sk(sk));
> > + tcp_mstamp_refresh(tp);
> > tcp_send_ack(sk);
> > __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
> > }
>
> Thank you so much, Eric.
>
> I will let fuyuanli submit a v2 patch including that one without
> introducing a new logic/flag.
>
> Thanks again,
> Jason