TCP pingpong threshold is 1 by default. But some applications, like SQL DB
may prefer a higher pingpong threshold to activate delayed acks in quick
ack mode for better performance.
The pingpong threshold and related code were changed to 3 in the year
2019 in:
commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
And reverted to 1 in the year 2022 in:
commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
There is no single value that fits all applications.
Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
optimal performance based on the application needs.
Signed-off-by: Haiyang Zhang <[email protected]>
---
v2: Make it per-namesapce setting, and other updates suggested by Neal Cardwell,
and Kuniyuki Iwashima.
---
Documentation/networking/ip-sysctl.rst | 8 ++++++++
include/net/inet_connection_sock.h | 16 ++++++++++++----
include/net/netns/ipv4.h | 1 +
net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
net/ipv4/tcp_ipv4.c | 2 ++
net/ipv4/tcp_output.c | 4 ++--
6 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 5bfa1837968c..c0308b65dc2f 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1183,6 +1183,14 @@ tcp_plb_cong_thresh - INTEGER
Default: 128
+tcp_pingpong_thresh - INTEGER
+ TCP pingpong threshold is 1 by default, but some application may need a
+ higher threshold for optimal performance.
+
+ Possible Values: 1 - 255
+
+ Default: 1
+
UDP variables
=============
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5d2fcc137b88..0182f27bce40 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -325,11 +325,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
-#define TCP_PINGPONG_THRESH 1
-
static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
{
- inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH;
+ inet_csk(sk)->icsk_ack.pingpong =
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
}
static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
@@ -339,7 +338,16 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
- return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
+ return inet_csk(sk)->icsk_ack.pingpong >=
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
+}
+
+static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ack.pingpong < U8_MAX)
+ icsk->icsk_ack.pingpong++;
}
static inline bool inet_csk_has_ulp(const struct sock *sk)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d96d05b08819..9f1b3eb9473e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -191,6 +191,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_plb_rehash_rounds;
u8 sysctl_tcp_plb_suspend_rto_sec;
int sysctl_tcp_plb_cong_thresh;
+ u8 sysctl_tcp_pingpong_thresh;
int sysctl_udp_wmem_min;
int sysctl_udp_rmem_min;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e7f024d93572..f63a545a7374 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1498,6 +1498,14 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "tcp_pingpong_thresh",
+ .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
{ }
};
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a441740616d7..f603ad9307af 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3288,6 +3288,8 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
net->ipv4.sysctl_tcp_shrink_window = 0;
+ net->ipv4.sysctl_tcp_pingpong_thresh = 1;
+
return 0;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8885552dff8e..5736a736b59c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -170,10 +170,10 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
tp->lsndtime = now;
/* If it is a reply for ato after last received
- * packet, enter pingpong mode.
+ * packet, increase pingpong count.
*/
if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- inet_csk_enter_pingpong_mode(sk);
+ inet_csk_inc_pingpong_cnt(sk);
}
/* Account for an ACK we sent. */
--
2.25.1
On Tue, Oct 10, 2023 at 3:24 PM Haiyang Zhang <[email protected]> wrote:
>
> TCP pingpong threshold is 1 by default. But some applications, like SQL DB
> may prefer a higher pingpong threshold to activate delayed acks in quick
> ack mode for better performance.
>
> The pingpong threshold and related code were changed to 3 in the year
> 2019 in:
> commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
> And reverted to 1 in the year 2022 in:
> commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
>
> There is no single value that fits all applications.
> Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
> optimal performance based on the application needs.
>
> Signed-off-by: Haiyang Zhang <[email protected]>
> ---
> v2: Make it per-namesapce setting, and other updates suggested by Neal Cardwell,
> and Kuniyuki Iwashima.
>
> ---
> Documentation/networking/ip-sysctl.rst | 8 ++++++++
> include/net/inet_connection_sock.h | 16 ++++++++++++----
> include/net/netns/ipv4.h | 1 +
> net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
> net/ipv4/tcp_ipv4.c | 2 ++
> net/ipv4/tcp_output.c | 4 ++--
> 6 files changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 5bfa1837968c..c0308b65dc2f 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -1183,6 +1183,14 @@ tcp_plb_cong_thresh - INTEGER
>
> Default: 128
>
> +tcp_pingpong_thresh - INTEGER
> + TCP pingpong threshold is 1 by default, but some application may need a
> + higher threshold for optimal performance.
> +
> + Possible Values: 1 - 255
> +
> + Default: 1
> +
It would be good to document what the meaning of the parameter is.
Perhaps consider something like:
'The number of estimated data replies sent for estimated incoming data
requests that must happen before TCP estimates that a connection is a
"ping-pong" (request-response) connection for which delayed
acknowledgments can provide benefits. This threshold is 1 by default,
but some applications may need a higher threshold for optimal
performance.'
Thanks for the patch!
best,
neal
From: Haiyang Zhang <[email protected]>
Date: Tue, 10 Oct 2023 12:23:30 -0700
> TCP pingpong threshold is 1 by default. But some applications, like SQL DB
> may prefer a higher pingpong threshold to activate delayed acks in quick
> ack mode for better performance.
>
> The pingpong threshold and related code were changed to 3 in the year
> 2019 in:
> commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
> And reverted to 1 in the year 2022 in:
> commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
>
> There is no single value that fits all applications.
> Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
> optimal performance based on the application needs.
>
> Signed-off-by: Haiyang Zhang <[email protected]>
> ---
> v2: Make it per-namesapce setting, and other updates suggested by Neal Cardwell,
> and Kuniyuki Iwashima.
>
> ---
> Documentation/networking/ip-sysctl.rst | 8 ++++++++
> include/net/inet_connection_sock.h | 16 ++++++++++++----
> include/net/netns/ipv4.h | 1 +
> net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
> net/ipv4/tcp_ipv4.c | 2 ++
> net/ipv4/tcp_output.c | 4 ++--
> 6 files changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 5bfa1837968c..c0308b65dc2f 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -1183,6 +1183,14 @@ tcp_plb_cong_thresh - INTEGER
>
> Default: 128
>
> +tcp_pingpong_thresh - INTEGER
> + TCP pingpong threshold is 1 by default, but some application may need a
> + higher threshold for optimal performance.
> +
> + Possible Values: 1 - 255
> +
> + Default: 1
> +
> UDP variables
> =============
>
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 5d2fcc137b88..0182f27bce40 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -325,11 +325,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
>
> struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
>
> -#define TCP_PINGPONG_THRESH 1
> -
> static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
> {
> - inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH;
> + inet_csk(sk)->icsk_ack.pingpong =
> + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
> }
>
> static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
> @@ -339,7 +338,16 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
>
> static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
> {
> - return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
> + return inet_csk(sk)->icsk_ack.pingpong >=
> + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
> +}
> +
> +static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
> +{
> + struct inet_connection_sock *icsk = inet_csk(sk);
> +
> + if (icsk->icsk_ack.pingpong < U8_MAX)
> + icsk->icsk_ack.pingpong++;
> }
>
> static inline bool inet_csk_has_ulp(const struct sock *sk)
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index d96d05b08819..9f1b3eb9473e 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -191,6 +191,7 @@ struct netns_ipv4 {
> u8 sysctl_tcp_plb_rehash_rounds;
> u8 sysctl_tcp_plb_suspend_rto_sec;
> int sysctl_tcp_plb_cong_thresh;
> + u8 sysctl_tcp_pingpong_thresh;
>
> int sysctl_udp_wmem_min;
> int sysctl_udp_rmem_min;
Maybe a hole after sysctl_tcp_backlog_ack_defer is a good place
to put a new TCP knob.
After sysctl_tcp_plb_cong_thresh, we can fill 1-byte hole but the
cacheline seems cold for TCP.
$ pahole -C netns_ipv4 vmlinux
struct netns_ipv4 {
...
u8 sysctl_tcp_backlog_ack_defer; /* 402 1 */
/* XXX 1 byte hole, try to pack */
int sysctl_tcp_reordering; /* 404 4 */
...
int sysctl_tcp_plb_cong_thresh; /* 572 4 */
/* --- cacheline 9 boundary (576 bytes) --- */
int sysctl_udp_wmem_min; /* 576 4 */
int sysctl_udp_rmem_min; /* 580 4 */
u8 sysctl_fib_notify_on_flag_change; /* 584 1 */
u8 sysctl_tcp_syn_linear_timeouts; /* 585 1 */
u8 sysctl_igmp_llm_reports; /* 586 1 */
/* XXX 1 byte hole, try to pack */
...
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index e7f024d93572..f63a545a7374 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -1498,6 +1498,14 @@ static struct ctl_table ipv4_net_table[] = {
> .extra1 = SYSCTL_ZERO,
> .extra2 = SYSCTL_ONE,
> },
> + {
> + .procname = "tcp_pingpong_thresh",
> + .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh,
> + .maxlen = sizeof(u8),
> + .mode = 0644,
> + .proc_handler = proc_dou8vec_minmax,
> + .extra1 = SYSCTL_ONE,
> + },
> { }
> };
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index a441740616d7..f603ad9307af 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -3288,6 +3288,8 @@ static int __net_init tcp_sk_init(struct net *net)
> net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
> net->ipv4.sysctl_tcp_shrink_window = 0;
>
> + net->ipv4.sysctl_tcp_pingpong_thresh = 1;
> +
> return 0;
> }
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 8885552dff8e..5736a736b59c 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -170,10 +170,10 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
> tp->lsndtime = now;
>
> /* If it is a reply for ato after last received
> - * packet, enter pingpong mode.
> + * packet, increase pingpong count.
> */
> if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
> - inet_csk_enter_pingpong_mode(sk);
> + inet_csk_inc_pingpong_cnt(sk);
> }
>
> /* Account for an ACK we sent. */
> --
> 2.25.1
> -----Original Message-----
> From: Neal Cardwell <[email protected]>
> Sent: Tuesday, October 10, 2023 4:07 PM
> To: Haiyang Zhang <[email protected]>
> Cc: [email protected]; [email protected]; KY Srinivasan
> <[email protected]>; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; linux-
> [email protected]
> Subject: Re: [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl
>
> [You don't often get email from [email protected]. Learn why this is
> important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On Tue, Oct 10, 2023 at 3:24 PM Haiyang Zhang <[email protected]>
> wrote:
> >
> > TCP pingpong threshold is 1 by default. But some applications, like SQL DB
> > may prefer a higher pingpong threshold to activate delayed acks in quick
> > ack mode for better performance.
> >
> > The pingpong threshold and related code were changed to 3 in the year
> > 2019 in:
> > commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
> > And reverted to 1 in the year 2022 in:
> > commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
> >
> > There is no single value that fits all applications.
> > Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
> > optimal performance based on the application needs.
> >
> > Signed-off-by: Haiyang Zhang <[email protected]>
> > ---
> > v2: Make it per-namesapce setting, and other updates suggested by Neal
> Cardwell,
> > and Kuniyuki Iwashima.
> >
> > ---
> > Documentation/networking/ip-sysctl.rst | 8 ++++++++
> > include/net/inet_connection_sock.h | 16 ++++++++++++----
> > include/net/netns/ipv4.h | 1 +
> > net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
> > net/ipv4/tcp_ipv4.c | 2 ++
> > net/ipv4/tcp_output.c | 4 ++--
> > 6 files changed, 33 insertions(+), 6 deletions(-)
> >
> > diff --git a/Documentation/networking/ip-sysctl.rst
> b/Documentation/networking/ip-sysctl.rst
> > index 5bfa1837968c..c0308b65dc2f 100644
> > --- a/Documentation/networking/ip-sysctl.rst
> > +++ b/Documentation/networking/ip-sysctl.rst
> > @@ -1183,6 +1183,14 @@ tcp_plb_cong_thresh - INTEGER
> >
> > Default: 128
> >
> > +tcp_pingpong_thresh - INTEGER
> > + TCP pingpong threshold is 1 by default, but some application may need
> a
> > + higher threshold for optimal performance.
> > +
> > + Possible Values: 1 - 255
> > +
> > + Default: 1
> > +
>
> It would be good to document what the meaning of the parameter is.
> Perhaps consider something like:
>
> 'The number of estimated data replies sent for estimated incoming data
> requests that must happen before TCP estimates that a connection is a
> "ping-pong" (request-response) connection for which delayed
> acknowledgments can provide benefits. This threshold is 1 by default,
> but some applications may need a higher threshold for optimal
> performance.'
>
Will do.
Thanks,
- Haiyang
> -----Original Message-----
> From: Kuniyuki Iwashima <[email protected]>
> Sent: Tuesday, October 10, 2023 4:12 PM
> To: Haiyang Zhang <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected]; KY
> Srinivasan <[email protected]>; [email protected]; linux-
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]
> Subject: Re: [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl
>
> [You don't often get email from [email protected]. Learn why this is
> important at https://aka.ms/LearnAboutSenderIdentification ]
>
> From: Haiyang Zhang <[email protected]>
> Date: Tue, 10 Oct 2023 12:23:30 -0700
> > TCP pingpong threshold is 1 by default. But some applications, like SQL DB
> > may prefer a higher pingpong threshold to activate delayed acks in quick
> > ack mode for better performance.
> >
> > The pingpong threshold and related code were changed to 3 in the year
> > 2019 in:
> > commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
> > And reverted to 1 in the year 2022 in:
> > commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
> >
> > There is no single value that fits all applications.
> > Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
> > optimal performance based on the application needs.
> >
> > Signed-off-by: Haiyang Zhang <[email protected]>
> > ---
> > v2: Make it per-namesapce setting, and other updates suggested by Neal
> Cardwell,
> > and Kuniyuki Iwashima.
> >
> > ---
> > Documentation/networking/ip-sysctl.rst | 8 ++++++++
> > include/net/inet_connection_sock.h | 16 ++++++++++++----
> > include/net/netns/ipv4.h | 1 +
> > net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
> > net/ipv4/tcp_ipv4.c | 2 ++
> > net/ipv4/tcp_output.c | 4 ++--
> > 6 files changed, 33 insertions(+), 6 deletions(-)
> >
> > diff --git a/Documentation/networking/ip-sysctl.rst
> b/Documentation/networking/ip-sysctl.rst
> > index 5bfa1837968c..c0308b65dc2f 100644
> > --- a/Documentation/networking/ip-sysctl.rst
> > +++ b/Documentation/networking/ip-sysctl.rst
> > @@ -1183,6 +1183,14 @@ tcp_plb_cong_thresh - INTEGER
> >
> > Default: 128
> >
> > +tcp_pingpong_thresh - INTEGER
> > + TCP pingpong threshold is 1 by default, but some application may need a
> > + higher threshold for optimal performance.
> > +
> > + Possible Values: 1 - 255
> > +
> > + Default: 1
> > +
> > UDP variables
> > =============
> >
> > diff --git a/include/net/inet_connection_sock.h
> b/include/net/inet_connection_sock.h
> > index 5d2fcc137b88..0182f27bce40 100644
> > --- a/include/net/inet_connection_sock.h
> > +++ b/include/net/inet_connection_sock.h
> > @@ -325,11 +325,10 @@ void inet_csk_update_fastreuse(struct
> inet_bind_bucket *tb,
> >
> > struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
> >
> > -#define TCP_PINGPONG_THRESH 1
> > -
> > static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
> > {
> > - inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH;
> > + inet_csk(sk)->icsk_ack.pingpong =
> > + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
> > }
> >
> > static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
> > @@ -339,7 +338,16 @@ static inline void
> inet_csk_exit_pingpong_mode(struct sock *sk)
> >
> > static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
> > {
> > - return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
> > + return inet_csk(sk)->icsk_ack.pingpong >=
> > + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
> > +}
> > +
> > +static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
> > +{
> > + struct inet_connection_sock *icsk = inet_csk(sk);
> > +
> > + if (icsk->icsk_ack.pingpong < U8_MAX)
> > + icsk->icsk_ack.pingpong++;
> > }
> >
> > static inline bool inet_csk_has_ulp(const struct sock *sk)
> > diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> > index d96d05b08819..9f1b3eb9473e 100644
> > --- a/include/net/netns/ipv4.h
> > +++ b/include/net/netns/ipv4.h
> > @@ -191,6 +191,7 @@ struct netns_ipv4 {
> > u8 sysctl_tcp_plb_rehash_rounds;
> > u8 sysctl_tcp_plb_suspend_rto_sec;
> > int sysctl_tcp_plb_cong_thresh;
> > + u8 sysctl_tcp_pingpong_thresh;
> >
> > int sysctl_udp_wmem_min;
> > int sysctl_udp_rmem_min;
>
> Maybe a hole after sysctl_tcp_backlog_ack_defer is a good place
> to put a new TCP knob.
>
> After sysctl_tcp_plb_cong_thresh, we can fill 1-byte hole but the
> cacheline seems cold for TCP.
>
> $ pahole -C netns_ipv4 vmlinux
> struct netns_ipv4 {
> ...
> u8 sysctl_tcp_backlog_ack_defer; /* 402 1 */
>
> /* XXX 1 byte hole, try to pack */
>
> int sysctl_tcp_reordering; /* 404 4 */
> ...
> int sysctl_tcp_plb_cong_thresh; /* 572 4 */
> /* --- cacheline 9 boundary (576 bytes) --- */
> int sysctl_udp_wmem_min; /* 576 4 */
> int sysctl_udp_rmem_min; /* 580 4 */
> u8 sysctl_fib_notify_on_flag_change; /* 584 1 */
> u8 sysctl_tcp_syn_linear_timeouts; /* 585 1 */
> u8 sysctl_igmp_llm_reports; /* 586 1 */
>
> /* XXX 1 byte hole, try to pack */
> ...
>
Will do.
Thanks,
- Haiyang
On Tue, 10 Oct 2023 12:23:30 -0700
Haiyang Zhang <[email protected]> wrote:
> TCP pingpong threshold is 1 by default. But some applications, like SQL DB
> may prefer a higher pingpong threshold to activate delayed acks in quick
> ack mode for better performance.
>
> The pingpong threshold and related code were changed to 3 in the year
> 2019 in:
> commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
> And reverted to 1 in the year 2022 in:
> commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
>
> There is no single value that fits all applications.
> Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
> optimal performance based on the application needs.
>
> Signed-off-by: Haiyang Zhang <[email protected]>
If this an application specific optimization, it should be in a socket option
rather than system wide via sysctl.
On Tue, Oct 10, 2023 at 3:14 PM Stephen Hemminger
<[email protected]> wrote:
>
> On Tue, 10 Oct 2023 12:23:30 -0700
> Haiyang Zhang <[email protected]> wrote:
>
> > TCP pingpong threshold is 1 by default. But some applications, like SQL DB
> > may prefer a higher pingpong threshold to activate delayed acks in quick
> > ack mode for better performance.
> >
> > The pingpong threshold and related code were changed to 3 in the year
> > 2019 in:
> > commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
> > And reverted to 1 in the year 2022 in:
> > commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
> >
> > There is no single value that fits all applications.
> > Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
> > optimal performance based on the application needs.
> >
> > Signed-off-by: Haiyang Zhang <[email protected]>
>
> If this an application specific optimization, it should be in a socket option
> rather than system wide via sysctl.
Initially I had a similar comment but later decided a sysctl could
still be useful if
1) the entire host (e.g. virtual machine) is dedicated to that application
2) that application is difficult to change
> -----Original Message-----
> From: Yuchung Cheng <[email protected]>
> Sent: Tuesday, October 10, 2023 6:27 PM
> To: Stephen Hemminger <[email protected]>
> Cc: Haiyang Zhang <[email protected]>; [email protected];
> [email protected]; KY Srinivasan <[email protected]>;
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; linux-
> [email protected]; [email protected]; [email protected]
> Subject: Re: [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl
>
> On Tue, Oct 10, 2023 at 3:14 PM Stephen Hemminger
> <[email protected]> wrote:
> >
> > On Tue, 10 Oct 2023 12:23:30 -0700
> > Haiyang Zhang <[email protected]> wrote:
> >
> > > TCP pingpong threshold is 1 by default. But some applications, like SQL DB
> > > may prefer a higher pingpong threshold to activate delayed acks in quick
> > > ack mode for better performance.
> > >
> > > The pingpong threshold and related code were changed to 3 in the year
> > > 2019 in:
> > > commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
> > > And reverted to 1 in the year 2022 in:
> > > commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
> > >
> > > There is no single value that fits all applications.
> > > Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
> > > optimal performance based on the application needs.
> > >
> > > Signed-off-by: Haiyang Zhang <[email protected]>
> >
> > If this an application specific optimization, it should be in a socket option
> > rather than system wide via sysctl.
> Initially I had a similar comment but later decided a sysctl could
> still be useful if
> 1) the entire host (e.g. virtual machine) is dedicated to that application
> 2) that application is difficult to change
Yes, the customer actually wants a global setting. But as suggested by Neal,
I changed it to be per-namespace to match other TCP tunables.
Thanks,
- Haiyang
On Tue, 10 Oct 2023 22:59:49 +0000
Haiyang Zhang <[email protected]> wrote:
> > > If this an application specific optimization, it should be in a socket option
> > > rather than system wide via sysctl.
> > Initially I had a similar comment but later decided a sysctl could
> > still be useful if
> > 1) the entire host (e.g. virtual machine) is dedicated to that application
> > 2) that application is difficult to change
>
> Yes, the customer actually wants a global setting. But as suggested by Neal,
> I changed it to be per-namespace to match other TCP tunables.
Like congestion control choice, it could be both a sysctl and a socket option.
The reason is that delayed ack is already controlled by socket options.
> -----Original Message-----
> From: Stephen Hemminger <[email protected]>
> Sent: Tuesday, October 10, 2023 10:16 PM
> To: Haiyang Zhang <[email protected]>
> Cc: Yuchung Cheng <[email protected]>; [email protected];
> [email protected]; KY Srinivasan <[email protected]>;
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; linux-
> [email protected]; [email protected]; [email protected]
> Subject: Re: [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl
>
> On Tue, 10 Oct 2023 22:59:49 +0000
> Haiyang Zhang <[email protected]> wrote:
>
> > > > If this an application specific optimization, it should be in a socket option
> > > > rather than system wide via sysctl.
> > > Initially I had a similar comment but later decided a sysctl could
> > > still be useful if
> > > 1) the entire host (e.g. virtual machine) is dedicated to that application
> > > 2) that application is difficult to change
> >
> > Yes, the customer actually wants a global setting. But as suggested by Neal,
> > I changed it to be per-namespace to match other TCP tunables.
>
> Like congestion control choice, it could be both a sysctl and a socket option.
> The reason is that delayed ack is already controlled by socket options.
I see. I am updating the doc and variable location for this sysctl tunable patch
as suggested by the reviewers, and will resubmit it.
I will also work on a separate patch for the setsockopt option.
Thanks,
- Haiyang
On Wed, Oct 11, 2023 at 8:49 PM Haiyang Zhang <[email protected]> wrote:
>
>
>
> > -----Original Message-----
> > From: Stephen Hemminger <[email protected]>
> > Sent: Tuesday, October 10, 2023 10:16 PM
> > To: Haiyang Zhang <[email protected]>
> > Cc: Yuchung Cheng <[email protected]>; [email protected];
> > [email protected]; KY Srinivasan <[email protected]>;
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; linux-
> > [email protected]; [email protected]; [email protected]
> > Subject: Re: [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl
> >
> > On Tue, 10 Oct 2023 22:59:49 +0000
> > Haiyang Zhang <[email protected]> wrote:
> >
> > > > > If this an application specific optimization, it should be in a socket option
> > > > > rather than system wide via sysctl.
> > > > Initially I had a similar comment but later decided a sysctl could
> > > > still be useful if
> > > > 1) the entire host (e.g. virtual machine) is dedicated to that application
> > > > 2) that application is difficult to change
> > >
> > > Yes, the customer actually wants a global setting. But as suggested by Neal,
> > > I changed it to be per-namespace to match other TCP tunables.
> >
> > Like congestion control choice, it could be both a sysctl and a socket option.
> > The reason is that delayed ack is already controlled by socket options.
>
> I see. I am updating the doc and variable location for this sysctl tunable patch
> as suggested by the reviewers, and will resubmit it.
>
> I will also work on a separate patch for the setsockopt option.
>
>
I am not sure about adding a socket option, and finding room in the
socket structure.
See our recent effort reshuffling fields in tcp socket for better
performance (stalled at this time).
I would rather experiment first with a sysctl.