Major change: Limit number of thin linear timeout tries to TCP_THIN_LT_RETRIES (currently 6).
>From ec71404702149bc9197c749e5d1d68656c87f98f Mon Sep 17 00:00:00 2001
From: Andreas Petlund <[email protected]>
Date: Mon, 8 Feb 2010 14:05:53 +0100
Subject: [PATCH 2/3] net: TCP thin linear timeouts
Signed-off-by: Andreas Petlund <[email protected]>
---
include/linux/sysctl.h | 1 +
include/linux/tcp.h | 3 +++
include/net/tcp.h | 4 ++++
net/ipv4/sysctl_net_ipv4.c | 7 +++++++
net/ipv4/tcp.c | 5 +++++
net/ipv4/tcp_timer.c | 19 ++++++++++++++++++-
6 files changed, 38 insertions(+), 1 deletions(-)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 9f236cd..d840d75 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -425,6 +425,7 @@ enum
NET_TCP_ALLOWED_CONG_CONTROL=123,
NET_TCP_MAX_SSTHRESH=124,
NET_TCP_FRTO_RESPONSE=125,
+ NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS=126,
};
enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7fee8a4..67da706 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -103,6 +103,7 @@ enum {
#define TCP_CONGESTION 13 /* Congestion control algorithm */
#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */
#define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */
+#define TCP_THIN_LT 16 /* Use linear timeouts for thin streams*/
/* for TCP_INFO socket option */
#define TCPI_OPT_TIMESTAMPS 1
@@ -341,6 +342,8 @@ struct tcp_sock {
u16 advmss; /* Advertised MSS */
u8 frto_counter; /* Number of new acks after RTO */
u8 nonagle; /* Disable Nagle algorithm? */
+ u8 thin_lt : 1,/* Use linear timeouts for thin streams */
+ thin_undef : 7;
/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e5e2056..bc5856a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_NAGLE_CORK 2 /* Socket is corked */
#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
+/* TCP thin-stream limits */
+#define TCP_THIN_LT_RETRIES 6 /* After 6 linear retries, do exp. backoff */
+
extern struct inet_timewait_death_row tcp_death_row;
/* sysctl variables for tcp */
@@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows;
extern int sysctl_tcp_slow_start_after_idle;
extern int sysctl_tcp_max_ssthresh;
extern int sysctl_tcp_cookie_size;
+extern int sysctl_tcp_force_thin_linear_timeouts;
extern atomic_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712c..cb2ed35 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_force_thin_linear_timeouts",
+ .data = &sysctl_tcp_force_thin_linear_timeouts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5d69ea..cbc1ee3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2229,6 +2229,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
}
break;
+ case TCP_THIN_LT:
+ if (val)
+ tp->thin_lt = 1;
+ break;
+
case TCP_CORK:
/* When set indicates to always queue non-full frames.
* Later the user clears this option and we transmit
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index de7d1bf..f01a585 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_force_thin_linear_timeouts __read_mostly;
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
@@ -415,7 +416,23 @@ void tcp_retransmit_timer(struct sock *sk)
icsk->icsk_retransmits++;
out_reset_timer:
- icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+ * might be increased if the stream oscillates between thin and thick,
+ * thus the old value might already be too high compared to the value
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+ * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential
+ * backoff behaviour to avoid continue hammering linear-timeout
+ * retransmissions into a black hole*/
+ if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
+ tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
+ icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
+ icsk->icsk_backoff = 0;
+ icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ } else {
+ /* Use normal (exponential) backoff */
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ }
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
__sk_dst_reset(sk);
--
1.6.3.3
Andreas Petlund schrieb:
> Major change: Limit number of thin linear timeout tries to TCP_THIN_LT_RETRIES (currently 6).
>
>>From ec71404702149bc9197c749e5d1d68656c87f98f Mon Sep 17 00:00:00 2001
> From: Andreas Petlund <[email protected]>
> Date: Mon, 8 Feb 2010 14:05:53 +0100
> Subject: [PATCH 2/3] net: TCP thin linear timeouts
>
>
> Signed-off-by: Andreas Petlund <[email protected]>
> ---
> include/linux/sysctl.h | 1 +
> include/linux/tcp.h | 3 +++
> include/net/tcp.h | 4 ++++
> net/ipv4/sysctl_net_ipv4.c | 7 +++++++
> net/ipv4/tcp.c | 5 +++++
> net/ipv4/tcp_timer.c | 19 ++++++++++++++++++-
> 6 files changed, 38 insertions(+), 1 deletions(-)
>
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 9f236cd..d840d75 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -425,6 +425,7 @@ enum
> NET_TCP_ALLOWED_CONG_CONTROL=123,
> NET_TCP_MAX_SSTHRESH=124,
> NET_TCP_FRTO_RESPONSE=125,
> + NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS=126,
> };
>
> enum {
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 7fee8a4..67da706 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -103,6 +103,7 @@ enum {
> #define TCP_CONGESTION 13 /* Congestion control algorithm */
> #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */
> #define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */
> +#define TCP_THIN_LT 16 /* Use linear timeouts for thin streams*/
>
> /* for TCP_INFO socket option */
> #define TCPI_OPT_TIMESTAMPS 1
> @@ -341,6 +342,8 @@ struct tcp_sock {
> u16 advmss; /* Advertised MSS */
> u8 frto_counter; /* Number of new acks after RTO */
> u8 nonagle; /* Disable Nagle algorithm? */
> + u8 thin_lt : 1,/* Use linear timeouts for thin streams */
> + thin_undef : 7;
>
> /* RTT measurement */
> u32 srtt; /* smoothed round trip time << 3 */
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index e5e2056..bc5856a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
> #define TCP_NAGLE_CORK 2 /* Socket is corked */
> #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
>
> +/* TCP thin-stream limits */
> +#define TCP_THIN_LT_RETRIES 6 /* After 6 linear retries, do exp. backoff */
> +
> extern struct inet_timewait_death_row tcp_death_row;
>
> /* sysctl variables for tcp */
> @@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows;
> extern int sysctl_tcp_slow_start_after_idle;
> extern int sysctl_tcp_max_ssthresh;
> extern int sysctl_tcp_cookie_size;
> +extern int sysctl_tcp_force_thin_linear_timeouts;
>
> extern atomic_t tcp_memory_allocated;
> extern struct percpu_counter tcp_sockets_allocated;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 7e3712c..cb2ed35 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = {
> .proc_handler = proc_dointvec
> },
> {
> + .procname = "tcp_force_thin_linear_timeouts",
> + .data = &sysctl_tcp_force_thin_linear_timeouts,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec
> + },
> + {
> .procname = "udp_mem",
> .data = &sysctl_udp_mem,
> .maxlen = sizeof(sysctl_udp_mem),
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index d5d69ea..cbc1ee3 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -2229,6 +2229,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
> }
> break;
>
> + case TCP_THIN_LT:
> + if (val)
> + tp->thin_lt = 1;
> + break;
> +
> case TCP_CORK:
> /* When set indicates to always queue non-full frames.
> * Later the user clears this option and we transmit
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index de7d1bf..f01a585 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
> int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
> int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
> int sysctl_tcp_orphan_retries __read_mostly;
> +int sysctl_tcp_force_thin_linear_timeouts __read_mostly;
>
> static void tcp_write_timer(unsigned long);
> static void tcp_delack_timer(unsigned long);
> @@ -415,7 +416,23 @@ void tcp_retransmit_timer(struct sock *sk)
> icsk->icsk_retransmits++;
>
> out_reset_timer:
> - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
> + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
> + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
> + * might be increased if the stream oscillates between thin and thick,
> + * thus the old value might already be too high compared to the value
> + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
> + * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential
> + * backoff behaviour to avoid continue hammering linear-timeout
> + * retransmissions into a black hole*/
> + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
> + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
> + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
> + icsk->icsk_backoff = 0;
Hi,
I think, this value should be at least 1, as icsk_backoff
might be decreased to -1 and used for bit-shifting in tcp_v4_err().
A lower boundary check might be even better.
Regards
Damian
> + icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
> + } else {
> + /* Use normal (exponential) backoff */
> + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
> + }
> inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
> if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
> __sk_dst_reset(sk);
Le lundi 08 février 2010 à 15:25 +0100, Andreas Petlund a écrit :
>
> + case TCP_THIN_LT:
> + if (val)
> + tp->thin_lt = 1;
> + break;
> +
Why not allowing user to clear thin_lt ?
On 02/08/2010 09:50 PM, Damian Lukowski wrote:
>> out_reset_timer:
>> - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
>> + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
>> + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
>> + * might be increased if the stream oscillates between thin and thick,
>> + * thus the old value might already be too high compared to the value
>> + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
>> + * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential
>> + * backoff behaviour to avoid continue hammering linear-timeout
>> + * retransmissions into a black hole*/
>> + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
>> + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
>> + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
>> + icsk->icsk_backoff = 0;
>
> Hi,
> I think, this value should be at least 1, as icsk_backoff
> might be decreased to -1 and used for bit-shifting in tcp_v4_err().
> A lower boundary check might be even better.
Hi
Thanks for the feedback.
As far as I can see, the check a couple of lines above the decrementation
stops the icsk->icsk_backoff from being decremented if already zero.
Beyond that I cannot find any more places where this situation may arise.
Please correct me if I'm wrong and a boundary check is indeed warranted.
Excerpt from tcp_ipv4.c
------------------
if (seq != tp->snd_una || !icsk->icsk_retransmits ||
!icsk->icsk_backoff)
break;
icsk->icsk_backoff--;
inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
icsk->icsk_backoff;
------------------
Best regards,
Andreas
On 02/09/2010 07:31 AM, Eric Dumazet wrote:
> Le lundi 08 février 2010 à 15:25 +0100, Andreas Petlund a écrit :
>>
>> + case TCP_THIN_LT:
>> + if (val)
>> + tp->thin_lt = 1;
>> + break;
>> +
>
> Why not allowing user to clear thin_lt ?
>
>
That is a very good point. I will fix that in the next iteration,
as well as error handling for out of bounds values.
Best regards,
Andreas
Am 09.02.2010, 17:40 Uhr, schrieb Andreas Petlund <[email protected]>:
> On 02/08/2010 09:50 PM, Damian Lukowski wrote:
>>> out_reset_timer:
>>> - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
>>> + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
>>> + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
>>> + * might be increased if the stream oscillates between thin and
>>> thick,
>>> + * thus the old value might already be too high compared to the value
>>> + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
>>> + * backoff. Limit to TCP_THIN_LT_RETRIES before initiating
>>> exponential
>>> + * backoff behaviour to avoid continue hammering linear-timeout
>>> + * retransmissions into a black hole*/
>>> + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
>>> + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
>>> + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
>>> + icsk->icsk_backoff = 0;
>>
>> Hi,
>> I think, this value should be at least 1, as icsk_backoff
>> might be decreased to -1 and used for bit-shifting in tcp_v4_err().
>> A lower boundary check might be even better.
>
> Hi
>
> Thanks for the feedback.
>
> As far as I can see, the check a couple of lines above the decrementation
> stops the icsk->icsk_backoff from being decremented if already zero.
> Beyond that I cannot find any more places where this situation may arise.
> Please correct me if I'm wrong and a boundary check is indeed warranted.
Oops, you are right, of course ...
I just had in mind, that a thin stream might also be a candidate for
backoff reversion when connectivity breaks down, that's why I said
"at least 1". And I really have forgotten the already existing check,
sorry.
So, setting icsk_backoff = 0 will prevent a backoff reversion, but that's
ok, as the RTO is not doubled in the first place.
It might have been an issue, if you had not used __tcp_set_rto() but left
the value unchanged *and* a non-thin stream became thin at some point in
the RTO retransmission phase (if that is even possible).
Damian