Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753216Ab1EQHl1 (ORCPT ); Tue, 17 May 2011 03:41:27 -0400 Received: from mail-qw0-f46.google.com ([209.85.216.46]:48170 "EHLO mail-qw0-f46.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752572Ab1EQHlX (ORCPT ); Tue, 17 May 2011 03:41:23 -0400 DomainKey-Signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=from:to:cc:subject:date:message-id:x-mailer:in-reply-to:references; b=rP5fFSX88Q7swWGRvC+91UBsWR6BDQqM3sQW4wqQVufed/ABg6rIr/1glLcx4zEx1W luRloAvtqXmjROMGrfkFpqpjJiYFFRO4wdBxleNTpcCYheGIixQfq5b2QxOVeLjETQX8 mppk1biohRqM1wQCSxQAdNiTHLFkskNFmO44U= From: Benoit Sigoure To: davem@davemloft.net, kuznet@ms2.inr.ac.ru, pekkas@netcore.fi, jmorris@namei.org, yoshfuji@linux-ipv6.org, kaber@trash.net Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Benoit Sigoure Subject: [PATCH] tcp: Expose the initial RTO via a new sysctl. Date: Tue, 17 May 2011 00:40:20 -0700 Message-Id: <1305618020-72535-2-git-send-email-tsunanet@gmail.com> X-Mailer: git-send-email 1.7.0.2.157.gb7e7f In-Reply-To: <1305618020-72535-1-git-send-email-tsunanet@gmail.com> References: <1305618020-72535-1-git-send-email-tsunanet@gmail.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12749 Lines: 329 Instead of hardcoding the initial RTO to 3s and requiring the kernel to be recompiled to change it, expose it as a sysctl that can be tuned at runtime. Leave the default value unchanged. 
Signed-off-by: Benoit Sigoure --- Documentation/networking/ip-sysctl.txt | 6 ++++++ include/linux/sysctl.h | 1 + include/net/tcp.h | 3 ++- kernel/sysctl_binary.c | 1 + net/ipv4/syncookies.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 11 +++++++++++ net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv4/tcp_minisocks.c | 6 +++--- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 9 +++++---- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 6 +++--- 14 files changed, 44 insertions(+), 23 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d3d653a..c381c68 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -384,6 +384,12 @@ tcp_retries2 - INTEGER RFC 1122 recommends at least 100 seconds for the timeout, which corresponds to a value of at least 8. +tcp_initial_rto - INTEGER + This value sets the initial retransmit timeout, that is how long + the kernel will wait before retransmitting the initial SYN packet. + + RFC 1122 says that this SHOULD be 3 seconds, which is the default. + tcp_rfc1337 - BOOLEAN If set, the TCP stack behaves conforming to RFC1337. 
If unset, we are not conforming to RFC, but prevent TCP TIME_WAIT diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 11684d9..96a9b41 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -425,6 +425,7 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, + NET_IPV4_TCP_INITIAL_RTO=126, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index cda30ea..a2bb0f1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -213,6 +213,7 @@ extern int sysctl_tcp_syn_retries; extern int sysctl_tcp_synack_retries; extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; +extern int sysctl_tcp_initial_rto; extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_syncookies; extern int sysctl_tcp_retrans_collapse; @@ -295,7 +296,7 @@ static inline void tcp_synq_overflow(struct sock *sk) static inline int tcp_synq_no_recent_overflow(const struct sock *sk) { unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - return time_after(jiffies, last_overflow + TCP_TIMEOUT_INIT); + return time_after(jiffies, last_overflow + sysctl_tcp_initial_rto); } extern struct proto tcp_prot; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 3b8e028..d608d84 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -354,6 +354,7 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, + { CTL_INT, NET_IPV4_TCP_INITIAL_RTO, "tcp_initial_rto" }, { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 8b44c6d..089bc92 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -186,7 +186,7 @@ __u32 cookie_v4_init_sequence(struct 
sock *sk, struct sk_buff *skb, __u16 *mssp) * sysctl_tcp_retries1. It's a rather complicated formula (exponential * backoff) to compute at runtime so it's currently hardcoded here. */ -#define COUNTER_TRIES 4 +#define COUNTER_TRIES (sysctl_tcp_initial_rto + 1) /* * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 321e6e8..24dc21d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -30,6 +30,8 @@ static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; static int ip_ttl_min = 1; static int ip_ttl_max = 255; +static int tcp_initial_rto_min = TCP_RTO_MIN; +static int tcp_initial_rto_max = TCP_RTO_MAX; /* Update system visible IP port range */ static void set_local_port_range(int range[2]) @@ -246,6 +248,15 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_initial_rto", + .data = &sysctl_tcp_initial_rto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_initial_rto_min, + .extra2 = &tcp_initial_rto_max, + }, { .procname = "tcp_fin_timeout", .data = &sysctl_tcp_fin_timeout, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b22d450..e9e7c3f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2352,7 +2352,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ icsk->icsk_accept_queue.rskq_defer_accept = - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + secs_to_retrans(val, sysctl_tcp_initial_rto / HZ, TCP_RTO_MAX / HZ); break; @@ -2539,7 +2539,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, - TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); + sysctl_tcp_initial_rto / HZ, TCP_RTO_MAX / HZ); break; case 
TCP_WINDOW_CLAMP: val = tp->window_clamp; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bef9f04..39f6c27 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -890,7 +890,7 @@ static void tcp_init_metrics(struct sock *sk) if (dst_metric(dst, RTAX_RTT) == 0) goto reset; - if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) + if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (sysctl_tcp_initial_rto << 3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -916,7 +916,7 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); - if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) { + if (inet_csk(sk)->icsk_rto < sysctl_tcp_initial_rto && !tp->rx_opt.saw_tstamp) { reset: /* Play conservative. If timestamps are not * supported, TCP will fail to recalculate correct @@ -924,8 +924,8 @@ reset: */ if (!tp->rx_opt.saw_tstamp && tp->srtt) { tp->srtt = 0; - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + tp->mdev = tp->mdev_max = tp->rttvar = sysctl_tcp_initial_rto; + inet_csk(sk)->icsk_rto = sysctl_tcp_initial_rto; } } tp->snd_cwnd = tcp_init_cwnd(tp, dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f7e6c2c..21920e6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1383,7 +1383,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) want_cookie) goto drop_and_free; - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, sysctl_tcp_initial_rto); return 0; drop_and_release: @@ -1834,8 +1834,8 @@ static int tcp_v4_init_sock(struct sock *sk) tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; + icsk->icsk_rto = sysctl_tcp_initial_rto; + tp->mdev = sysctl_tcp_initial_rto; /* So many TCP implementations out there (incorrectly) 
count the * initial SYN frame in their delayed-ACK and congestion control diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 80b1f80..c63ffa0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -472,8 +472,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_init_wl(newtp, treq->rcv_isn); newtp->srtt = 0; - newtp->mdev = TCP_TIMEOUT_INIT; - newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newtp->mdev = sysctl_tcp_initial_rto; + newicsk->icsk_rto = sysctl_tcp_initial_rto; newtp->packets_out = 0; newtp->retrans_out = 0; @@ -582,7 +582,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + tmp_opt.ts_recent_stamp = get_seconds() - ((sysctl_tcp_initial_rto/HZ)<<req->retrans); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 17388c7..e34b0f6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2599,7 +2599,7 @@ static void tcp_connect_init(struct sock *sk) tp->rcv_wup = 0; tp->copied_seq = 0; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = sysctl_tcp_initial_rto; inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ecd44b0..b9da62b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES; int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; +int sysctl_tcp_initial_rto __read_mostly = TCP_TIMEOUT_INIT; int sysctl_tcp_orphan_retries __read_mostly; int sysctl_tcp_thin_linear_timeouts __read_mostly; @@ -135,8 +136,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, 
struct sock *sk) /* This function calculates a "timeout" which is equivalent to the timeout of a * TCP connection after "boundary" unsuccessful, exponentially backed-off - * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if - * syn_set flag is set. + * retransmissions with an initial RTO of TCP_RTO_MIN or + * sysctl_tcp_initial_rto if syn_set flag is set. */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, @@ -144,7 +145,7 @@ static bool retransmits_timed_out(struct sock *sk, bool syn_set) { unsigned int linear_backoff_thresh, start_ts; - unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + unsigned int rto_base = syn_set ? sysctl_tcp_initial_rto : TCP_RTO_MIN; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -495,7 +496,7 @@ out_unlock: static void tcp_synack_timer(struct sock *sk) { inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, - TCP_TIMEOUT_INIT, TCP_RTO_MAX); + sysctl_tcp_initial_rto, TCP_RTO_MAX); } void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 352c260..50baaec 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -45,7 +45,7 @@ static __u16 const msstab[] = { * sysctl_tcp_retries1. It's a rather complicated formula (exponential * backoff) to compute at runtime so it's currently hardcoded here. 
*/ -#define COUNTER_TRIES 4 +#define COUNTER_TRIES (sysctl_tcp_initial_rto + 1) static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4f49e5d..7e791e6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1349,7 +1349,7 @@ have_isn: want_cookie) goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + inet6_csk_reqsk_queue_hash_add(sk, req, sysctl_tcp_initial_rto); return 0; drop_and_release: @@ -1957,8 +1957,8 @@ static int tcp_v6_init_sock(struct sock *sk) tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; + icsk->icsk_rto = sysctl_tcp_initial_rto; + tp->mdev = sysctl_tcp_initial_rto; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control -- 1.7.0.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/