2021-05-26 12:39:08

by Leonard Crestez

Subject: [RFCv2 0/3] tcp: Improve mtu probe preconditions

According to RFC4821 Section 7.4, "Protocols MAY delay sending non-probes
in order to accumulate enough data", but in practice Linux only sends
probes when a lot of data has accumulated on the send side.

Another improvement is to rely on TCP RACK to perform timely loss detection
with fewer outstanding packets. If RACK is enabled, the size required for a
probe can be shrunk.
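
For illustration with assumed numbers (mss_cache = 1448, probe_size = 2896
and the default reordering of 3): the existing formula needs
probe_size + (reordering + 1) * mss = 8688 bytes queued before a probe can
go out, while the RACK variant needs only probe_size + 2 * mss = 5792 bytes.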

Successive successful mtu probes will reduce the cwnd, since it is measured
in packets and we send bigger packets. The cwnd value can get stuck below 11
on low-latency links, and this prevents further probing. The cwnd logic in
tcp_mtu_probe can be reworked to be based on the number of packets that we
actually need to send instead of arbitrary constants.
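
A toy userspace calculation (not kernel code; the 30000 bytes of data in
flight is an assumed steady state) shows why a packet-counted cwnd is used
less and less as probing raises the MSS:

	#include <stdio.h>

	int main(void)
	{
		int bytes_in_flight = 30000;	/* assumed steady state for the link */
		int mss;

		/* The same byte volume occupies fewer packets as the MSS grows,
		 * so tcp_is_cwnd_limited() stops reporting the connection as
		 * cwnd-limited and cwnd never grows back above the threshold.
		 */
		for (mss = 1448; mss <= 8948; mss += 2500)
			printf("mss %4d -> %2d packets in flight\n",
			       mss, (bytes_in_flight + mss - 1) / mss);
		return 0;
	}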

It is difficult to improve this behavior without introducing unreasonable
delays or even stalls. Looking at the current behavior of tcp_mtu_probe, it
already waits in some scenarios: when there is not enough room inside the
cwnd or when there is a gap of unacknowledged data between snd_una and
snd_nxt. It appears that it is safe to wait if tcp_packets_in_flight() != 0.
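
In code terms, the checks this series converges on look roughly like this
(condensed from patch 2; packets_needed is size_needed rounded up to whole
mss_cache-sized packets):

	if (packets_needed > tp->snd_cwnd)
		return -1;	/* probe can never fit: skip probing */
	if (tcp_packets_in_flight(tp) + packets_needed > tp->snd_cwnd)
		return 0;	/* wait: acks for in-flight data re-run tcp_write_xmit */

Returning 0 is safe here because the pending acks guarantee another pass
through the transmit path.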

Signed-off-by: Leonard Crestez <[email protected]>

---

Previous RFC: https://lore.kernel.org/netdev/[email protected]/

This series seems to be "correct" this time; I would appreciate any feedback.
It is possible that my understanding of when it is safe to return 0 from
tcp_mtu_probe is incorrect. It is also possible that even the current code
interacts poorly with delayed acks in some circumstances.

The tcp_xmit_size_goal changes were dropped. It is still possible to see
strange interactions between tcp_push_one and mtu probing: if the receiver
window is small (60k), the sender will do a "push_one" when half a window is
accumulated (30k), and that prevents mtu probing even if the sender writes
more than enough data in a single syscall.

Leonard Crestez (3):
tcp: Use smaller mtu probes if RACK is enabled
tcp: Adjust congestion window handling for mtu probe
tcp: Wait for sufficient data in tcp_mtu_probe

Documentation/networking/ip-sysctl.rst | 10 ++++
include/net/netns/ipv4.h | 2 +
net/ipv4/sysctl_net_ipv4.c | 14 ++++++
net/ipv4/tcp_ipv4.c | 2 +
net/ipv4/tcp_output.c | 70 +++++++++++++++++++++-----
5 files changed, 86 insertions(+), 12 deletions(-)


base-commit: e4e92ee78702b13ad55118d8b66f06e1aef62586
--
2.25.1


2021-05-26 16:51:30

by Leonard Crestez

Subject: [RFCv2 1/3] tcp: Use smaller mtu probes if RACK is enabled

RACK allows detecting a loss in rtt + min_rtt / 4 based on just one
extra packet. If enabled, use this instead of relying on fast retransmit.

Suggested-by: Neal Cardwell <[email protected]>
Signed-off-by: Leonard Crestez <[email protected]>
---
Documentation/networking/ip-sysctl.rst | 5 +++++
include/net/netns/ipv4.h | 1 +
net/ipv4/sysctl_net_ipv4.c | 7 +++++++
net/ipv4/tcp_ipv4.c | 1 +
net/ipv4/tcp_output.c | 26 +++++++++++++++++++++++++-
5 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a5c250044500..7ab52a105a5d 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -349,10 +349,15 @@ tcp_mtu_probe_floor - INTEGER
If MTU probing is enabled this caps the minimum MSS used for search_low
for the connection.

Default : 48

+tcp_mtu_probe_rack - BOOLEAN
+ Try to use smaller mtu probes if RACK is also enabled.
+
+ Default: 1
+
tcp_min_snd_mss - INTEGER
TCP SYN and SYNACK messages usually advertise an ADVMSS option,
as described in RFC 1122 and RFC 6691.

If this ADVMSS option is smaller than tcp_min_snd_mss,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 746c80cd4257..b4ff12f25a7f 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -112,10 +112,11 @@ struct netns_ipv4 {
#ifdef CONFIG_NET_L3_MASTER_DEV
u8 sysctl_tcp_l3mdev_accept;
#endif
u8 sysctl_tcp_mtu_probing;
int sysctl_tcp_mtu_probe_floor;
+ int sysctl_tcp_mtu_probe_rack;
int sysctl_tcp_base_mss;
int sysctl_tcp_min_snd_mss;
int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4fa77f182dcb..275c91fb9cf8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -847,10 +847,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &tcp_min_snd_mss_min,
.extra2 = &tcp_min_snd_mss_max,
},
+ {
+ .procname = "tcp_mtu_probe_rack",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probe_rack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
.mode = 0644,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4f5b68a90be9..ed8af4a7325b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2892,10 +2892,11 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
+ net->ipv4.sysctl_tcp_mtu_probe_rack = 1;

net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bde781f46b41..9691f435477b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2311,10 +2311,19 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
}

return true;
}

+/* Check if RACK loss detection can be used for the current connection */
+static int tcp_mtu_probe_is_rack(const struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ return (net->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION &&
+ net->ipv4.sysctl_tcp_mtu_probe_rack);
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
* changes resulting in larger path MTUs.
*
@@ -2351,11 +2360,26 @@ static int tcp_mtu_probe(struct sock *sk)
* smaller than a threshold, backoff from probing.
*/
mss_now = tcp_current_mss(sk);
probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
icsk->icsk_mtup.search_low) >> 1);
- size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+ /* Probing the MTU requires one packet which is larger than the current MSS
+ * as well as enough following mss-sized packets to ensure that a probe loss
+ * can be detected without a full retransmission timeout (RTO).
+ */
+ if (tcp_mtu_probe_is_rack(sk)) {
+ /* RACK allows detecting a loss in rtt + min_rtt / 4 based on just
+ * one extra packet. Use two to account for unrelated losses.
+ */
+ size_needed = probe_size + 2 * tp->mss_cache;
+ } else {
+ /* Without RACK, send enough extra packets to trigger fast retransmit.
+ * This is the dynamic DupThresh + 1.
+ */
+ size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+ }
+
interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
/* When misfortune happens, we are reprobing actively,
* and then reprobe timer has expired. We stick with current
* probing process by not resetting search range to its orignal.
*/
--
2.25.1

2021-05-26 16:51:35

by Leonard Crestez

Subject: [RFCv2 2/3] tcp: Adjust congestion window handling for mtu probe

On very fast links, after successive successful MTU probes, the cwnd (measured
in packets) shrinks and does not grow again because tcp_is_cwnd_limited returns
false unless at least half the window is being used. If snd_cwnd falls below 11
then no more probes are sent despite the link being otherwise idle.

When preparing an mtu probe, Linux checks for snd_cwnd >= 11 and for 2 more
packets to fit alongside what is currently in flight. The reasoning behind
these constants is unclear. Replace this with checks based on the required
probe size:

* Skip probing if congestion window is too small to ever fit a probe.
* Wait for the congestion window to drain if too many packets are
already in flight.

This is very similar to the snd_wnd logic, except packets are counted instead
of bytes. This patch also adds more documentation regarding how "return 0"
works in tcp_mtu_probe, because I found it difficult to understand.
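
As a worked example with assumed numbers: with mss_cache = 1448 and a
non-RACK size_needed of 8688 bytes, packets_needed =
DIV_ROUND_UP(8688, 1448) = 6. A connection with snd_cwnd = 10 passes the
new skip check (6 <= 10), but if 7 packets are already in flight then
7 + 6 > 10, so tcp_mtu_probe returns 0 and the probe is retried once acks
arrive.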

This patch allows mtu probing at smaller cwnd values and does not contradict
any standard. Since "0" is only returned when packets are in flight, no stalls
should happen except when many acks are lost.

Removing the snd_cwnd >= 11 check also allows probing to happen for
bursty traffic where the cwnd is reset to 10 after a few hundred ms of
idling.

It does not completely solve the problem of very small cwnds on fast links.

Signed-off-by: Leonard Crestez <[email protected]>
---
net/ipv4/tcp_output.c | 30 +++++++++++++++++++++---------
1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9691f435477b..362f97cfb09e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2328,32 +2328,35 @@ static int tcp_mtu_probe_is_rack(const struct sock *sk)
* changes resulting in larger path MTUs.
*
* Returns 0 if we should wait to probe (no cwnd available),
* 1 if a probe was sent,
* -1 otherwise
+ *
+ * The caller won't queue future write attempts if this returns 0. Zero is
+ * only returned if acks are pending for packets in flight, which will
+ * trigger tcp_write_xmit again later.
*/
static int tcp_mtu_probe(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
int probe_size;
int size_needed;
+ int packets_needed;
int copy, len;
int mss_now;
int interval;

/* Not currently probing/verifying,
* not in recovery,
- * have enough cwnd, and
* not SACKing (the variable headers throw things off)
*/
if (likely(!icsk->icsk_mtup.enabled ||
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;

/* Use binary search for probe_size between tcp_mss_base,
* and current mss_clamp. if (search_high - search_low)
@@ -2375,10 +2378,11 @@ static int tcp_mtu_probe(struct sock *sk)
/* Without RACK, send enough extra packets to trigger fast retransmit.
* This is the dynamic DupThresh + 1.
*/
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
}
+ packets_needed = DIV_ROUND_UP(size_needed, tp->mss_cache);

interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
/* When misfortune happens, we are reprobing actively,
* and then reprobe timer has expired. We stick with current
* probing process by not resetting search range to its orignal.
@@ -2394,22 +2398,30 @@ static int tcp_mtu_probe(struct sock *sk)

/* Have enough data in the send queue to probe? */
if (tp->write_seq - tp->snd_nxt < size_needed)
return -1;

+ /* Can the probe ever fit inside the congestion window? */
+ if (packets_needed > tp->snd_cwnd)
+ return -1;
+
+ /* Can the probe fit inside the receiver window? If not, skip probing.
+ * The receiver might increase the window as data is processed, but
+ * don't assume that.
+ * If some data is in flight (between snd_una and snd_nxt) we wait
+ * for it to clear below.
+ */
if (tp->snd_wnd < size_needed)
return -1;
+
+ /* Do we need more acks to clear space in the receive window? */
if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
return 0;

- /* Do we need to wait to drain cwnd? With none in flight, don't stall */
- if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
- if (!tcp_packets_in_flight(tp))
- return -1;
- else
- return 0;
- }
+ /* Do we need to wait for the congestion window to drain? */
+ if (tcp_packets_in_flight(tp) + packets_needed > tp->snd_cwnd)
+ return 0;

if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
return -1;

/* We're allowed to probe. Build it now. */
--
2.25.1