Tsval=tsoffset+local_clock, here tsoffset is randomized with saddr and daddr parameters in func
secure_tcp_ts_off. Most of time it is OK except for NAT mapping to the same port and daddr.
Consider the following scenario:
ns1: ns2:
+-----------+ +-----------+
| | | |
| | | |
| | | |
| veth1 | | vethb |
|192.168.1.1| |192.168.1.2|
+----+------+ +-----+-----+
| |
| |
| br0:192.168.1.254 |
+----------+----------+
veth0 | vetha
192.168.1.3 | 192.168.1.4
|
nat(192.168.1.x -->172.30.60.199)
|
V
eth0
172.30.60.199
|
|
+----> ... ... ---->server: 172.30.60.191
Let's say ns1 (192.168.1.1) generates a timestamp ts1, and ns2 (192.168.1.2) generates a timestamp
ts2, with ts1 > ts2.
If ns1 initiates a connection to a server, and then the server actively closes the connection,
entering the TIME_WAIT state, and ns2 attempts to connect to the server while port reuse is in
progress, due to the presence of NAT, the server sees both connections as originating from the
same IP address (e.g., 172.30.60.199) and port. However, since ts2 is smaller than ts1, the server
will respond with the acknowledgment (ACK) for the fourth handshake.
SERVER CLIENT
1. ESTABLISHED ESTABLISHED
(Close)
2. FIN-WAIT-1 --> <SEQ=100><ACK=300><TSval=20><CTL=FIN,ACK> --> CLOSE-WAIT
3. FIN-WAIT-2 <-- <SEQ=300><ACK=101><TSval=40><CTL=ACK> <-- CLOSE-WAIT
(Close)
4. TIME-WAIT <-- <SEQ=300><ACK=101><TSval=41><CTL=FIN,ACK> <-- LAST-ACK
5. TIME-WAIT --> <SEQ=101><ACK=301><TSval=25><CTL=ACK> --> CLOSED
- - - - - - - - - - - - - port reused - - - - - - - - - - - - - - -
5.1. TIME-WAIT <-- <SEQ=255><TSval=30><CTL=SYN> <-- SYN-SENT
5.2. TIME-WAIT --> <SEQ=101><ACK=301><TSval=35><CTL=ACK> --> SYN-SENT
5.3. CLOSED <-- <SEQ=301><CTL=RST> <-- SYN-SENT
6. SYN-RECV <-- <SEQ=255><TSval=34><CTL=SYN> <-- SYN-SENT
7. SYN-RECV --> <SEQ=400><ACK=301><TSval=40><CTL=SYN,ACK> --> ESTABLISHED
1. ESTABLISH <-- <SEQ=301><ACK=401><TSval=55><CTL=ACK> <-- ESTABLISHED
This enhancement uses sport and daddr rather than saddr and daddr, which keep the timestamp
monotonically increasing in the situation described above. Then the port reuse is like this:
SERVER CLIENT
1. ESTABLISHED ESTABLISHED
(Close)
2. FIN-WAIT-1 --> <SEQ=100><ACK=300><TSval=20><CTL=FIN,ACK> --> CLOSE-WAIT
3. FIN-WAIT-2 <-- <SEQ=300><ACK=101><TSval=40><CTL=ACK> <-- CLOSE-WAIT
(Close)
4. TIME-WAIT <-- <SEQ=300><ACK=101><TSval=41><CTL=FIN,ACK> <-- LAST-ACK
5. TIME-WAIT --> <SEQ=101><ACK=301><TSval=25><CTL=ACK> --> CLOSED
- - - - - - - - - - - - - port reused - - - - - - - - - - - - - - -
5.1. TIME-WAIT <-- <SEQ=300><TSval=50><CTL=SYN> <-- SYN-SENT
6. SYN-RECV --> <SEQ=400><ACK=301><TSval=40><CTL=SYN,ACK> --> ESTABLISHED
1. ESTABLISH <-- <SEQ=301><ACK=401><TSval=55><CTL=ACK> <-- ESTABLISHED
The enhancement lets port reused more efficiently.
Signed-off-by: George Guo <[email protected]>
---
include/net/secure_seq.h | 2 +-
net/core/secure_seq.c | 4 ++--
net/ipv4/syncookies.c | 4 ++--
net/ipv4/tcp_ipv4.c | 6 ++++--
4 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 21e7fa2a1813..40fb53520aa4 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -11,7 +11,7 @@ u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport);
u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport);
-u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr);
+u32 secure_tcp_ts_off(const struct net *net, __be16 sport, __be32 daddr);
u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport);
u32 secure_tcpv6_ts_off(const struct net *net,
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index b0ff6153be62..575b6afe39a4 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -118,13 +118,13 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
-u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
+u32 secure_tcp_ts_off(const struct net *net, __be16 sport, __be32 daddr)
{
if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
return 0;
ts_secret_init();
- return siphash_2u32((__force u32)saddr, (__force u32)daddr,
+ return siphash_2u32((__force u32)sport, (__force u32)daddr,
&ts_secret);
}
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index dc478a0574cb..df1757ff5956 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -360,8 +360,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
tsoff = secure_tcp_ts_off(sock_net(sk),
- ip_hdr(skb)->daddr,
- ip_hdr(skb)->saddr);
+ th->source,
+ ip_hdr(skb)->daddr);
tcp_opt.rcv_tsecr -= tsoff;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 27140e5cdc06..acad4b14ecf7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -104,7 +104,9 @@ static u32 tcp_v4_init_seq(const struct sk_buff *skb)
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
- return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ return secure_tcp_ts_off(net, th->source, ip_hdr(skb)->daddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -309,7 +311,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_sport,
usin->sin_port));
WRITE_ONCE(tp->tsoffset,
- secure_tcp_ts_off(net, inet->inet_saddr,
+ secure_tcp_ts_off(net, inet->inet_sport,
inet->inet_daddr));
}
--
2.34.1
On Mon, Sep 18, 2023 at 3:46 AM George Guo <[email protected]> wrote:
>
> Tsval=tsoffset+local_clock, here tsoffset is randomized with saddr and daddr parameters in func
> secure_tcp_ts_off. Most of time it is OK except for NAT mapping to the same port and daddr.
> Consider the following scenario:
> ns1: ns2:
> +-----------+ +-----------+
> | | | |
> | | | |
> | | | |
> | veth1 | | vethb |
> |192.168.1.1| |192.168.1.2|
> +----+------+ +-----+-----+
> | |
> | |
> | br0:192.168.1.254 |
> +----------+----------+
> veth0 | vetha
> 192.168.1.3 | 192.168.1.4
> |
> nat(192.168.1.x -->172.30.60.199)
> |
> V
> eth0
> 172.30.60.199
> |
> |
> +----> ... ... ---->server: 172.30.60.191
>
> Let's say ns1 (192.168.1.1) generates a timestamp ts1, and ns2 (192.168.1.2) generates a timestamp
> ts2, with ts1 > ts2.
>
> If ns1 initiates a connection to a server, and then the server actively closes the connection,
> entering the TIME_WAIT state, and ns2 attempts to connect to the server while port reuse is in
> progress, due to the presence of NAT, the server sees both connections as originating from the
> same IP address (e.g., 172.30.60.199) and port. However, since ts2 is smaller than ts1, the server
> will respond with the acknowledgment (ACK) for the fourth handshake.
>
> SERVER CLIENT
>
> 1. ESTABLISHED ESTABLISHED
>
> (Close)
> 2. FIN-WAIT-1 --> <SEQ=100><ACK=300><TSval=20><CTL=FIN,ACK> --> CLOSE-WAIT
>
> 3. FIN-WAIT-2 <-- <SEQ=300><ACK=101><TSval=40><CTL=ACK> <-- CLOSE-WAIT
>
> (Close)
> 4. TIME-WAIT <-- <SEQ=300><ACK=101><TSval=41><CTL=FIN,ACK> <-- LAST-ACK
>
> 5. TIME-WAIT --> <SEQ=101><ACK=301><TSval=25><CTL=ACK> --> CLOSED
>
> - - - - - - - - - - - - - port reused - - - - - - - - - - - - - - -
>
> 5.1. TIME-WAIT <-- <SEQ=255><TSval=30><CTL=SYN> <-- SYN-SENT
>
> 5.2. TIME-WAIT --> <SEQ=101><ACK=301><TSval=35><CTL=ACK> --> SYN-SENT
>
> 5.3. CLOSED <-- <SEQ=301><CTL=RST> <-- SYN-SENT
>
> 6. SYN-RECV <-- <SEQ=255><TSval=34><CTL=SYN> <-- SYN-SENT
>
> 7. SYN-RECV --> <SEQ=400><ACK=301><TSval=40><CTL=SYN,ACK> --> ESTABLISHED
>
> 1. ESTABLISH <-- <SEQ=301><ACK=401><TSval=55><CTL=ACK> <-- ESTABLISHED
>
> This enhancement uses sport and daddr rather than saddr and daddr, which keep the timestamp
> monotonically increasing in the situation described above. Then the port reuse is like this:
>
> SERVER CLIENT
>
> 1. ESTABLISHED ESTABLISHED
>
> (Close)
> 2. FIN-WAIT-1 --> <SEQ=100><ACK=300><TSval=20><CTL=FIN,ACK> --> CLOSE-WAIT
>
> 3. FIN-WAIT-2 <-- <SEQ=300><ACK=101><TSval=40><CTL=ACK> <-- CLOSE-WAIT
>
> (Close)
> 4. TIME-WAIT <-- <SEQ=300><ACK=101><TSval=41><CTL=FIN,ACK> <-- LAST-ACK
>
> 5. TIME-WAIT --> <SEQ=101><ACK=301><TSval=25><CTL=ACK> --> CLOSED
>
> - - - - - - - - - - - - - port reused - - - - - - - - - - - - - - -
>
> 5.1. TIME-WAIT <-- <SEQ=300><TSval=50><CTL=SYN> <-- SYN-SENT
>
> 6. SYN-RECV --> <SEQ=400><ACK=301><TSval=40><CTL=SYN,ACK> --> ESTABLISHED
>
> 1. ESTABLISH <-- <SEQ=301><ACK=401><TSval=55><CTL=ACK> <-- ESTABLISHED
>
> The enhancement lets port reused more efficiently.
>
> Signed-off-by: George Guo <[email protected]>
>
CC Florian
I do not think we can 'fix' tcp timestamp vs NAT.
Unless the NAT device makes sure a port is dedicated for a peer,
and/or the NAT rewrites TS values
(which would be bad).
I personally prefer seeing the same timestamps from A to B regardless
of ports, it helps detect various issues.
Also, you seem to forget IPv6.
George Guo <[email protected]> wrote:
> Tsval=tsoffset+local_clock, here tsoffset is randomized with saddr and daddr parameters in func
> secure_tcp_ts_off. Most of time it is OK except for NAT mapping to the same port and daddr.
> Consider the following scenario:
> ns1: ns2:
> +-----------+ +-----------+
> | | | |
> | | | |
> | | | |
> | veth1 | | vethb |
> |192.168.1.1| |192.168.1.2|
> +----+------+ +-----+-----+
> | |
> | |
> | br0:192.168.1.254 |
> +----------+----------+
> veth0 | vetha
> 192.168.1.3 | 192.168.1.4
> |
> nat(192.168.1.x -->172.30.60.199)
> |
> V
> eth0
> 172.30.60.199
> |
> |
> +----> ... ... ---->server: 172.30.60.191
>
> Let's say ns1 (192.168.1.1) generates a timestamp ts1, and ns2 (192.168.1.2) generates a timestamp
> ts2, with ts1 > ts2.
>
> If ns1 initiates a connection to a server, and then the server actively closes the connection,
> entering the TIME_WAIT state, and ns2 attempts to connect to the server while port reuse is in
> progress, due to the presence of NAT, the server sees both connections as originating from the
> same IP address (e.g., 172.30.60.199) and port. However, since ts2 is smaller than ts1, the server
> will respond with the acknowledgment (ACK) for the fourth handshake.
>
> SERVER CLIENT
>
> 1. ESTABLISHED ESTABLISHED
>
> (Close)
> 2. FIN-WAIT-1 --> <SEQ=100><ACK=300><TSval=20><CTL=FIN,ACK> --> CLOSE-WAIT
>
> 3. FIN-WAIT-2 <-- <SEQ=300><ACK=101><TSval=40><CTL=ACK> <-- CLOSE-WAIT
>
> (Close)
> 4. TIME-WAIT <-- <SEQ=300><ACK=101><TSval=41><CTL=FIN,ACK> <-- LAST-ACK
>
> 5. TIME-WAIT --> <SEQ=101><ACK=301><TSval=25><CTL=ACK> --> CLOSED
>
> - - - - - - - - - - - - - port reused - - - - - - - - - - - - - - -
>
> 5.1. TIME-WAIT <-- <SEQ=255><TSval=30><CTL=SYN> <-- SYN-SENT
>
> 5.2. TIME-WAIT --> <SEQ=101><ACK=301><TSval=35><CTL=ACK> --> SYN-SENT
>
> 5.3. CLOSED <-- <SEQ=301><CTL=RST> <-- SYN-SENT
>
> 6. SYN-RECV <-- <SEQ=255><TSval=34><CTL=SYN> <-- SYN-SENT
>
> 7. SYN-RECV --> <SEQ=400><ACK=301><TSval=40><CTL=SYN,ACK> --> ESTABLISHED
>
> 1. ESTABLISH <-- <SEQ=301><ACK=401><TSval=55><CTL=ACK> <-- ESTABLISHED
>
> This enhancement uses sport and daddr rather than saddr and daddr, which keep the timestamp
> monotonically increasing in the situation described above. Then the port reuse is like this:
We used to have per-connection timestamps, i.e. hash used to include
port numbers as well.
Unfortunately there were problem reports, too many devices expect
monotonically increasing ts from the same address.
See 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets")
So, I don't think we can safely substitute saddr with sport.