There is an issue with bonding mode BOND_MODE_BROADCAST when the two
slaves have different affinity, so duplicate packets are handled by
different cpus. These are the two pre-conditions for this case.

When the two slaves receive the same SYN packet at the same time,
two request socks (reqsk) will be created if the following happens:
1. syn1 reaches tcp_conn_request, which creates reqsk1 but has not yet
   called inet_csk_reqsk_queue_hash_add.
2. syn2 reaches tcp_v4_rcv; because it cannot find reqsk1 via
   __inet_lookup_skb, it also goes to tcp_conn_request and creates reqsk2.
Then reqsk1 and reqsk2 are both added to the established hash table, and
two synacks with different sequence numbers (seq1 and seq2) are sent to the
client. When the tcp ack arrives it is processed in tcp_v4_rcv and
tcp_check_req; if __inet_lookup_skb finds reqsk2 while the ack acknowledges
seq1, the check

	TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1

is true, so a tcp rst is sent to the client and the connection is closed.
To fix this, call __inet_lookup_established() before __sk_nulls_add_node_rcu()
in inet_ehash_insert(). If a reqsk with the same tuple already exists in the
established hash table, drop the current reqsk2 directly and do not send a
synack to the client.
Signed-off-by: Mao Wenan <[email protected]>
---
v2: move __inet_lookup_established from tcp_conn_request() to inet_ehash_insert()
as Eric suggested.
---
include/net/inet_connection_sock.h | 2 +-
net/ipv4/inet_connection_sock.c | 16 ++++++++++++----
net/ipv4/inet_hashtables.c | 13 +++++++++++++
net/ipv4/tcp_input.c | 7 ++++---
4 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c57d53e7e02c..2d3538e333cb 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -263,7 +263,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
struct request_sock *req,
struct sock *child);
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout);
struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
struct request_sock *req,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 13ec7c3a9c49..fd45ed2fd985 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -749,7 +749,7 @@ static void reqsk_timer_handler(struct timer_list *t)
inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}
-static void reqsk_queue_hash_req(struct request_sock *req,
+static bool reqsk_queue_hash_req(struct request_sock *req,
unsigned long timeout)
{
req->num_retrans = 0;
@@ -759,19 +759,27 @@ static void reqsk_queue_hash_req(struct request_sock *req,
timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
mod_timer(&req->rsk_timer, jiffies + timeout);
- inet_ehash_insert(req_to_sk(req), NULL);
+ if (!inet_ehash_insert(req_to_sk(req), NULL)) {
+ if (timer_pending(&req->rsk_timer))
+ del_timer_sync(&req->rsk_timer);
+ return false;
+ }
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
refcount_set(&req->rsk_refcnt, 2 + 1);
+ return true;
}
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
- reqsk_queue_hash_req(req, timeout);
+ if (!reqsk_queue_hash_req(req, timeout))
+ return false;
+
inet_csk_reqsk_queue_added(sk);
+ return true;
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c4503073248b..b6a1b5334565 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -477,6 +477,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
struct inet_ehash_bucket *head;
spinlock_t *lock;
bool ret = true;
+ struct sock *reqsk = NULL;
WARN_ON_ONCE(!sk_unhashed(sk));
@@ -486,6 +487,18 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock(lock);
+ if (!osk)
+ reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
+ sk->sk_daddr, sk->sk_dport,
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_bound_dev_if, sk->sk_bound_dev_if);
+ if (unlikely(reqsk)) {
+ ret = false;
+ reqsk_free(inet_reqsk(sk));
+ spin_unlock(lock);
+ return ret;
+ }
+
if (osk) {
WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
ret = sk_nulls_del_node_init_rcu(osk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 38dfc308c0fb..358272394590 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6570,9 +6570,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
- if (!want_cookie)
- inet_csk_reqsk_queue_hash_add(sk, req,
- tcp_timeout_init((struct sock *)req));
+ if (!want_cookie && !inet_csk_reqsk_queue_hash_add(sk, req,
+ tcp_timeout_init((struct sock *)req)))
+ return 0;
+
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
TCP_SYNACK_COOKIE);
--
2.20.1
From: Mao Wenan <[email protected]>
Date: Wed, 12 Jun 2019 11:57:15 +0800
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index c4503073248b..b6a1b5334565 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -477,6 +477,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
> struct inet_ehash_bucket *head;
> spinlock_t *lock;
> bool ret = true;
> + struct sock *reqsk = NULL;
Please preserve the reverse christmas tree local variable ordering here.
Thank you.
On Tue, Jun 11, 2019 at 8:49 PM Mao Wenan <[email protected]> wrote:
>
> There is an issue with bonding mode BOND_MODE_BROADCAST when the two
> slaves have different affinity, so duplicate packets are handled by
> different cpus. These are the two pre-conditions for this case.
>
> When the two slaves receive the same SYN packet at the same time,
> two request socks (reqsk) will be created if the following happens:
> 1. syn1 reaches tcp_conn_request, which creates reqsk1 but has not yet
>    called inet_csk_reqsk_queue_hash_add.
> 2. syn2 reaches tcp_v4_rcv; because it cannot find reqsk1 via
>    __inet_lookup_skb, it also goes to tcp_conn_request and creates reqsk2.
>
> Then reqsk1 and reqsk2 are both added to the established hash table, and
> two synacks with different sequence numbers (seq1 and seq2) are sent to the
> client. When the tcp ack arrives it is processed in tcp_v4_rcv and
> tcp_check_req; if __inet_lookup_skb finds reqsk2 while the ack acknowledges
> seq1, the check
>       TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1
> is true, so a tcp rst is sent to the client and the connection is closed.
>
> To fix this, call __inet_lookup_established() before __sk_nulls_add_node_rcu()
> in inet_ehash_insert(). If a reqsk with the same tuple already exists in the
> established hash table, drop the current reqsk2 directly and do not send a
> synack to the client.
>
> Signed-off-by: Mao Wenan <[email protected]>
> ---
> v2: move __inet_lookup_established from tcp_conn_request() to inet_ehash_insert()
> as Eric suggested.
> ---
> include/net/inet_connection_sock.h | 2 +-
> net/ipv4/inet_connection_sock.c | 16 ++++++++++++----
> net/ipv4/inet_hashtables.c | 13 +++++++++++++
> net/ipv4/tcp_input.c | 7 ++++---
> 4 files changed, 30 insertions(+), 8 deletions(-)
>
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index c57d53e7e02c..2d3538e333cb 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -263,7 +263,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
> struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
> struct request_sock *req,
> struct sock *child);
> -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
> +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
> unsigned long timeout);
> struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
> struct request_sock *req,
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 13ec7c3a9c49..fd45ed2fd985 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -749,7 +749,7 @@ static void reqsk_timer_handler(struct timer_list *t)
> inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
> }
>
> -static void reqsk_queue_hash_req(struct request_sock *req,
> +static bool reqsk_queue_hash_req(struct request_sock *req,
> unsigned long timeout)
> {
> req->num_retrans = 0;
> @@ -759,19 +759,27 @@ static void reqsk_queue_hash_req(struct request_sock *req,
> timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
> mod_timer(&req->rsk_timer, jiffies + timeout);
>
> - inet_ehash_insert(req_to_sk(req), NULL);
> + if (!inet_ehash_insert(req_to_sk(req), NULL)) {
> + if (timer_pending(&req->rsk_timer))
> + del_timer_sync(&req->rsk_timer);
> + return false;
> + }
> /* before letting lookups find us, make sure all req fields
> * are committed to memory and refcnt initialized.
> */
> smp_wmb();
> refcount_set(&req->rsk_refcnt, 2 + 1);
> + return true;
> }
>
> -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
> +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
> unsigned long timeout)
> {
> - reqsk_queue_hash_req(req, timeout);
> + if (!reqsk_queue_hash_req(req, timeout))
> + return false;
> +
> inet_csk_reqsk_queue_added(sk);
> + return true;
> }
> EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
>
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index c4503073248b..b6a1b5334565 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -477,6 +477,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
> struct inet_ehash_bucket *head;
> spinlock_t *lock;
> bool ret = true;
> + struct sock *reqsk = NULL;
>
> WARN_ON_ONCE(!sk_unhashed(sk));
>
> @@ -486,6 +487,18 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
> lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
>
> spin_lock(lock);
> + if (!osk)
> + reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
> + sk->sk_daddr, sk->sk_dport,
> + sk->sk_rcv_saddr, sk->sk_num,
> + sk->sk_bound_dev_if, sk->sk_bound_dev_if);
> + if (unlikely(reqsk)) {
Why would the reqsk found here be a SYN_RECV socket, and not an ESTABLISHED
one (or a TIME_WAIT)?
> + ret = false;
> + reqsk_free(inet_reqsk(sk));
> + spin_unlock(lock);
> + return ret;
> + }
> +
> if (osk) {
This test should have been a hint here: sometimes we _expect_ to find an
old socket (TIMEWAIT) and remove it.
> WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
> ret = sk_nulls_del_node_init_rcu(osk);
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 38dfc308c0fb..358272394590 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -6570,9 +6570,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> sock_put(fastopen_sk);
> } else {
> tcp_rsk(req)->tfo_listener = false;
> - if (!want_cookie)
> - inet_csk_reqsk_queue_hash_add(sk, req,
> - tcp_timeout_init((struct sock *)req));
> + if (!want_cookie && !inet_csk_reqsk_queue_hash_add(sk, req,
> + tcp_timeout_init((struct sock *)req)))
> + return 0;
> +
> af_ops->send_synack(sk, dst, &fl, req, &foc,
> !want_cookie ? TCP_SYNACK_NORMAL :
> TCP_SYNACK_COOKIE);
> --
> 2.20.1
>
I believe the proper fix is more complicated.
Probably we need to move the locking to a less deep location.
(Also a similar fix would be needed in IPv6)
Thanks.
On 2019/6/13 0:25, David Miller wrote:
> From: Mao Wenan <[email protected]>
> Date: Wed, 12 Jun 2019 11:57:15 +0800
>
>> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
>> index c4503073248b..b6a1b5334565 100644
>> --- a/net/ipv4/inet_hashtables.c
>> +++ b/net/ipv4/inet_hashtables.c
>> @@ -477,6 +477,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
>> struct inet_ehash_bucket *head;
>> spinlock_t *lock;
>> bool ret = true;
>> + struct sock *reqsk = NULL;
>
> Please preserve the reverse christmas tree local variable ordering here.
ok, thanks.
>
> Thank you.
>
> .
>
On 2019/6/13 0:30, Eric Dumazet wrote:
> On Tue, Jun 11, 2019 at 8:49 PM Mao Wenan <[email protected]> wrote:
>>
>> There is an issue with bonding mode BOND_MODE_BROADCAST when the two
>> slaves have different affinity, so duplicate packets are handled by
>> different cpus. These are the two pre-conditions for this case.
>>
>> When the two slaves receive the same SYN packet at the same time,
>> two request socks (reqsk) will be created if the following happens:
>> 1. syn1 reaches tcp_conn_request, which creates reqsk1 but has not yet
>>    called inet_csk_reqsk_queue_hash_add.
>> 2. syn2 reaches tcp_v4_rcv; because it cannot find reqsk1 via
>>    __inet_lookup_skb, it also goes to tcp_conn_request and creates reqsk2.
>>
>> Then reqsk1 and reqsk2 are both added to the established hash table, and
>> two synacks with different sequence numbers (seq1 and seq2) are sent to the
>> client. When the tcp ack arrives it is processed in tcp_v4_rcv and
>> tcp_check_req; if __inet_lookup_skb finds reqsk2 while the ack acknowledges
>> seq1, the check
>>       TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1
>> is true, so a tcp rst is sent to the client and the connection is closed.
>>
>> To fix this, call __inet_lookup_established() before __sk_nulls_add_node_rcu()
>> in inet_ehash_insert(). If a reqsk with the same tuple already exists in the
>> established hash table, drop the current reqsk2 directly and do not send a
>> synack to the client.
>>
>> Signed-off-by: Mao Wenan <[email protected]>
>> ---
>> v2: move __inet_lookup_established from tcp_conn_request() to inet_ehash_insert()
>> as Eric suggested.
>> ---
>> include/net/inet_connection_sock.h | 2 +-
>> net/ipv4/inet_connection_sock.c | 16 ++++++++++++----
>> net/ipv4/inet_hashtables.c | 13 +++++++++++++
>> net/ipv4/tcp_input.c | 7 ++++---
>> 4 files changed, 30 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
>> index c57d53e7e02c..2d3538e333cb 100644
>> --- a/include/net/inet_connection_sock.h
>> +++ b/include/net/inet_connection_sock.h
>> @@ -263,7 +263,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
>> struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
>> struct request_sock *req,
>> struct sock *child);
>> -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
>> +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
>> unsigned long timeout);
>> struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
>> struct request_sock *req,
>> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
>> index 13ec7c3a9c49..fd45ed2fd985 100644
>> --- a/net/ipv4/inet_connection_sock.c
>> +++ b/net/ipv4/inet_connection_sock.c
>> @@ -749,7 +749,7 @@ static void reqsk_timer_handler(struct timer_list *t)
>> inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
>> }
>>
>> -static void reqsk_queue_hash_req(struct request_sock *req,
>> +static bool reqsk_queue_hash_req(struct request_sock *req,
>> unsigned long timeout)
>> {
>> req->num_retrans = 0;
>> @@ -759,19 +759,27 @@ static void reqsk_queue_hash_req(struct request_sock *req,
>> timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
>> mod_timer(&req->rsk_timer, jiffies + timeout);
>>
>> - inet_ehash_insert(req_to_sk(req), NULL);
>> + if (!inet_ehash_insert(req_to_sk(req), NULL)) {
>> + if (timer_pending(&req->rsk_timer))
>> + del_timer_sync(&req->rsk_timer);
>> + return false;
>> + }
>> /* before letting lookups find us, make sure all req fields
>> * are committed to memory and refcnt initialized.
>> */
>> smp_wmb();
>> refcount_set(&req->rsk_refcnt, 2 + 1);
>> + return true;
>> }
>>
>> -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
>> +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
>> unsigned long timeout)
>> {
>> - reqsk_queue_hash_req(req, timeout);
>> + if (!reqsk_queue_hash_req(req, timeout))
>> + return false;
>> +
>> inet_csk_reqsk_queue_added(sk);
>> + return true;
>> }
>> EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
>>
>> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
>> index c4503073248b..b6a1b5334565 100644
>> --- a/net/ipv4/inet_hashtables.c
>> +++ b/net/ipv4/inet_hashtables.c
>> @@ -477,6 +477,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
>> struct inet_ehash_bucket *head;
>> spinlock_t *lock;
>> bool ret = true;
>> + struct sock *reqsk = NULL;
>>
>> WARN_ON_ONCE(!sk_unhashed(sk));
>>
>> @@ -486,6 +487,18 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
>> lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
>>
>> spin_lock(lock);
>> + if (!osk)
>> + reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
>> + sk->sk_daddr, sk->sk_dport,
>> + sk->sk_rcv_saddr, sk->sk_num,
>> + sk->sk_bound_dev_if, sk->sk_bound_dev_if);
>> + if (unlikely(reqsk)) {
>
> What reqsk would be a SYN_RECV socket, and not a ESTABLISH one (or a
> TIME_WAIT ?)
It wouldn't be SYN_RECV, ESTABLISHED or TIME_WAIT, just TCP_NEW_SYN_RECV.
When the server receives the third-handshake ACK, the SYN_RECV sk is inserted into the hash with osk != NULL.
The lookup here only avoids creating two or more TCP_NEW_SYN_RECV request socks when a syn packet is received.
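For reference, these are the two insertion paths as I read the current code
(written from memory, so please double-check the exact call chains):

SYN received (osk == NULL, sk is a TCP_NEW_SYN_RECV reqsk):
  tcp_conn_request
    inet_csk_reqsk_queue_hash_add
      reqsk_queue_hash_req
        inet_ehash_insert(req_to_sk(req), NULL)

third-handshake ACK received (osk != NULL, the child replaces the reqsk):
  tcp_check_req
    tcp_v4_syn_recv_sock
      inet_ehash_nolisten(newsk, req_to_sk(req_unhash))
        inet_ehash_insert(newsk, osk)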
>
>> + ret = false;
>> + reqsk_free(inet_reqsk(sk));
>> + spin_unlock(lock);
>> + return ret;
>> + }
>> +
>> if (osk) {
>
> This test should have be a hint here : Sometime we _expect_ to have an
> old socket (TIMEWAIT) and remove it
I will check TIMEWAIT sk.
>
>
>> WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
>> ret = sk_nulls_del_node_init_rcu(osk);
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index 38dfc308c0fb..358272394590 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -6570,9 +6570,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>> sock_put(fastopen_sk);
>> } else {
>> tcp_rsk(req)->tfo_listener = false;
>> - if (!want_cookie)
>> - inet_csk_reqsk_queue_hash_add(sk, req,
>> - tcp_timeout_init((struct sock *)req));
>> + if (!want_cookie && !inet_csk_reqsk_queue_hash_add(sk, req,
>> + tcp_timeout_init((struct sock *)req)))
>> + return 0;
>> +
>> af_ops->send_synack(sk, dst, &fl, req, &foc,
>> !want_cookie ? TCP_SYNACK_NORMAL :
>> TCP_SYNACK_COOKIE);
>> --
>> 2.20.1
>>
>
> I believe the proper fix is more complicated.
yes, pretty complicated.
>
> Probably we need to move the locking in a less deeper location.
>
> (Also a similar fix would be needed in IPv6)
ok
>
> Thanks.
>
> .
>
>>> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
>>> index 13ec7c3a9c49..fd45ed2fd985 100644
>>> --- a/net/ipv4/inet_connection_sock.c
>>> +++ b/net/ipv4/inet_connection_sock.c
>>> @@ -749,7 +749,7 @@ static void reqsk_timer_handler(struct timer_list *t)
>>> inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
>>> }
>>>
>>> -static void reqsk_queue_hash_req(struct request_sock *req,
>>> +static bool reqsk_queue_hash_req(struct request_sock *req,
>>> unsigned long timeout)
>>> {
>>> req->num_retrans = 0;
>>> @@ -759,19 +759,27 @@ static void reqsk_queue_hash_req(struct request_sock *req,
>>> timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
>>> mod_timer(&req->rsk_timer, jiffies + timeout);
>>>
>>> - inet_ehash_insert(req_to_sk(req), NULL);
>>> + if (!inet_ehash_insert(req_to_sk(req), NULL)) {
>>> + if (timer_pending(&req->rsk_timer))
>>> + del_timer_sync(&req->rsk_timer);
>>> + return false;
>>> + }
>>> /* before letting lookups find us, make sure all req fields
>>> * are committed to memory and refcnt initialized.
>>> */
>>> smp_wmb();
>>> refcount_set(&req->rsk_refcnt, 2 + 1);
>>> + return true;
>>> }
>>>
>>> -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
>>> +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
>>> unsigned long timeout)
>>> {
>>> - reqsk_queue_hash_req(req, timeout);
>>> + if (!reqsk_queue_hash_req(req, timeout))
>>> + return false;
>>> +
>>> inet_csk_reqsk_queue_added(sk);
>>> + return true;
>>> }
>>> EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
>>>
>>> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
>>> index c4503073248b..b6a1b5334565 100644
>>> --- a/net/ipv4/inet_hashtables.c
>>> +++ b/net/ipv4/inet_hashtables.c
>>> @@ -477,6 +477,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
>>> struct inet_ehash_bucket *head;
>>> spinlock_t *lock;
>>> bool ret = true;
>>> + struct sock *reqsk = NULL;
>>>
>>> WARN_ON_ONCE(!sk_unhashed(sk));
>>>
>>> @@ -486,6 +487,18 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
>>> lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
>>>
>>> spin_lock(lock);
>>> + if (!osk)
>>> + reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
>>> + sk->sk_daddr, sk->sk_dport,
>>> + sk->sk_rcv_saddr, sk->sk_num,
>>> + sk->sk_bound_dev_if, sk->sk_bound_dev_if);
>>> + if (unlikely(reqsk)) {
>>
>> What reqsk would be a SYN_RECV socket, and not a ESTABLISH one (or a
>> TIME_WAIT ?)
>
> It wouldn't be SYN_RECV, ESTABLISHED or TIME_WAIT, just TCP_NEW_SYN_RECV.
>
> When the server receives the third-handshake ACK, the SYN_RECV sk is inserted into the hash with osk != NULL.
> The lookup here only avoids creating two or more TCP_NEW_SYN_RECV request socks when a syn packet is received.
>
@Eric, for this issue I only want to check TCP_NEW_SYN_RECV sk; is it OK like below?
+ if (!osk && sk->sk_state == TCP_NEW_SYN_RECV)
+ reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
+ sk->sk_daddr, sk->sk_dport,
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_bound_dev_if, sk->sk_bound_dev_if);
+ if (unlikely(reqsk)) {
>>
>>> + ret = false;
>>> + reqsk_free(inet_reqsk(sk));
>>> + spin_unlock(lock);
>>> + return ret;
>>> + }
>>> +
>>> if (osk) {
>>
>> This test should have be a hint here : Sometime we _expect_ to have an
>> old socket (TIMEWAIT) and remove it
> I will check TIMEWAIT sk.
>>
>>
>>> WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
>>> ret = sk_nulls_del_node_init_rcu(osk);
>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>> index 38dfc308c0fb..358272394590 100644
>>> --- a/net/ipv4/tcp_input.c
>>> +++ b/net/ipv4/tcp_input.c
>>> @@ -6570,9 +6570,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>>> sock_put(fastopen_sk);
>>> } else {
>>> tcp_rsk(req)->tfo_listener = false;
>>> - if (!want_cookie)
>>> - inet_csk_reqsk_queue_hash_add(sk, req,
>>> - tcp_timeout_init((struct sock *)req));
>>> + if (!want_cookie && !inet_csk_reqsk_queue_hash_add(sk, req,
>>> + tcp_timeout_init((struct sock *)req)))
>>> + return 0;
>>> +
>>> af_ops->send_synack(sk, dst, &fl, req, &foc,
>>> !want_cookie ? TCP_SYNACK_NORMAL :
>>> TCP_SYNACK_COOKIE);
>>> --
>>> 2.20.1
>>>
>>
>> I believe the proper fix is more complicated.
> yes, pretty complicated.
>>
>> Probably we need to move the locking in a less deeper location.
Currently I find inet_ehash_insert() the most suitable location for the hash lookup,
because the bucket lock can be derived from sk_hash and the spin_lock code already exists there.
In v1 I put the lookup in tcp_conn_request(), which required a redundant lock just for the lookup.
>
>>
>> (Also a similar fix would be needed in IPv6)
> ok
I find IPv6 goes through the same call trace, so this fix seems to cover IPv6 as well
(the child-socket insertion path, sketched after the trace below, ends up in the same function too):
tcp_v6_conn_request
  tcp_conn_request
    inet_csk_reqsk_queue_hash_add
      reqsk_queue_hash_req
        inet_ehash_insert
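And, as far as I can see, the child-socket (osk != NULL) side for IPv6 funnels into
the same function as well (again just from reading the code, please correct me if I
got the chain wrong):

tcp_v6_rcv
  tcp_check_req
    tcp_v6_syn_recv_sock
      inet_ehash_nolisten
        inet_ehash_insert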
>>
>> Thanks.
>>
>> .
>>
On 6/13/19 9:19 PM, maowenan wrote:
>
>
> @Eric, for this issue I only want to check TCP_NEW_SYN_RECV sk, is it OK like below?
> + if (!osk && sk->sk_state == TCP_NEW_SYN_RECV)
> + reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
> + sk->sk_daddr, sk->sk_dport,
> + sk->sk_rcv_saddr, sk->sk_num,
> + sk->sk_bound_dev_if, sk->sk_bound_dev_if);
> + if (unlikely(reqsk)) {
>
Not enough.
If we have many cpus here, there is a chance another cpu has inserted a request
socket, then replaced it with an ESTABLISHED socket for the same 4-tuple.
We need to take the per-bucket spinlock much sooner.
And this is fine, all that matters is that we no longer grab the listener spinlock.
On 2019/6/14 12:28, Eric Dumazet wrote:
>
>
> On 6/13/19 9:19 PM, maowenan wrote:
>>
>>
>> @Eric, for this issue I only want to check TCP_NEW_SYN_RECV sk, is it OK like below?
>> + if (!osk && sk->sk_state == TCP_NEW_SYN_RECV)
>> + reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
>> + sk->sk_daddr, sk->sk_dport,
>> + sk->sk_rcv_saddr, sk->sk_num,
>> + sk->sk_bound_dev_if, sk->sk_bound_dev_if);
>> + if (unlikely(reqsk)) {
>>
>
> Not enough.
>
> If we have many cpus here, there is a chance another cpu has inserted a request socket, then
> replaced it by an ESTABLISH socket for the same 4-tuple.
I am trying to get a clearer picture of the scenario you mentioned. I have done some
testing on this, and it works well when I use multiple cpus.

The ESTABLISHED socket would come from tcp_check_req->tcp_v4_syn_recv_sock->tcp_create_openreq_child,
and on that path inet_ehash_nolisten passes a non-NULL osk, so my patch does not call
__inet_lookup_established in inet_ehash_insert().

When a TCP_NEW_SYN_RECV socket tries to insert itself into the hash table it passes osk == NULL,
and my patch then checks whether a reqsk with the same tuple already exists in the hash table.
If such a reqsk exists, the new reqsk is simply dropped and not inserted, so the synack for it
is never sent to the client, no ack comes back, and no ESTABLISHED socket can replace it in the
hash table.

So I don't see the race when there are many cpus. Can you give me some more clues?
Thank you.
>
> We need to take the per bucket spinlock much sooner.
>
> And this is fine, all what matters is that we do no longer grab the listener spinlock.
>
>
On Fri, Jun 14, 2019 at 2:35 AM maowenan <[email protected]> wrote:
>
>
>
> On 2019/6/14 12:28, Eric Dumazet wrote:
> >
> >
> > On 6/13/19 9:19 PM, maowenan wrote:
> >>
> >>
> >> @Eric, for this issue I only want to check TCP_NEW_SYN_RECV sk, is it OK like below?
> >> + if (!osk && sk->sk_state == TCP_NEW_SYN_RECV)
> >> + reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
> >> + sk->sk_daddr, sk->sk_dport,
> >> + sk->sk_rcv_saddr, sk->sk_num,
> >> + sk->sk_bound_dev_if, sk->sk_bound_dev_if);
> >> + if (unlikely(reqsk)) {
> >>
> >
> > Not enough.
> >
> > If we have many cpus here, there is a chance another cpu has inserted a request socket, then
> > replaced it by an ESTABLISH socket for the same 4-tuple.
>
> I try to get more clear about the scene you mentioned. And I have do some testing about this, it can work well
> when I use multiple cpus.
>
> The ESTABLISH socket would be from tcp_check_req->tcp_v4_syn_recv_sock->tcp_create_openreq_child,
> and for this path, inet_ehash_nolisten pass osk(NOT NULL), my patch won't call __inet_lookup_established in inet_ehash_insert().
>
> When TCP_NEW_SYN_RECV socket try to inset to hash table, it will pass osk with NULL, my patch will check whether reqsk existed
> in hash table or not. If reqsk is existed, it just removes this reqsk and dose not insert to hash table. Then the synack for this
> reqsk can't be sent to client, and there is no chance to receive the ack from client, so ESTABLISH socket can't be replaced in hash table.
>
> So I don't see the race when there are many cpus. Can you show me some clue?
This is a bit silly.
You focus on some crash you got on a given system, but do not see the real bug.
CPU A
SYN packet
lookup finds nothing.
Create a NEW_SYN_RECV
<long delay, like hardware interrupts calling some buggy driver or something>
CPU B
SYN packet
-> inserts a NEW_SYN_RECV, sends a SYNACK
ACK packet
-> replaces the NEW_SYN_RECV with an ESTABLISHED socket
CPU A resumes.
Basically a lookup (after taking the bucket spinlock) could find:
- Nothing (typical case where there was no race)
- A NEW_SYN_RECV
- An ESTABLISHED socket
- A TIME_WAIT socket.
You can not simply fix the "NEW_SYN_RECV" state case, and possibly add
hard crashes (instead of current situation leading to RST packets)
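To make that concrete, here is a rough sketch (not a patch; every branch is only a
placeholder, and refcounting plus the actual insert/replace are omitted) of what any
dedup done under the bucket lock would have to classify:

	/* sk2: result of the 4-tuple lookup done after spin_lock(lock)
	 * on the ehash bucket
	 */
	if (sk2) {
		switch (sk2->sk_state) {
		case TCP_NEW_SYN_RECV:
			/* a reqsk another cpu already hashed: dropping ours
			 * (and not sending a SYNACK) may be acceptable
			 */
			break;
		case TCP_TIME_WAIT:
			/* an old timewait entry we may be expected to remove
			 * before inserting ourselves
			 */
			break;
		default:
			/* ESTABLISHED (or anything else): the handshake already
			 * completed on another cpu; inserting a second socket
			 * for the same 4-tuple would be far worse than today's
			 * RST
			 */
			break;
		}
	}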
On 2019/6/14 20:27, Eric Dumazet wrote:
> On Fri, Jun 14, 2019 at 2:35 AM maowenan <[email protected]> wrote:
>>
>>
>>
>> On 2019/6/14 12:28, Eric Dumazet wrote:
>>>
>>>
>>> On 6/13/19 9:19 PM, maowenan wrote:
>>>>
>>>>
>>>> @Eric, for this issue I only want to check TCP_NEW_SYN_RECV sk, is it OK like below?
>>>> + if (!osk && sk->sk_state == TCP_NEW_SYN_RECV)
>>>> + reqsk = __inet_lookup_established(sock_net(sk), &tcp_hashinfo,
>>>> + sk->sk_daddr, sk->sk_dport,
>>>> + sk->sk_rcv_saddr, sk->sk_num,
>>>> + sk->sk_bound_dev_if, sk->sk_bound_dev_if);
>>>> + if (unlikely(reqsk)) {
>>>>
>>>
>>> Not enough.
>>>
>>> If we have many cpus here, there is a chance another cpu has inserted a request socket, then
>>> replaced it by an ESTABLISH socket for the same 4-tuple.
>>
>> I try to get more clear about the scene you mentioned. And I have do some testing about this, it can work well
>> when I use multiple cpus.
>>
>> The ESTABLISH socket would be from tcp_check_req->tcp_v4_syn_recv_sock->tcp_create_openreq_child,
>> and for this path, inet_ehash_nolisten pass osk(NOT NULL), my patch won't call __inet_lookup_established in inet_ehash_insert().
>>
>> When TCP_NEW_SYN_RECV socket try to inset to hash table, it will pass osk with NULL, my patch will check whether reqsk existed
>> in hash table or not. If reqsk is existed, it just removes this reqsk and dose not insert to hash table. Then the synack for this
>> reqsk can't be sent to client, and there is no chance to receive the ack from client, so ESTABLISH socket can't be replaced in hash table.
>>
>> So I don't see the race when there are many cpus. Can you show me some clue?
>
> This is a bit silly.
> You focus on some crash you got on a given system, but do not see the real bug.
>
>
> CPU A
>
> SYN packet
> lookup finds nothing.
> Create a NEW_SYN_RECV
> <long delay, like hardware interrupts calling some buggy driver or something>
I agree that this is a special case.
One more point about the synack sequence numbers: the two synacks only carry
different sequences because more than 64ns elapsed between them; otherwise this
issue would disappear.
tcp_conn_request->tcp_v4_init_seq->secure_tcp_seq->seq_scale
static u32 seq_scale(u32 seq)
{
/*
* As close as possible to RFC 793, which
* suggests using a 250 kHz clock.
* Further reading shows this assumes 2 Mb/s networks.
* For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
* For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
* we also need to limit the resolution so that the u32 seq
* overlaps less than one time per MSL (2 minutes).
* Choosing a clock of 64 ns period is OK. (period of 274 s)
*/
return seq + (ktime_get_real_ns() >> 6);
}
So if the long delay is larger than 64ns, the sequence numbers are different.
>
> CPU B
> SYN packet
> -> inserts a NEW_SYN_RECV sends a SYNACK
> ACK packet
> -> replaces the NEW_SYN_RECV by ESTABLISH socket
>
> CPU A resumes.
> Basically a lookup (after taking the bucket spinlock) could either find :
> - Nothing (typical case where there was no race)
> - A NEW_SYN_RECV
> - A ESTABLISHED socket
> - A TIME_WAIT socket.
>
> You can not simply fix the "NEW_SYN_RECV" state case, and possibly add
> hard crashes (instead of current situation leading to RST packets)
>
> .
>
On Fri, Jun 14, 2019 at 7:04 AM maowenan <[email protected]> wrote:
> I agree that this is a special case.
> I propose one point about the sequence of synack, if two synack with two different
> sequence since the time elapse 64ns, this issue disappear.
>
> tcp_conn_request->tcp_v4_init_seq->secure_tcp_seq->seq_scale
> static u32 seq_scale(u32 seq)
> {
> /*
> * As close as possible to RFC 793, which
> * suggests using a 250 kHz clock.
> * Further reading shows this assumes 2 Mb/s networks.
> * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
> * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
> * we also need to limit the resolution so that the u32 seq
> * overlaps less than one time per MSL (2 minutes).
> * Choosing a clock of 64 ns period is OK. (period of 274 s)
> */
> return seq + (ktime_get_real_ns() >> 6);
> }
>
> So if the long delay larger than 64ns, the seq is difference.
The core issue has nothing to do with syncookies.
Are you sure you really understand this stack?
On 6/14/19 7:25 AM, Eric Dumazet wrote:
> On Fri, Jun 14, 2019 at 7:04 AM maowenan <[email protected]> wrote:
>> I agree that this is a special case.
>> I propose one point about the sequence of synack, if two synack with two different
>> sequence since the time elapse 64ns, this issue disappear.
>>
>> tcp_conn_request->tcp_v4_init_seq->secure_tcp_seq->seq_scale
>> static u32 seq_scale(u32 seq)
>> {
>> /*
>> * As close as possible to RFC 793, which
>> * suggests using a 250 kHz clock.
>> * Further reading shows this assumes 2 Mb/s networks.
>> * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
>> * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
>> * we also need to limit the resolution so that the u32 seq
>> * overlaps less than one time per MSL (2 minutes).
>> * Choosing a clock of 64 ns period is OK. (period of 274 s)
>> */
>> return seq + (ktime_get_real_ns() >> 6);
>> }
>>
>> So if the long delay larger than 64ns, the seq is difference.
>
> The core issue has nothing to do with syncookies.
>
> Are you sure you really understand this stack ?
>
Oh well, maybe I should not have answered before my breakfast/coffee.
What I meant to say is that we do not want to fix this problem by working around
the issue you noticed (which leads to RST packets).