From: Menglong Dong <[email protected]>
The 'conn_request()' callback in struct inet_connection_sock_af_ops is used
to process connection requests for TCP/DCCP. For TCP (IPv4), for example, it
is 'tcp_v4_conn_request()'.

When a non-zero value is returned by 'tcp_v4_conn_request()', the skb will be
freed by kfree_skb() and a 'reset' packet will be sent. Otherwise, it will be
freed normally.

In this code path, 'consume_skb()' ends up being used in many abnormal cases,
such as when the accept queue of the listening socket is full, which should
be 'kfree_skb()' instead.

Therefore, we make a small change to the 'conn_request()' interface. When 0
is returned, the caller calls 'consume_skb()' as before; when a negative
value is returned, the caller calls 'kfree_skb()' and sends a 'reset' as
before; when a positive value is returned (which no implementation did before
this series), the caller does nothing and the skb is freed inside
'conn_request()'. This way, drop reasons can be used in 'conn_request()'.
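In caller terms, the intended contract is roughly the sketch below (the
actual changes to tcp_rcv_state_process() and dccp_rcv_state_process() are in
the diff):

	err = icsk->icsk_af_ops->conn_request(sk, skb);
	if (err < 0) {
		/* abnormal case: the caller frees the skb with kfree_skb()
		 * and a 'reset' is sent, as before
		 */
		return 1;
	}
	if (!err) {
		/* normal case: the caller frees the skb, as before */
		consume_skb(skb);
	}
	/* err > 0: conn_request() has already freed the skb itself,
	 * with a proper drop reason
	 */
	return 0;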
The following new drop reasons are added:

  SKB_DROP_REASON_LISTENOVERFLOWS
  SKB_DROP_REASON_TCP_REQQFULLDROP
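Since the new reasons are also added to the skb:kfree_skb trace event
mapping, they should show up symbolically in the tracepoint output. One
possible way to check (assuming perf built with tracepoint support):

	perf record -e skb:kfree_skb -a -- sleep 5
	perf script | grep -E 'LISTENOVERFLOWS|TCP_REQQFULLDROP'
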
Reviewed-by: Jiang Biao <[email protected]>
Reviewed-by: Hao Peng <[email protected]>
Signed-off-by: Menglong Dong <[email protected]>
---
include/linux/skbuff.h | 4 ++++
include/trace/events/skb.h | 2 ++
net/dccp/input.c | 12 +++++-------
net/ipv4/tcp_input.c | 21 +++++++++++++--------
net/ipv4/tcp_ipv4.c | 3 ++-
5 files changed, 26 insertions(+), 16 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 84d78df60453..f33b3636bbce 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -469,6 +469,10 @@ enum skb_drop_reason {
SKB_DROP_REASON_PKT_TOO_BIG, /* packet size is too big (maybe exceed
* the MTU)
*/
+ SKB_DROP_REASON_LISTENOVERFLOWS, /* accept queue of the listen socket is full */
+ SKB_DROP_REASON_TCP_REQQFULLDROP, /* request queue of the listen
+ * socket is full
+ */
SKB_DROP_REASON_MAX,
};
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index a477bf907498..de6c93670437 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -80,6 +80,8 @@
EM(SKB_DROP_REASON_IP_INADDRERRORS, IP_INADDRERRORS) \
EM(SKB_DROP_REASON_IP_INNOROUTES, IP_INNOROUTES) \
EM(SKB_DROP_REASON_PKT_TOO_BIG, PKT_TOO_BIG) \
+ EM(SKB_DROP_REASON_LISTENOVERFLOWS, LISTENOVERFLOWS) \
+ EM(SKB_DROP_REASON_TCP_REQQFULLDROP, TCP_REQQFULLDROP) \
EMe(SKB_DROP_REASON_MAX, MAX)
#undef EM
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 2cbb757a894f..ed20dfe83f66 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -574,8 +574,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
const int old_state = sk->sk_state;
- bool acceptable;
- int queued = 0;
+ int err, queued = 0;
/*
* Step 3: Process LISTEN state
@@ -606,13 +605,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
rcu_read_lock();
local_bh_disable();
- acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
+ err = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb);
local_bh_enable();
rcu_read_unlock();
- if (!acceptable)
- return 1;
- consume_skb(skb);
- return 0;
+ if (!err)
+ consume_skb(skb);
+ return err < 0;
}
if (dh->dccph_type == DCCP_PKT_RESET)
goto discard;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index daff631b9486..e0bbbd624246 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6411,7 +6411,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
- int queued = 0;
+ int err, queued = 0;
bool acceptable;
SKB_DR(reason);
@@ -6438,14 +6438,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
*/
rcu_read_lock();
local_bh_disable();
- acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
+ err = icsk->icsk_af_ops->conn_request(sk, skb);
local_bh_enable();
rcu_read_unlock();
- if (!acceptable)
- return 1;
- consume_skb(skb);
- return 0;
+ if (!err)
+ consume_skb(skb);
+ return err < 0;
}
SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
@@ -6878,6 +6877,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
bool want_cookie = false;
struct dst_entry *dst;
struct flowi fl;
+ SKB_DR(reason);
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
@@ -6886,12 +6886,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
- if (!want_cookie)
+ if (!want_cookie) {
+ SKB_DR_SET(reason, TCP_REQQFULLDROP);
goto drop;
+ }
}
if (sk_acceptq_is_full(sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ SKB_DR_SET(reason, LISTENOVERFLOWS);
goto drop;
}
@@ -6947,6 +6950,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
*/
pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
rsk_ops->family);
+ SKB_DR_SET(reason, TCP_REQQFULLDROP);
goto drop_and_release;
}
@@ -7006,7 +7010,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
drop_and_free:
__reqsk_free(req);
drop:
+ kfree_skb_reason(skb, reason);
tcp_listendrop(sk);
- return 0;
+ return 1;
}
EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 157265aecbed..b8daf49f54a5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1470,7 +1470,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
drop:
tcp_listendrop(sk);
- return 0;
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INADDRERRORS);
+ return 1;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
--
2.36.0
On Tue, Apr 26, 2022 at 1:07 AM <[email protected]> wrote:
>
> From: Menglong Dong <[email protected]>
>
> The 'conn_request()' in struct inet_connection_sock_af_ops is used to
> process connection requesting for TCP/DCCP. Take TCP for example, it
> is just 'tcp_v4_conn_request()'.
>
> When non-zero value is returned by 'tcp_v4_conn_request()', the skb
> will be freed by kfree_skb() and a 'reset' packet will be send.
> Otherwise, it will be freed normally.
>
> In this code path, 'consume_skb()' is used in many abnormal cases, such
> as the accept queue of the listen socket full, which should be
> 'kfree_skb()'.
>
> Therefore, we make a little change to the 'conn_request()' interface.
> When 0 is returned, we call 'consume_skb()' as usual; when negative is
> returned, we call 'kfree_skb()' and send a 'reset' as usual; when
> positive is returned, which has not happened yet, we do nothing, and
> skb will be freed in 'conn_request()'. Then, we can use drop reasons
> in 'conn_request()'.
>
> Following new drop reasons are added:
>
> SKB_DROP_REASON_LISTENOVERFLOWS
> SKB_DROP_REASON_TCP_REQQFULLDROP
>
> Reviewed-by: Jiang Biao <[email protected]>
> Reviewed-by: Hao Peng <[email protected]>
> Signed-off-by: Menglong Dong <[email protected]>
> ---
> include/linux/skbuff.h | 4 ++++
> include/trace/events/skb.h | 2 ++
> net/dccp/input.c | 12 +++++-------
> net/ipv4/tcp_input.c | 21 +++++++++++++--------
> net/ipv4/tcp_ipv4.c | 3 ++-
> 5 files changed, 26 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 84d78df60453..f33b3636bbce 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -469,6 +469,10 @@ enum skb_drop_reason {
> SKB_DROP_REASON_PKT_TOO_BIG, /* packet size is too big (maybe exceed
> * the MTU)
> */
> + SKB_DROP_REASON_LISTENOVERFLOWS, /* accept queue of the listen socket is full */
> + SKB_DROP_REASON_TCP_REQQFULLDROP, /* request queue of the listen
> + * socket is full
> + */
> SKB_DROP_REASON_MAX,
> };
>
> diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
> index a477bf907498..de6c93670437 100644
> --- a/include/trace/events/skb.h
> +++ b/include/trace/events/skb.h
> @@ -80,6 +80,8 @@
> EM(SKB_DROP_REASON_IP_INADDRERRORS, IP_INADDRERRORS) \
> EM(SKB_DROP_REASON_IP_INNOROUTES, IP_INNOROUTES) \
> EM(SKB_DROP_REASON_PKT_TOO_BIG, PKT_TOO_BIG) \
> + EM(SKB_DROP_REASON_LISTENOVERFLOWS, LISTENOVERFLOWS) \
> + EM(SKB_DROP_REASON_TCP_REQQFULLDROP, TCP_REQQFULLDROP) \
> EMe(SKB_DROP_REASON_MAX, MAX)
>
> #undef EM
> diff --git a/net/dccp/input.c b/net/dccp/input.c
> index 2cbb757a894f..ed20dfe83f66 100644
> --- a/net/dccp/input.c
> +++ b/net/dccp/input.c
> @@ -574,8 +574,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
> struct dccp_sock *dp = dccp_sk(sk);
> struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
> const int old_state = sk->sk_state;
> - bool acceptable;
> - int queued = 0;
> + int err, queued = 0;
>
> /*
> * Step 3: Process LISTEN state
> @@ -606,13 +605,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
> */
> rcu_read_lock();
> local_bh_disable();
> - acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
> + err = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb);
> local_bh_enable();
> rcu_read_unlock();
> - if (!acceptable)
> - return 1;
> - consume_skb(skb);
> - return 0;
> + if (!err)
> + consume_skb(skb);
> + return err < 0;
> }
> if (dh->dccph_type == DCCP_PKT_RESET)
> goto discard;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index daff631b9486..e0bbbd624246 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -6411,7 +6411,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
> struct inet_connection_sock *icsk = inet_csk(sk);
> const struct tcphdr *th = tcp_hdr(skb);
> struct request_sock *req;
> - int queued = 0;
> + int err, queued = 0;
> bool acceptable;
> SKB_DR(reason);
>
> @@ -6438,14 +6438,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
> */
> rcu_read_lock();
> local_bh_disable();
> - acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
> + err = icsk->icsk_af_ops->conn_request(sk, skb);
> local_bh_enable();
> rcu_read_unlock();
>
> - if (!acceptable)
> - return 1;
> - consume_skb(skb);
> - return 0;
> + if (!err)
> + consume_skb(skb);
Please, do not add more mess like that, where skb is either freed by
the callee or the caller.
> + return err < 0;
Where err is set to a negative value ?
> }
> SKB_DR_SET(reason, TCP_FLAGS);
> goto discard;
> @@ -6878,6 +6877,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> bool want_cookie = false;
> struct dst_entry *dst;
> struct flowi fl;
> + SKB_DR(reason);
>
> /* TW buckets are converted to open requests without
> * limitations, they conserve resources and peer is
> @@ -6886,12 +6886,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
> inet_csk_reqsk_queue_is_full(sk)) && !isn) {
> want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
> - if (!want_cookie)
> + if (!want_cookie) {
> + SKB_DR_SET(reason, TCP_REQQFULLDROP);
> goto drop;
> + }
> }
>
> if (sk_acceptq_is_full(sk)) {
> NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
> + SKB_DR_SET(reason, LISTENOVERFLOWS);
> goto drop;
> }
>
> @@ -6947,6 +6950,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> */
> pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
> rsk_ops->family);
> + SKB_DR_SET(reason, TCP_REQQFULLDROP);
> goto drop_and_release;
> }
>
> @@ -7006,7 +7010,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> drop_and_free:
> __reqsk_free(req);
> drop:
> + kfree_skb_reason(skb, reason);
Ugh no, prefer "return reason" and leave to the caller the freeing part.
Your changes are too invasive and will hurt future backports.
> tcp_listendrop(sk);
> - return 0;
> + return 1;
> }
> EXPORT_SYMBOL(tcp_conn_request);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 157265aecbed..b8daf49f54a5 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1470,7 +1470,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
>
> drop:
> tcp_listendrop(sk);
> - return 0;
This return 0 meant : do not send reset.
> + kfree_skb_reason(skb, SKB_DROP_REASON_IP_INADDRERRORS);
double kfree_skb() ?
> + return 1;
-> send RESET
> }
> EXPORT_SYMBOL(tcp_v4_conn_request);
>
> --
> 2.36.0
>
I have a hard time understanding this patch.
Where is the related IPv6 change ?
I really wonder if you actually have tested this.
On Tue, Apr 26, 2022 at 9:32 PM Eric Dumazet <[email protected]> wrote:
>
> On Tue, Apr 26, 2022 at 1:07 AM <[email protected]> wrote:
> >
[......]
> > + if (!err)
> > + consume_skb(skb);
>
> Please, do not add more mess like that, where skb is either freed by
> the callee or the caller.
>
Yeah, this is a little chaotic..... I just can't find a way out yet :/
I'll keep thinking.
>
> > + return err < 0;
>
> Where err is set to a negative value ?
-1 is returned in dccp_v4_conn_request()
>
>
> > }
> > SKB_DR_SET(reason, TCP_FLAGS);
> > goto discard;
> > @@ -6878,6 +6877,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> > bool want_cookie = false;
> > struct dst_entry *dst;
> > struct flowi fl;
> > + SKB_DR(reason);
> >
> > /* TW buckets are converted to open requests without
> > * limitations, they conserve resources and peer is
> > @@ -6886,12 +6886,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> > if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
> > inet_csk_reqsk_queue_is_full(sk)) && !isn) {
> > want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
> > - if (!want_cookie)
> > + if (!want_cookie) {
> > + SKB_DR_SET(reason, TCP_REQQFULLDROP);
> > goto drop;
> > + }
> > }
> >
> > if (sk_acceptq_is_full(sk)) {
> > NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
> > + SKB_DR_SET(reason, LISTENOVERFLOWS);
> > goto drop;
> > }
> >
> > @@ -6947,6 +6950,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> > */
> > pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
> > rsk_ops->family);
> > + SKB_DR_SET(reason, TCP_REQQFULLDROP);
> > goto drop_and_release;
> > }
> >
> > @@ -7006,7 +7010,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> > drop_and_free:
> > __reqsk_free(req);
> > drop:
> > + kfree_skb_reason(skb, reason);
>
> Ugh no, prefer "return reason" and leave to the caller the freeing part.
>
> Your changes are too invasive and will hurt future backports.
>
Okay, I'll try a different approach.
>
> > tcp_listendrop(sk);
> > - return 0;
> > + return 1;
> > }
> > EXPORT_SYMBOL(tcp_conn_request);
> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > index 157265aecbed..b8daf49f54a5 100644
> > --- a/net/ipv4/tcp_ipv4.c
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -1470,7 +1470,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
> >
> > drop:
> > tcp_listendrop(sk);
> > - return 0;
>
> This return 0 meant : do not send reset.
>
>
> > + kfree_skb_reason(skb, SKB_DROP_REASON_IP_INADDRERRORS);
>
> double kfree_skb() ?
>
> > + return 1;
>
> -> send RESET
>
No, this return 1 means: do not send a RESET, and the skb has already been
freed inside icsk_af_ops->conn_request(). That works because I changed the
callers of conn_request() in tcp_rcv_state_process() and
dccp_rcv_state_process():

	err = icsk->icsk_af_ops->conn_request(sk, skb);
	local_bh_enable();
	rcu_read_unlock();

	if (!err)
		consume_skb(skb);
	return err < 0;

If err == 1, the skb will not be freed again, because 0 is returned by
tcp_rcv_state_process().
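To spell out the ownership rules I intended, as a summary:

	/*
	 * conn_request() return value -> skb ownership and RST behaviour:
	 *   < 0 : caller frees the skb with kfree_skb() and sends a RST
	 *   = 0 : caller frees the skb with consume_skb(), no RST
	 *   > 0 : conn_request() has already freed the skb itself, no RST
	 */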
> > }
> > EXPORT_SYMBOL(tcp_v4_conn_request);
> >
> > --
> > 2.36.0
> >
>
> I have a hard time understanding this patch.
>
> Where is the related IPv6 change ?
>
> I really wonder if you actually have tested this.
Yeah, I missed IPv6....but it still works; the changes are compatible with
the current IPv6 code.

In fact, I have tested it, and everything is OK, no double free happens:
drop at: tcp_conn_request+0xf1/0xcb0 (0xffffffff81d43271)
origin: software
input port ifindex: 1
timestamp: Thu Apr 28 10:19:42 2022 917631574 nsec
protocol: 0x800
length: 74
original length: 74
drop reason: LISTENOVERFLOWS

drop at: tcp_conn_request+0xf1/0xcb0 (0xffffffff81d43271)
origin: software
input port ifindex: 1
timestamp: Thu Apr 28 10:19:43 2022 930983132 nsec
protocol: 0x800
length: 74
original length: 74
drop reason: LISTENOVERFLOWS