I used the suggestion from `Brendan Gregg`. In addition to the
`reason` parameter there is also the `field` parameter pointing
to `SNMP` to distinguish the `tcp_drop` cause. I know what I
submitted is not accurate, so I am submitting the current
patch to get comments and criticism from everyone so that I
can submit better code and solutions.And of course to make me
more familiar and understand the `linux` kernel network code.
Thank you everyone!
Signed-off-by: Zhongya Yan <[email protected]>
---
include/trace/events/tcp.h | 62 ++++++++++++++++++
net/ipv4/tcp_input.c | 126 +++++++++++++++++++++++--------------
2 files changed, 142 insertions(+), 46 deletions(-)
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 521059d8dc0a..68bbe8741ce8 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -371,6 +371,68 @@ DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
TP_ARGS(skb)
);
+TRACE_EVENT(tcp_drop,
+ TP_PROTO(struct sock *sk, struct sk_buff *skb, int field, const char *reason),
+
+ TP_ARGS(sk, skb, field, reason),
+
+ TP_STRUCT__entry(
+ __array(__u8, saddr, sizeof(struct sockaddr_in6))
+ __array(__u8, daddr, sizeof(struct sockaddr_in6))
+ __field(__u16, sport)
+ __field(__u16, dport)
+ __field(__u32, mark)
+ __field(__u16, data_len)
+ __field(__u32, snd_nxt)
+ __field(__u32, snd_una)
+ __field(__u32, snd_cwnd)
+ __field(__u32, ssthresh)
+ __field(__u32, snd_wnd)
+ __field(__u32, srtt)
+ __field(__u32, rcv_wnd)
+ __field(__u64, sock_cookie)
+ __field(int, field)
+ __string(reason, reason)
+ ),
+
+ TP_fast_assign(
+ const struct tcphdr *th = (const struct tcphdr *)skb->data;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+ memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+ TP_STORE_ADDR_PORTS(__entry, inet, sk);
+
+ __entry->sport = ntohs(inet->inet_sport);
+ __entry->dport = ntohs(inet->inet_dport);
+ __entry->mark = skb->mark;
+
+ __entry->data_len = skb->len - __tcp_hdrlen(th);
+ __entry->snd_nxt = tp->snd_nxt;
+ __entry->snd_una = tp->snd_una;
+ __entry->snd_cwnd = tp->snd_cwnd;
+ __entry->snd_wnd = tp->snd_wnd;
+ __entry->rcv_wnd = tp->rcv_wnd;
+ __entry->ssthresh = tcp_current_ssthresh(sk);
+ __entry->srtt = tp->srtt_us >> 3;
+ __entry->sock_cookie = sock_gen_cookie(sk);
+ __entry->field = field;
+
+ __assign_str(reason, reason);
+ ),
+
+ TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x \
+ snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u \
+ sock_cookie=%llx field=%d reason=%s",
+ __entry->saddr, __entry->daddr, __entry->mark,
+ __entry->data_len, __entry->snd_nxt, __entry->snd_una,
+ __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
+ __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie,
+ __entry->field, __get_str(reason))
+);
+
#endif /* _TRACE_TCP_H */
/* This part must be outside protection */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3f7bd7ae7d7a..f16eeea9d00b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4675,8 +4675,10 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
return res;
}
-static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+static void tcp_drop(struct sock *sk, struct sk_buff *skb,
+ int field, const char *reason)
{
+ trace_tcp_drop(sk, skb, field, reason);
sk_drops_add(sk, skb);
__kfree_skb(skb);
}
@@ -4708,7 +4710,7 @@ static void tcp_ofo_queue(struct sock *sk)
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
- tcp_drop(sk, skb);
+ tcp_drop(sk, skb, LINUX_MIB_TCPOFOQUEUE, "Tcp queue error");
continue;
}
@@ -4764,7 +4766,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
sk->sk_data_ready(sk);
- tcp_drop(sk, skb);
+ tcp_drop(sk, skb, LINUX_MIB_TCPOFODROP, "Tcp rmem failed");
return;
}
@@ -4827,7 +4829,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
/* All the bits are present. Drop. */
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb);
+ tcp_drop(sk, skb, LINUX_MIB_TCPOFOMERGE, "Tcp bits are present");
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4846,7 +4848,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb1);
+ tcp_drop(sk, skb1, LINUX_MIB_TCPOFOMERGE,
+ "Tcp replace(skb.seq eq skb1.seq)");
+
goto merge_right;
}
} else if (tcp_ooo_try_coalesce(sk, skb1,
@@ -4874,7 +4878,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb1);
+ tcp_drop(sk, skb1, LINUX_MIB_TCPOFOMERGE, "Tcp useless other segments");
}
/* If there is no skb after us, we are the last_skb ! */
if (!skb1)
@@ -5010,7 +5014,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
sk->sk_data_ready(sk);
- goto drop;
+ tcp_drop(sk, skb, LINUX_MIB_TCPRCVQDROP, "Tcp rmem failed");
+ goto end;
}
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
@@ -5050,8 +5055,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
out_of_window:
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
-drop:
- tcp_drop(sk, skb);
+ tcp_drop(sk, skb, LINUX_MIB_TCPZEROWINDOWDROP, "Tcp out of order or zero window");
+end:
return;
}
@@ -5308,7 +5313,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
goal -= rb_to_skb(node)->truesize;
- tcp_drop(sk, rb_to_skb(node));
+ tcp_drop(sk, rb_to_skb(node), LINUX_MIB_OFOPRUNED, "Tcp drop out-of-order queue");
if (!prev || goal <= 0) {
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
@@ -5643,7 +5648,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_PAWSESTABREJECTED, "Tcp PAWS seq first");
+ goto end;
}
/* Reset is accepted even if it did not pass PAWS. */
}
@@ -5666,7 +5672,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
} else if (tcp_reset_check(sk, skb)) {
tcp_reset(sk, skb);
}
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_PAWSESTABREJECTED, "Tcp check sequence number");
+ goto end;
}
/* Step 2: check RST bit */
@@ -5711,7 +5718,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
tcp_fastopen_active_disable(sk);
tcp_send_challenge_ack(sk, skb);
}
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_TCPCHALLENGEACK, "Tcp check RST bit ");
+ goto end;
}
/* step 3: check security and precedence [ignored] */
@@ -5725,15 +5733,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
tcp_send_challenge_ack(sk, skb);
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_TCPSYNCHALLENGE, "Tcp check for a SYN");
+ goto end;
}
bpf_skops_parse_hdr(sk, skb);
return true;
-discard:
- tcp_drop(sk, skb);
+end:
return false;
}
@@ -5851,7 +5859,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
return;
} else { /* Header too small */
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
- goto discard;
+ tcp_drop(sk, skb, TCP_MIB_INERRS, "Tcp header too small");
+ goto end;
}
} else {
int eaten = 0;
@@ -5905,8 +5914,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
- if (!th->ack && !th->rst && !th->syn)
- goto discard;
+ if (!th->ack && !th->rst && !th->syn) {
+ tcp_drop(sk, skb, LINUX_MIB_TCPSLOWSTARTRETRANS, "Tcp state not in ack|rst|syn");
+ goto end;
+ }
/*
* Standard slow path.
@@ -5916,8 +5927,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
return;
step5:
- if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
- goto discard;
+ if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) {
+ tcp_drop(sk, skb, LINUX_MIB_TCPSACKDISCARD, "Tcp ack have not sent yet");
+ goto end;
+ }
tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5935,9 +5948,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ tcp_drop(sk, skb, TCP_MIB_CSUMERRORS, "Tcp csum error");
-discard:
- tcp_drop(sk, skb);
+end:
+ return;
}
EXPORT_SYMBOL(tcp_rcv_established);
@@ -6137,7 +6151,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (th->rst) {
tcp_reset(sk, skb);
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_NUM, "Tcp reset");
+ goto end;
}
/* rfc793:
@@ -6226,9 +6241,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
+ tcp_drop(sk, skb, LINUX_MIB_TCPFASTOPENACTIVE, "Tcp fast open ack error");
-discard:
- tcp_drop(sk, skb);
+end:
return 0;
} else {
tcp_send_ack(sk);
@@ -6301,7 +6316,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
return -1;
#else
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_SYNCOOKIESRECV, "Tcp syn received error");
+ goto end;
#endif
}
/* "fifth, if neither of the SYN or RST bits is set then
@@ -6311,7 +6327,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
discard_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_TCPSACKDISCARD, "Tcp not neither of SYN or RST");
+ goto end;
reset_and_undo:
tcp_clear_options(&tp->rx_opt);
@@ -6369,18 +6386,23 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
switch (sk->sk_state) {
case TCP_CLOSE:
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_TCPABORTONCLOSE, "Tcp close");
+ goto end;
case TCP_LISTEN:
if (th->ack)
return 1;
- if (th->rst)
- goto discard;
+ if (th->rst) {
+ tcp_drop(sk, skb, LINUX_MIB_LISTENDROPS, "Tcp rst");
+ goto end;
+ }
if (th->syn) {
- if (th->fin)
- goto discard;
+ if (th->fin) {
+ tcp_drop(sk, skb, LINUX_MIB_LISTENDROPS, "Tcp fin");
+ goto end;
+ }
/* It is possible that we process SYN packets from backlog,
* so we need to make sure to disable BH and RCU right there.
*/
@@ -6395,7 +6417,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
consume_skb(skb);
return 0;
}
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_LISTENDROPS, "Tcp syn");
+ goto end;
case TCP_SYN_SENT:
tp->rx_opt.saw_tstamp = 0;
@@ -6421,12 +6444,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- if (!tcp_check_req(sk, skb, req, true, &req_stolen))
- goto discard;
+ if (!tcp_check_req(sk, skb, req, true, &req_stolen)) {
+ tcp_drop(sk, skb, LINUX_MIB_LISTENDROPS, "Tcp check req error");
+ goto end;
+ }
}
- if (!th->ack && !th->rst && !th->syn)
- goto discard;
+ if (!th->ack && !th->rst && !th->syn) {
+ tcp_drop(sk, skb, LINUX_MIB_LISTENDROPS, "Tcp not ack|rst|syn");
+ goto end;
+ }
if (!tcp_validate_incoming(sk, skb, th, 0))
return 0;
@@ -6440,7 +6467,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_SYN_RECV)
return 1; /* send one RST */
tcp_send_challenge_ack(sk, skb);
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_TCPCHALLENGEACK, "Tcp check ack failed");
+ goto end;
}
switch (sk->sk_state) {
case TCP_SYN_RECV:
@@ -6533,7 +6561,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_TCPABORTONDATA, "Tcp fin wait2");
+ goto end;
}
break;
}
@@ -6541,7 +6570,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_TIMEWAITED, "Tcp time wait");
+ goto end;
}
break;
@@ -6549,7 +6579,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
- goto discard;
+ tcp_drop(sk, skb, LINUX_MIB_TCPPUREACKS, "Tcp last ack");
+ goto end;
}
break;
}
@@ -6566,8 +6597,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
/* If a subflow has been reset, the packet should not
* continue to be processed, drop the packet.
*/
- if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
- goto discard;
+ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+ tcp_drop(sk, skb, LINUX_MIB_TCPPUREACKS, "Tcp subflow been reset");
+ goto end;
+ }
break;
}
fallthrough;
@@ -6599,9 +6632,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}
if (!queued) {
-discard:
- tcp_drop(sk, skb);
+ tcp_drop(sk, skb, LINUX_MIB_TCPOFOQUEUE, "Tcp rcv synsent state process");
}
+
+end:
return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
--
2.25.1
On Fri, Sep 3, 2021 at 11:43 PM Zhongya Yan <[email protected]> wrote:
>
> I used the suggestion from `Brendan Gregg`. In addition to the
> `reason` parameter there is also the `field` parameter pointing
> to `SNMP` to distinguish the `tcp_drop` cause. I know what I
> submitted is not accurate, so I am submitting the current
> patch to get comments and criticism from everyone so that I
> can submit better code and solutions.And of course to make me
> more familiar and understand the `linux` kernel network code.
Any reason why only limit this to TCP? I am pretty sure we are
interested in packet drops across the entire stack.
You can take a look at net/core/drop_monitor.c, it actually has
a big advantage over your solution, which is sending entire
dropped packets to user-space for inspection.
If you can have a solution for all packet drops, not just TCP,
it would make your patch much more useful. Therefore, I'd
suggest you to consider extending the drop monitor.
Thanks.
On Tue, Sep 7, 2021 at 8:12 AM Steven Rostedt <[email protected]> wrote:
> Just curious. Is "Tcp" the normal way to write "TCP" in the kernel? I see
> it in snmp_seq_show_tcp_udp() in net/ipv4/proc.c, but no where else
> (besides doing CamelCase words). Should these be written using "TCP"
> instead of "Tcp". It just looks awkward to me.
>
Yes, we prefer tcp or TCP, although SNMP counters have Tcp in them,
for some reason
( grep Tcp /proc/net/netstat /proc/net/snmp ), that can not be changed.
On Fri, 3 Sep 2021 23:40:44 -0700
Zhongya Yan <[email protected]> wrote:
> @@ -4708,7 +4710,7 @@ static void tcp_ofo_queue(struct sock *sk)
> rb_erase(&skb->rbnode, &tp->out_of_order_queue);
>
> if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
> - tcp_drop(sk, skb);
> + tcp_drop(sk, skb, LINUX_MIB_TCPOFOQUEUE, "Tcp queue error");
> continue;
> }
>
> @@ -4764,7 +4766,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
> if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
> NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
> sk->sk_data_ready(sk);
> - tcp_drop(sk, skb);
> + tcp_drop(sk, skb, LINUX_MIB_TCPOFODROP, "Tcp rmem failed");
> return;
> }
>
> @@ -4827,7 +4829,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
> /* All the bits are present. Drop. */
> NET_INC_STATS(sock_net(sk),
> LINUX_MIB_TCPOFOMERGE);
> - tcp_drop(sk, skb);
> + tcp_drop(sk, skb, LINUX_MIB_TCPOFOMERGE, "Tcp bits are present");
Just curious. Is "Tcp" the normal way to write "TCP" in the kernel? I see
it in snmp_seq_show_tcp_udp() in net/ipv4/proc.c, but no where else
(besides doing CamelCase words). Should these be written using "TCP"
instead of "Tcp". It just looks awkward to me.
But hey, I'm not one of the networking folks, so what do I know?
-- Steve
On Sat, 11 Sep 2021 03:50:12 -0700
Steven Rostedt <[email protected]> wrote:
> Just curious. Is "Tcp" the normal way to write "TCP" in the kernel? I see
> it in snmp_seq_show_tcp_udp() in net/ipv4/proc.c, but no where else
> (besides doing CamelCase words). Should these be written using "TCP"
> instead of "Tcp". It just looks awkward to me.
>
> But hey, I'm not one of the networking folks, so what do I know?
>
> -- Steve
I will do it. I guess any other suggestions? I feel like my description may not be accurate.
Thanks.
--Zhongya Yan
On Sat, Sep 4, 2021 at 7:51 PM Zhongya Yan <[email protected]> wrote:
> Thanks for the tip!
> See reason:https://github.com/iovisor/bcc/issues/3533
This link only explains why TCP needs it, my question is why not
do it for other protocols too, therefore neither this link nor you
answers it.
BTW, net/core/drop_monitor.c is based on kfree_skb() tracepoint
too, in case you are only interested in tracepoints not other things.
Thanks.