We never know why we are deleting a tcp packet when we delete it,
and the tcp_drop_new() function can effectively solve this problem.
The tcp_drop_new() will learn from the specified status code why the
packet was deleted, and the caller from whom the packet was deleted.
The kernel should be a little more open to data that is about to be
destroyed and useless, and users should be able to keep track of it.
Signed-off-by: jony-one <[email protected]>
---
include/trace/events/tcp.h | 51 ++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_input.c | 29 ++++++++++++++--------
2 files changed, 69 insertions(+), 11 deletions(-)
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 521059d8d..5a0478440 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -371,6 +371,57 @@ DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
TP_ARGS(skb)
);
+/*
+ * tcp event whit argument sk, skb, reason
+ */
+TRACE_EVENT(tcp_drop_new,
+
+ TP_PROTO(struct sock *sk, struct sk_buff *skb, const char *reason),
+
+ TP_ARGS(sk, skb, reason),
+
+ TP_STRUCT__entry(
+ __field(const void *, skbaddr)
+ __field(const void *, skaddr)
+ __string(reason, reason)
+ __field(int, state)
+ __field(__u16, sport)
+ __field(__u16, dport)
+ __array(__u8, saddr, 4)
+ __array(__u8, daddr, 4)
+ __array(__u8, saddr_v6, 16)
+ __array(__u8, daddr_v6, 16)
+ ),
+
+ TP_fast_assign(
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 *p32;
+
+ __assign_str(reason, reason);
+
+ __entry->skbaddr = skb;
+ __entry->skaddr = sk;
+ __entry->state = sk->sk_state;
+
+ __entry->sport = ntohs(inet->inet_sport);
+ __entry->dport = ntohs(inet->inet_dport);
+
+ p32 = (__be32 *) __entry->saddr;
+ *p32 = inet->inet_saddr;
+
+ p32 = (__be32 *) __entry->daddr;
+ *p32 = inet->inet_daddr;
+
+ TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+ sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+ ),
+
+ TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s reason=%s",
+ __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
+ __entry->saddr_v6, __entry->daddr_v6,
+ show_tcp_state_name(__entry->state), __get_str(reason))
+);
+
#endif /* _TRACE_TCP_H */
/* This part must be outside protection */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 149ceb5c9..988989e25 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4649,6 +4649,13 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
__kfree_skb(skb);
}
+static void tcp_drop_new(struct sock *sk, struct sk_buff *skb, const char *reason)
+{
+ trace_tcp_drop_new(sk, skb, reason);
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
@@ -4676,7 +4683,7 @@ static void tcp_ofo_queue(struct sock *sk)
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
- tcp_drop(sk, skb);
+ tcp_drop_new(sk, skb, __func__);
continue;
}
@@ -4732,7 +4739,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
sk->sk_data_ready(sk);
- tcp_drop(sk, skb);
+ tcp_drop_new(sk, skb, __func__);
return;
}
@@ -4795,7 +4802,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
/* All the bits are present. Drop. */
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb);
+ tcp_drop_new(sk, skb, __func__);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4814,7 +4821,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb1);
+ tcp_drop_new(sk, skb1, __func__);
goto merge_right;
}
} else if (tcp_ooo_try_coalesce(sk, skb1,
@@ -4842,7 +4849,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb1);
+ tcp_drop_new(sk, skb1, __func__);
}
/* If there is no skb after us, we are the last_skb ! */
if (!skb1)
@@ -5019,7 +5026,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
- tcp_drop(sk, skb);
+ tcp_drop_new(sk, skb, __func__);
return;
}
@@ -5276,7 +5283,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
goal -= rb_to_skb(node)->truesize;
- tcp_drop(sk, rb_to_skb(node));
+ tcp_drop_new(sk, rb_to_skb(node), __func__);
if (!prev || goal <= 0) {
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
@@ -5701,7 +5708,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
return true;
discard:
- tcp_drop(sk, skb);
+ tcp_drop_new(sk, skb, __func__);
return false;
}
@@ -5905,7 +5912,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
- tcp_drop(sk, skb);
+ tcp_drop_new(sk, skb, __func__);
}
EXPORT_SYMBOL(tcp_rcv_established);
@@ -6196,7 +6203,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
- tcp_drop(sk, skb);
+ tcp_drop_new(sk, skb, __func__);
return 0;
} else {
tcp_send_ack(sk);
@@ -6568,7 +6575,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!queued) {
discard:
- tcp_drop(sk, skb);
+ tcp_drop_new(sk, skb, __func__);
}
return 0;
}
--
2.25.1
On 8/21/21 8:53 AM, jony-one wrote:
> We never know why we are deleting a tcp packet when we delete it,
> and the tcp_drop_new() function can effectively solve this problem.
> The tcp_drop_new() will learn from the specified status code why the
> packet was deleted, and the caller from whom the packet was deleted.
> The kernel should be a little more open to data that is about to be
> destroyed and useless, and users should be able to keep track of it.
For the subject, prefix "net/mlx4" is not accurate. This patch has
nothing to do with mlx4. Just use "net". The subject is not accurate
too, maybe "introduce tracepoint tcp_drop". Tracepoint name "tcp_drop"
seems more accurate compared to "tcp_drop_new".
It is worthwhile to mention the context/why we want to this
tracepoint with bcc issue https://github.com/iovisor/bcc/issues/3533.
Mainly two reasons: (1). tcp_drop is a tiny function which
may easily get inlined, a tracepoint is more stable, and (2).
tcp_drop does not provide enough information on why it is dropped.
>
> Signed-off-by: jony-one <[email protected]>
Please use your proper name. "jony-one" is not a good one.
> ---
> include/trace/events/tcp.h | 51 ++++++++++++++++++++++++++++++++++++++
> net/ipv4/tcp_input.c | 29 ++++++++++++++--------
> 2 files changed, 69 insertions(+), 11 deletions(-)
>
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 521059d8d..5a0478440 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -371,6 +371,57 @@ DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
> TP_ARGS(skb)
> );
>
> +/*
> + * tcp event whit argument sk, skb, reason
> + */
> +TRACE_EVENT(tcp_drop_new,
tcp_drop instead of tcp_deop_new?
> +
> + TP_PROTO(struct sock *sk, struct sk_buff *skb, const char *reason),
> +
> + TP_ARGS(sk, skb, reason),
> +
> + TP_STRUCT__entry(
> + __field(const void *, skbaddr)
> + __field(const void *, skaddr)
> + __string(reason, reason)
I understand we may want to print out the reason with more
elaborate reason, but for kernel internal implementation
an enum should be enough. See tracepoint sched/sched_switch.
> + __field(int, state)
> + __field(__u16, sport)
> + __field(__u16, dport)
> + __array(__u8, saddr, 4)
> + __array(__u8, daddr, 4)
> + __array(__u8, saddr_v6, 16)
> + __array(__u8, daddr_v6, 16)
> + ),
> +
> + TP_fast_assign(
> + struct inet_sock *inet = inet_sk(sk);
> + __be32 *p32;
> +
> + __assign_str(reason, reason);
> +
> + __entry->skbaddr = skb;
> + __entry->skaddr = sk;
> + __entry->state = sk->sk_state;
> +
> + __entry->sport = ntohs(inet->inet_sport);
> + __entry->dport = ntohs(inet->inet_dport);
> +
> + p32 = (__be32 *) __entry->saddr;
> + *p32 = inet->inet_saddr;
> +
> + p32 = (__be32 *) __entry->daddr;
> + *p32 = inet->inet_daddr;
> +
> + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> + ),
> +
> + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s reason=%s",
> + __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> + __entry->saddr_v6, __entry->daddr_v6,
> + show_tcp_state_name(__entry->state), __get_str(reason))
> +);
> +
> #endif /* _TRACE_TCP_H */
>
> /* This part must be outside protection */
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 149ceb5c9..988989e25 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4649,6 +4649,13 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
> __kfree_skb(skb);
> }
>
> +static void tcp_drop_new(struct sock *sk, struct sk_buff *skb, const char *reason)
You can rename funciton name to __tcp_drop(). tcp_drop_new() is not a
good name.
> +{
> + trace_tcp_drop_new(sk, skb, reason);
> + sk_drops_add(sk, skb);
> + __kfree_skb(skb);
> +}
> +
> /* This one checks to see if we can put data from the
> * out_of_order queue into the receive_queue.
> */
> @@ -4676,7 +4683,7 @@ static void tcp_ofo_queue(struct sock *sk)
> rb_erase(&skb->rbnode, &tp->out_of_order_queue);
>
> if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
use enumeration?
> continue;
> }
>
> @@ -4732,7 +4739,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
> if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
> NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
> sk->sk_data_ready(sk);
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
> return;
> }
>
> @@ -4795,7 +4802,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
> /* All the bits are present. Drop. */
> NET_INC_STATS(sock_net(sk),
> LINUX_MIB_TCPOFOMERGE);
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
> skb = NULL;
> tcp_dsack_set(sk, seq, end_seq);
> goto add_sack;
> @@ -4814,7 +4821,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
> TCP_SKB_CB(skb1)->end_seq);
> NET_INC_STATS(sock_net(sk),
> LINUX_MIB_TCPOFOMERGE);
> - tcp_drop(sk, skb1);
> + tcp_drop_new(sk, skb1, __func__);
This one and the above one are in the same function so they will have
the same reason. It would be good if we can differentiate them.
> goto merge_right;
> }
> } else if (tcp_ooo_try_coalesce(sk, skb1,
> @@ -4842,7 +4849,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
> tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
> TCP_SKB_CB(skb1)->end_seq);
> NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
> - tcp_drop(sk, skb1);
> + tcp_drop_new(sk, skb1, __func__);
> }
> /* If there is no skb after us, we are the last_skb ! */
> if (!skb1)
> @@ -5019,7 +5026,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
> tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
> inet_csk_schedule_ack(sk);
> drop:
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
> return;
> }
>
> @@ -5276,7 +5283,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
> prev = rb_prev(node);
> rb_erase(node, &tp->out_of_order_queue);
> goal -= rb_to_skb(node)->truesize;
> - tcp_drop(sk, rb_to_skb(node));
> + tcp_drop_new(sk, rb_to_skb(node), __func__);
> if (!prev || goal <= 0) {
> sk_mem_reclaim(sk);
> if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
> @@ -5701,7 +5708,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
> return true;
>
> discard:
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
> return false;
> }
>
> @@ -5905,7 +5912,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
> TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
>
> discard:
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
> }
> EXPORT_SYMBOL(tcp_rcv_established);
>
> @@ -6196,7 +6203,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
> TCP_DELACK_MAX, TCP_RTO_MAX);
>
> discard:
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
> return 0;
> } else {
> tcp_send_ack(sk);
> @@ -6568,7 +6575,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
>
> if (!queued) {
> discard:
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
> }
> return 0;
> }
>
On Sat, Aug 21, 2021 at 8:55 AM jony-one <[email protected]> wrote:
> @@ -4676,7 +4683,7 @@ static void tcp_ofo_queue(struct sock *sk)
> rb_erase(&skb->rbnode, &tp->out_of_order_queue);
>
> if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
> - tcp_drop(sk, skb);
> + tcp_drop_new(sk, skb, __func__);
This is very similar to race_kfree_skb():
void kfree_skb(struct sk_buff *skb)
{
if (!skb_unref(skb))
return;
trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
So... why not something like this?
iff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3f7bd7ae7d7a..cc840e4552c9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4678,6 +4678,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
static void tcp_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
+ trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
Thanks.