2021-08-25 16:04:50

by Benjamin Yim

[permalink] [raw]
Subject: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

In this version, fix and modify some code issues. Changed the reason for `tcp_drop` from an enumeration to a mask and enumeration usage in the trace output.
By shifting `__LINE__` left by 6 bits to accommodate the `tcp_drop` call method source, 6 bits are enough to use. This allows for a more granular identification of the reason for calling `tcp_drop` without conflicts and essentially without overflow.
Example.
```
enum {
TCP_OFO_QUEUE = 1
}
reason = __LINE__ << 6
reason |= TCP_OFO_QUEUE
```
Suggestions from Jakub Kicinski, Eric Dumazet, much appreciated.

Modified the expression of the enumeration, and the use of the output under the trace definition.
Suggestion from Steven Rostedt. Thanks.

Signed-off-by: Zhongya Yan <[email protected]>
---
include/net/tcp.h | 18 +++---
include/trace/events/tcp.h | 39 +++++++++++--
net/ipv4/tcp_input.c | 114 ++++++++++++++++++++++---------------
3 files changed, 112 insertions(+), 59 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dd8cd8d6f2f1..75614a252675 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -256,15 +256,19 @@ extern unsigned long tcp_memory_pressure;

enum tcp_drop_reason {
TCP_OFO_QUEUE = 1,
- TCP_DATA_QUEUE_OFO = 2,
- TCP_DATA_QUEUE = 3,
- TCP_PRUNE_OFO_QUEUE = 4,
- TCP_VALIDATE_INCOMING = 5,
- TCP_RCV_ESTABLISHED = 6,
- TCP_RCV_SYNSENT_STATE_PROCESS = 7,
- TCP_RCV_STATE_PROCESS = 8
+ TCP_DATA_QUEUE_OFO,
+ TCP_DATA_QUEUE,
+ TCP_PRUNE_OFO_QUEUE,
+ TCP_VALIDATE_INCOMING,
+ TCP_RCV_ESTABLISHED,
+ TCP_RCV_SYNSENT_STATE_PROCESS,
+ TCP_RCV_STATE_PROCESS
};

+#define TCP_DROP_MASK(line, code) ((line << 6) | code) /* reason for masking */
+#define TCP_DROP_LINE(mask) (mask >> 6) /* Cause decode to get unique identifier */
+#define TCP_DROP_CODE(mask) (mask&0x3F) /* Cause decode get function code */
+
/* optimized version of sk_under_memory_pressure() for TCP sockets */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index a0d3d31eb591..699539702ea9 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -371,8 +371,26 @@ DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
TP_ARGS(skb)
);

+// from author @{Steven Rostedt}
+#define TCP_DROP_REASON \
+ REASON_STRING(TCP_OFO_QUEUE, ofo_queue) \
+ REASON_STRING(TCP_DATA_QUEUE_OFO, data_queue_ofo) \
+ REASON_STRING(TCP_DATA_QUEUE, data_queue) \
+ REASON_STRING(TCP_PRUNE_OFO_QUEUE, prune_ofo_queue) \
+ REASON_STRING(TCP_VALIDATE_INCOMING, validate_incoming) \
+ REASON_STRING(TCP_RCV_ESTABLISHED, rcv_established) \
+ REASON_STRING(TCP_RCV_SYNSENT_STATE_PROCESS, rcv_synsent_state_process) \
+ REASON_STRINGe(TCP_RCV_STATE_PROCESS, rcv_state_proces)
+
+#undef REASON_STRING
+#undef REASON_STRINGe
+
+#define REASON_STRING(code, name) {code , #name},
+#define REASON_STRINGe(code, name) {code, #name}
+
+
TRACE_EVENT(tcp_drop,
- TP_PROTO(struct sock *sk, struct sk_buff *skb, enum tcp_drop_reason reason),
+ TP_PROTO(struct sock *sk, struct sk_buff *skb, __u32 reason),

TP_ARGS(sk, skb, reason),

@@ -392,6 +410,8 @@ TRACE_EVENT(tcp_drop,
__field(__u32, rcv_wnd)
__field(__u64, sock_cookie)
__field(__u32, reason)
+ __field(__u32, reason_code)
+ __field(__u32, reason_line)
),

TP_fast_assign(
@@ -415,16 +435,23 @@ TRACE_EVENT(tcp_drop,
__entry->snd_wnd = tp->snd_wnd;
__entry->rcv_wnd = tp->rcv_wnd;
__entry->ssthresh = tcp_current_ssthresh(sk);
- __entry->srtt = tp->srtt_us >> 3;
- __entry->sock_cookie = sock_gen_cookie(sk);
- __entry->reason = reason;
+ __entry->srtt = tp->srtt_us >> 3;
+ __entry->sock_cookie = sock_gen_cookie(sk);
+ __entry->reason = reason;
+ __entry->reason_code = TCP_DROP_CODE(reason);
+ __entry->reason_line = TCP_DROP_LINE(reason);
),

- TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx reason=%d",
+ TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x \
+ snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u \
+ sock_cookie=%llx reason=%d reason_type=%s reason_line=%d",
__entry->saddr, __entry->daddr, __entry->mark,
__entry->data_len, __entry->snd_nxt, __entry->snd_una,
__entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
- __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie, __entry->reason)
+ __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie,
+ __entry->reason,
+ __print_symbolic(__entry->reason_code, TCP_DROP_REASON),
+ __entry->reason_line)
);

#endif /* _TRACE_TCP_H */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 83e31661876b..7dfcc4253c35 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4643,20 +4643,14 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
return res;
}

-static void __tcp_drop(struct sock *sk,
- struct sk_buff *skb)
-{
- sk_drops_add(sk, skb);
- __kfree_skb(skb);
-}

-/* tcp_drop whit reason,for epbf trace
+/* tcp_drop with reason
*/
static void tcp_drop(struct sock *sk, struct sk_buff *skb,
- enum tcp_drop_reason reason)
+ __u32 reason)
{
trace_tcp_drop(sk, skb, reason);
- __tcp_drop(sk, skb);
+ __kfree_skb(skb);
}

/* This one checks to see if we can put data from the
@@ -5621,7 +5615,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
+ goto end;
}
/* Reset is accepted even if it did not pass PAWS. */
}
@@ -5644,7 +5639,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
} else if (tcp_reset_check(sk, skb)) {
tcp_reset(sk, skb);
}
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
+ goto end;
}

/* Step 2: check RST bit */
@@ -5689,7 +5685,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
tcp_fastopen_active_disable(sk);
tcp_send_challenge_ack(sk, skb);
}
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
+ goto end;
}

/* step 3: check security and precedence [ignored] */
@@ -5703,15 +5700,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
tcp_send_challenge_ack(sk, skb);
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
+ goto end;
}

bpf_skops_parse_hdr(sk, skb);

return true;

-discard:
- tcp_drop(sk, skb, TCP_VALIDATE_INCOMING);
+end:
return false;
}

@@ -5829,7 +5826,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
return;
} else { /* Header too small */
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_ESTABLISHED));
+ goto end;
}
} else {
int eaten = 0;
@@ -5883,8 +5881,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;

- if (!th->ack && !th->rst && !th->syn)
- goto discard;
+ if (!th->ack && !th->rst && !th->syn) {
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_ESTABLISHED));
+ goto end;
+ }

/*
* Standard slow path.
@@ -5894,8 +5894,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
return;

step5:
- if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
- goto discard;
+ if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) {
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_ESTABLISHED));
+ goto end;
+ }

tcp_rcv_rtt_measure_ts(sk, skb);

@@ -5913,9 +5915,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
-
-discard:
- tcp_drop(sk, skb, TCP_RCV_ESTABLISHED);
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_ESTABLISHED));
+end:
+ return;
}
EXPORT_SYMBOL(tcp_rcv_established);

@@ -6115,7 +6117,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,

if (th->rst) {
tcp_reset(sk, skb);
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_SYNSENT_STATE_PROCESS));
+ goto end;
}

/* rfc793:
@@ -6204,9 +6207,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_SYNSENT_STATE_PROCESS));

-discard:
- tcp_drop(sk, skb, TCP_RCV_SYNSENT_STATE_PROCESS);
+end:
return 0;
} else {
tcp_send_ack(sk);
@@ -6279,7 +6282,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
return -1;
#else
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_SYNSENT_STATE_PROCESS));
+ goto end;
#endif
}
/* "fifth, if neither of the SYN or RST bits is set then
@@ -6289,7 +6293,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
discard_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_SYNSENT_STATE_PROCESS));
+ goto end;

reset_and_undo:
tcp_clear_options(&tp->rx_opt);
@@ -6347,18 +6352,23 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)

switch (sk->sk_state) {
case TCP_CLOSE:
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;

case TCP_LISTEN:
if (th->ack)
return 1;

- if (th->rst)
- goto discard;
+ if (th->rst) {
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
+ }

if (th->syn) {
- if (th->fin)
- goto discard;
+ if (th->fin) {
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
+ }
/* It is possible that we process SYN packets from backlog,
* so we need to make sure to disable BH and RCU right there.
*/
@@ -6373,7 +6383,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
consume_skb(skb);
return 0;
}
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;

case TCP_SYN_SENT:
tp->rx_opt.saw_tstamp = 0;
@@ -6399,12 +6410,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);

- if (!tcp_check_req(sk, skb, req, true, &req_stolen))
- goto discard;
+ if (!tcp_check_req(sk, skb, req, true, &req_stolen)) {
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
+ }
}

- if (!th->ack && !th->rst && !th->syn)
- goto discard;
+ if (!th->ack && !th->rst && !th->syn) {
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
+ }

if (!tcp_validate_incoming(sk, skb, th, 0))
return 0;
@@ -6418,7 +6433,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_SYN_RECV)
return 1; /* send one RST */
tcp_send_challenge_ack(sk, skb);
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
}
switch (sk->sk_state) {
case TCP_SYN_RECV:
@@ -6511,7 +6527,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
}
break;
}
@@ -6519,7 +6536,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
}
break;

@@ -6527,7 +6545,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
- goto discard;
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
}
break;
}
@@ -6544,8 +6563,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
/* If a subflow has been reset, the packet should not
* continue to be processed, drop the packet.
*/
- if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
- goto discard;
+ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
+ goto end;
+ }
break;
}
fallthrough;
@@ -6577,9 +6598,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}

if (!queued) {
-discard:
- tcp_drop(sk, skb, TCP_RCV_STATE_PROCESS);
+ tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_RCV_STATE_PROCESS));
}
+
+end:
return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
--
2.25.1


2021-08-25 16:05:22

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

On Wed, Aug 25, 2021 at 8:41 AM Zhongya Yan <[email protected]> wrote:
>
> In this version, fix and modify some code issues. Changed the reason for `tcp_drop` from an enumeration to a mask and enumeration usage in the trace output.
> By shifting `__LINE__` left by 6 bits to accommodate the `tcp_drop` call method source, 6 bits are enough to use. This allows for a more granular identification of the reason for calling `tcp_drop` without conflicts and essentially without overflow.
> Example.
> ```


...

*/
> @@ -5703,15 +5700,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
> TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
> tcp_send_challenge_ack(sk, skb);
> - goto discard;
> + tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));

I'd rather use a string. So that we can more easily identify _why_ the
packet was drop, without looking at the source code
of the exact kernel version to locate line number 1057

You can be sure that we will get reports in the future from users of
heavily modified kernels.
Having to download a git tree, or apply semi-private patches is a no go.

If you really want to include __FILE__ and __LINE__, these both can be
stringified and included in the report, with the help of macros.

2021-08-25 16:14:19

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

On Wed, 25 Aug 2021 08:47:46 -0700 Eric Dumazet wrote:
> On Wed, Aug 25, 2021 at 8:41 AM Zhongya Yan <[email protected]> wrote:
> > @@ -5703,15 +5700,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
> > TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> > NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
> > tcp_send_challenge_ack(sk, skb);
> > - goto discard;
> > + tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
>
> I'd rather use a string. So that we can more easily identify _why_ the
> packet was drop, without looking at the source code
> of the exact kernel version to locate line number 1057

Yeah, the line number seems like a particularly bad idea. Hopefully
strings won't be problematic, given we can expect most serious users
to feed the tracepoints via BPF. enum would be more convenient there,
I'd think.

> You can be sure that we will get reports in the future from users of
> heavily modified kernels.
> Having to download a git tree, or apply semi-private patches is a no go.

I'm slightly surprised by this angle. Are there downstream kernels with
heavily modified TCP other than Google's?

> If you really want to include __FILE__ and __LINE__, these both can be
> stringified and included in the report, with the help of macros.

2021-08-25 16:27:47

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

On Wed, Aug 25, 2021 at 9:04 AM Jakub Kicinski <[email protected]> wrote:
>
> On Wed, 25 Aug 2021 08:47:46 -0700 Eric Dumazet wrote:
> > On Wed, Aug 25, 2021 at 8:41 AM Zhongya Yan <[email protected]> wrote:
> > > @@ -5703,15 +5700,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
> > > TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> > > NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
> > > tcp_send_challenge_ack(sk, skb);
> > > - goto discard;
> > > + tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
> >
> > I'd rather use a string. So that we can more easily identify _why_ the
> > packet was drop, without looking at the source code
> > of the exact kernel version to locate line number 1057
>
> Yeah, the line number seems like a particularly bad idea. Hopefully
> strings won't be problematic, given we can expect most serious users
> to feed the tracepoints via BPF. enum would be more convenient there,
> I'd think.
>
> > You can be sure that we will get reports in the future from users of
> > heavily modified kernels.
> > Having to download a git tree, or apply semi-private patches is a no go.
>
> I'm slightly surprised by this angle. Are there downstream kernels with
> heavily modified TCP other than Google's?

Not sure why Google is mentioned here ?
Have you ever received a public report about TCP behavior in a Google kernel ?

Over the years, we received hundreds of TCP bug reports on
netdev@vger, where users claim to use kernel version 4.19 (or other),
when in fact they use 4.19.xxx
It takes in general multiple emails exchange before we get a more
realistic version number.
Not to mention distro kernels, or even worse private kernels, which
are not exactly easy to track for us upstream developers.

2021-08-25 17:10:45

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

On Wed, 25 Aug 2021 09:20:37 -0700 Eric Dumazet wrote:
> On Wed, Aug 25, 2021 at 9:04 AM Jakub Kicinski <[email protected]> wrote:
> > On Wed, 25 Aug 2021 08:47:46 -0700 Eric Dumazet wrote:
> > > I'd rather use a string. So that we can more easily identify _why_ the
> > > packet was drop, without looking at the source code
> > > of the exact kernel version to locate line number 1057
> >
> > Yeah, the line number seems like a particularly bad idea. Hopefully
> > strings won't be problematic, given we can expect most serious users
> > to feed the tracepoints via BPF. enum would be more convenient there,
> > I'd think.
> >
> > > You can be sure that we will get reports in the future from users of
> > > heavily modified kernels.
> > > Having to download a git tree, or apply semi-private patches is a no go.
> >
> > I'm slightly surprised by this angle. Are there downstream kernels with
> > heavily modified TCP other than Google's?
>
> Not sure why Google is mentioned here ?
> Have you ever received a public report about TCP behavior in a Google kernel ?

That's a rhetorical question quite likely, but to be clear - what
I meant is that Google is the main contributor to Linux TCP and has
the expertise to make changes. I don't know of any others hence the
question.

> Over the years, we received hundreds of TCP bug reports on
> netdev@vger, where users claim to use kernel version 4.19 (or other),
> when in fact they use 4.19.xxx
> It takes in general multiple emails exchange before we get a more
> realistic version number.
> Not to mention distro kernels, or even worse private kernels, which
> are not exactly easy to track for us upstream developers.

Right but for backports values come from original patch, enum or string.

I don't mean to dispute your preference tho, if you want strings,
strings it is.

2021-08-26 03:22:43

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

On Wed, 25 Aug 2021 08:47:46 -0700
Eric Dumazet <[email protected]> wrote:

> > @@ -5703,15 +5700,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
> > TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> > NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
> > tcp_send_challenge_ack(sk, skb);
> > - goto discard;
> > + tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
>
> I'd rather use a string. So that we can more easily identify _why_ the
> packet was drop, without looking at the source code
> of the exact kernel version to locate line number 1057
>
> You can be sure that we will get reports in the future from users of
> heavily modified kernels.
> Having to download a git tree, or apply semi-private patches is a no go.
>
> If you really want to include __FILE__ and __LINE__, these both can be
> stringified and included in the report, with the help of macros.

I agree the __LINE__ is pointless, but if this has a tracepoint
involved, then you can simply enable the stacktrace trigger to it and
it will save a stack trace in the ring buffer for you.

echo stacktrace > /sys/kernel/tracing/events/tcp/tcp_drop/trigger

And when the event triggers it will record a stack trace. You can also
even add a filter to do it only for specific reasons.

echo 'stacktrace if reason == 1' > /sys/kernel/tracing/events/tcp/tcp_drop/trigger

And it even works for flags:

echo 'stacktrace if reason & 0xa' > /sys/kernel/tracing/events/tcp/tcp_drop/trigger

Which gives another reason to use an enum over a string.

-- Steve

2021-08-26 05:16:19

by Brendan Gregg

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

On Thu, Aug 26, 2021 at 1:20 PM Steven Rostedt <[email protected]> wrote:
>
> On Wed, 25 Aug 2021 08:47:46 -0700
> Eric Dumazet <[email protected]> wrote:
>
> > > @@ -5703,15 +5700,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
> > > TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> > > NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
> > > tcp_send_challenge_ack(sk, skb);
> > > - goto discard;
> > > + tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
> >
> > I'd rather use a string. So that we can more easily identify _why_ the
> > packet was drop, without looking at the source code
> > of the exact kernel version to locate line number 1057
> >
> > You can be sure that we will get reports in the future from users of
> > heavily modified kernels.
> > Having to download a git tree, or apply semi-private patches is a no go.
> >
> > If you really want to include __FILE__ and __LINE__, these both can be
> > stringified and included in the report, with the help of macros.
>
> I agree the __LINE__ is pointless, but if this has a tracepoint
> involved, then you can simply enable the stacktrace trigger to it and
> it will save a stack trace in the ring buffer for you.
>
> echo stacktrace > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
>
> And when the event triggers it will record a stack trace. You can also
> even add a filter to do it only for specific reasons.
>
> echo 'stacktrace if reason == 1' > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
>
> And it even works for flags:
>
> echo 'stacktrace if reason & 0xa' > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
>
> Which gives another reason to use an enum over a string.

You can't do string comparisons? The more string support Ftrace has,
the more convenient they will be. Using bpftrace as an example of
convenience and showing drop frequency counted by human-readable
reason and stack trace:

# bpftrace -e 'k:tcp_drop { @[str(arg2), kstack] = count(); }'

Don't need further translation beyond the str(arg2). And filtering on
backlog drops:

bpftrace -e 'k:tcp_drop /str(arg2) == "SYN backlog drop"/ { @[kstack]
= count(); }'

etc. (Although ultimately we'll want a tracepoint added in tcp_drop
with those arguments.) If it's a enum I'll need to translate it back,
and deal with enum additions that my tool might not be coded for. I
can do it, it just needs maintenance, e.g. [0]. Plus the kernel code
needs maintenance. For a narrow observability use case, it starts to
feel like overkill to maintain an enum.

I wouldn't mind an optional _additional_ reason argument that's the
enum SNMP counter if appropriate. E.g.:

tcpdrop(sk, skb, "Accept backlog full", LINUX_MIB_LISTENDROPS);
tcpdrop(sk, skb, "No route", LINUX_MIB_LISTENDROPS);

So you could trace LINUX_MIB_LISTENDROPS and see different string
reasons for each different code path.

I don't feel strongly about having __LINE__. I'd look it up from the
stack trace anyway.

Brendan

[0] https://github.com/brendangregg/bpf-perf-tools-book/blob/master/originals/Ch16_Hypervisors/kvmexits.bt

2021-09-01 14:51:52

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

On Thu, 26 Aug 2021 15:13:07 +1000
Brendan Gregg <[email protected]> wrote:

> On Thu, Aug 26, 2021 at 1:20 PM Steven Rostedt <[email protected]> wrote:
> >
> > On Wed, 25 Aug 2021 08:47:46 -0700
> > Eric Dumazet <[email protected]> wrote:
> >
> > > > @@ -5703,15 +5700,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
> > > > TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> > > > NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
> > > > tcp_send_challenge_ack(sk, skb);
> > > > - goto discard;
> > > > + tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
> > >
> > > I'd rather use a string. So that we can more easily identify _why_ the
> > > packet was drop, without looking at the source code
> > > of the exact kernel version to locate line number 1057
> > >
> > > You can be sure that we will get reports in the future from users of
> > > heavily modified kernels.
> > > Having to download a git tree, or apply semi-private patches is a no go.
> > >
> > > If you really want to include __FILE__ and __LINE__, these both can be
> > > stringified and included in the report, with the help of macros.
> >
> > I agree the __LINE__ is pointless, but if this has a tracepoint
> > involved, then you can simply enable the stacktrace trigger to it and
> > it will save a stack trace in the ring buffer for you.
> >
> > echo stacktrace > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
> >
> > And when the event triggers it will record a stack trace. You can also
> > even add a filter to do it only for specific reasons.
> >
> > echo 'stacktrace if reason == 1' > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
> >
> > And it even works for flags:
> >
> > echo 'stacktrace if reason & 0xa' > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
> >
> > Which gives another reason to use an enum over a string.
>
> You can't do string comparisons? The more string support Ftrace has,
> the more convenient they will be. Using bpftrace as an example of
> convenience and showing drop frequency counted by human-readable
> reason and stack trace:

Yes, you can (and pretty much always had this ability), but having
flags is usually makes it easier (and faster).

You can have 'stacktrace if reason ~ "*string*"' which will match
anything with "string" in it.

My main argument against strings is more of the space they take up in
the ring buffer than the ability to filter.

-- Steve

--
Sent from my Android device with K-9 Mail. Please excuse my brevity and top posting.

2021-09-01 15:27:07

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2

On Wed, Sep 1, 2021 at 7:36 AM Steven Rostedt <[email protected]> wrote:
>
> On Thu, 26 Aug 2021 15:13:07 +1000
> Brendan Gregg <[email protected]> wrote:
>
> > On Thu, Aug 26, 2021 at 1:20 PM Steven Rostedt <[email protected]> wrote:
> > >
> > > On Wed, 25 Aug 2021 08:47:46 -0700
> > > Eric Dumazet <[email protected]> wrote:
> > >
> > > > > @@ -5703,15 +5700,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
> > > > > TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> > > > > NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
> > > > > tcp_send_challenge_ack(sk, skb);
> > > > > - goto discard;
> > > > > + tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__, TCP_VALIDATE_INCOMING));
> > > >
> > > > I'd rather use a string. So that we can more easily identify _why_ the
> > > > packet was drop, without looking at the source code
> > > > of the exact kernel version to locate line number 1057
> > > >
> > > > You can be sure that we will get reports in the future from users of
> > > > heavily modified kernels.
> > > > Having to download a git tree, or apply semi-private patches is a no go.
> > > >
> > > > If you really want to include __FILE__ and __LINE__, these both can be
> > > > stringified and included in the report, with the help of macros.
> > >
> > > I agree the __LINE__ is pointless, but if this has a tracepoint
> > > involved, then you can simply enable the stacktrace trigger to it and
> > > it will save a stack trace in the ring buffer for you.
> > >
> > > echo stacktrace > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
> > >
> > > And when the event triggers it will record a stack trace. You can also
> > > even add a filter to do it only for specific reasons.
> > >
> > > echo 'stacktrace if reason == 1' > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
> > >
> > > And it even works for flags:
> > >
> > > echo 'stacktrace if reason & 0xa' > /sys/kernel/tracing/events/tcp/tcp_drop/trigger
> > >
> > > Which gives another reason to use an enum over a string.
> >
> > You can't do string comparisons? The more string support Ftrace has,
> > the more convenient they will be. Using bpftrace as an example of
> > convenience and showing drop frequency counted by human-readable
> > reason and stack trace:
>
> Yes, you can (and pretty much always had this ability), but having
> flags is usually makes it easier (and faster).
>
> You can have 'stacktrace if reason ~ "*string*"' which will match
> anything with "string" in it.
>
> My main argument against strings is more of the space they take up in
> the ring buffer than the ability to filter.

Understood the concern about size, but it seems the trace includes many things.
Can we have an estimate of the size needed per event ?
If we do not use symbolic, but numbers, I am afraid this trace event
will only be used by a few TCP experts.

+ TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d
snd_nxt=%#x snd_una=%#x \
+ snd_cwnd=%u ssthresh=%u snd_wnd=%u
srtt=%u rcv_wnd=%u \
+ sock_cookie=%llx reason=%d
reason_type=%s reason_line=%d",
__entry->saddr, __entry->daddr, __entry->mark,
__entry->data_len, __entry->snd_nxt,
__entry->snd_una,
__entry->snd_cwnd, __entry->ssthresh,
__entry->snd_wnd,
- __entry->srtt, __entry->rcv_wnd,
__entry->sock_cookie, __entry->reason)
+ __entry->srtt, __entry->rcv_wnd,
__entry->sock_cookie,
+ __entry->reason,
+ __print_symbolic(__entry->reason_code,
TCP_DROP_REASON),
+ __entry->reason_line)
);

2021-09-01 17:47:25

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] net: tcp_drop adds `reason` parameter for tracing v2



On September 1, 2021 11:20:58 AM EDT, Eric Dumazet <[email protected]> wrote:
>On Wed, Sep 1, 2021 at 7:36 AM Steven Rostedt <[email protected]>
>wrote:
>>
>> On Thu, 26 Aug 2021 15:13:07 +1000
>> Brendan Gregg <[email protected]> wrote:
>>
>> > On Thu, Aug 26, 2021 at 1:20 PM Steven Rostedt
><[email protected]> wrote:
>> > >
>> > > On Wed, 25 Aug 2021 08:47:46 -0700
>> > > Eric Dumazet <[email protected]> wrote:
>> > >
>> > > > > @@ -5703,15 +5700,15 @@ static bool
>tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
>> > > > > TCP_INC_STATS(sock_net(sk),
>TCP_MIB_INERRS);
>> > > > > NET_INC_STATS(sock_net(sk),
>LINUX_MIB_TCPSYNCHALLENGE);
>> > > > > tcp_send_challenge_ack(sk, skb);
>> > > > > - goto discard;
>> > > > > + tcp_drop(sk, skb, TCP_DROP_MASK(__LINE__,
>TCP_VALIDATE_INCOMING));
>> > > >
>> > > > I'd rather use a string. So that we can more easily identify
>_why_ the
>> > > > packet was drop, without looking at the source code
>> > > > of the exact kernel version to locate line number 1057
>> > > >
>> > > > You can be sure that we will get reports in the future from
>users of
>> > > > heavily modified kernels.
>> > > > Having to download a git tree, or apply semi-private patches is
>a no go.
>> > > >
>> > > > If you really want to include __FILE__ and __LINE__, these both
>can be
>> > > > stringified and included in the report, with the help of
>macros.
>> > >
>> > > I agree the __LINE__ is pointless, but if this has a tracepoint
>> > > involved, then you can simply enable the stacktrace trigger to it
>and
>> > > it will save a stack trace in the ring buffer for you.
>> > >
>> > > echo stacktrace >
>/sys/kernel/tracing/events/tcp/tcp_drop/trigger
>> > >
>> > > And when the event triggers it will record a stack trace. You can
>also
>> > > even add a filter to do it only for specific reasons.
>> > >
>> > > echo 'stacktrace if reason == 1' >
>/sys/kernel/tracing/events/tcp/tcp_drop/trigger
>> > >
>> > > And it even works for flags:
>> > >
>> > > echo 'stacktrace if reason & 0xa' >
>/sys/kernel/tracing/events/tcp/tcp_drop/trigger
>> > >
>> > > Which gives another reason to use an enum over a string.
>> >
>> > You can't do string comparisons? The more string support Ftrace
>has,
>> > the more convenient they will be. Using bpftrace as an example of
>> > convenience and showing drop frequency counted by human-readable
>> > reason and stack trace:
>>
>> Yes, you can (and pretty much always had this ability), but having
>> flags is usually makes it easier (and faster).
>>
>> You can have 'stacktrace if reason ~ "*string*"' which will match
>> anything with "string" in it.
>>
>> My main argument against strings is more of the space they take up in
>> the ring buffer than the ability to filter.
>
>Understood the concern about size, but it seems the trace includes many
>things.
>Can we have an estimate of the size needed per event ?
>If we do not use symbolic, but numbers, I am afraid this trace event
>will only be used by a few TCP experts.

Note, the output is still text, not numeric, as the __print_symbolic() macro will convert the numbers to text.

Or is there another issue you have with the enum?

-- Steve

>
>+ TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d
>snd_nxt=%#x snd_una=%#x \
>+ snd_cwnd=%u ssthresh=%u snd_wnd=%u
>srtt=%u rcv_wnd=%u \
>+ sock_cookie=%llx reason=%d
>reason_type=%s reason_line=%d",
> __entry->saddr, __entry->daddr, __entry->mark,
> __entry->data_len, __entry->snd_nxt,
>__entry->snd_una,
> __entry->snd_cwnd, __entry->ssthresh,
>__entry->snd_wnd,
>- __entry->srtt, __entry->rcv_wnd,
>__entry->sock_cookie, __entry->reason)
>+ __entry->srtt, __entry->rcv_wnd,
>__entry->sock_cookie,
>+ __entry->reason,
>+ __print_symbolic(__entry->reason_code,
>TCP_DROP_REASON),
>+ __entry->reason_line)
> );

--
Sent from my Android device with K-9 Mail. Please excuse my brevity and top posting.