There are cases where we need information about the socket during a
warning, so that it can help us find bugs that happen but do not have
an easy repro.
This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
dumps more information about the TCP socket.
This new warning is not only useful to give more insight into kernel bugs, but
it is also helpful to expose information that might be coming from buggy
BPF applications, such as BPF applications that set invalid
tcp_sock->snd_cwnd values.
Signed-off-by: Breno Leitao <[email protected]>
---
include/net/tcp.h | 3 ++-
include/net/tcp_debug.h | 10 ++++++++++
net/ipv4/tcp.c | 30 ++++++++++++++++++++++++++++++
3 files changed, 42 insertions(+), 1 deletion(-)
create mode 100644 include/net/tcp_debug.h
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 14d45661a84d..e490af8e6fdc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -40,6 +40,7 @@
#include <net/inet_ecn.h>
#include <net/dst.h>
#include <net/mptcp.h>
+#include <net/tcp_debug.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
@@ -1229,7 +1230,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
{
- WARN_ON_ONCE((int)val <= 0);
+ TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
tp->snd_cwnd = val;
}
diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
new file mode 100644
index 000000000000..50e96d87d335
--- /dev/null
+++ b/include/net/tcp_debug.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TCP_DEBUG_H
+#define _LINUX_TCP_DEBUG_H
+
+void tcp_sock_warn(const struct tcp_sock *tp);
+
+#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
+ DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
+
+#endif /* _LINUX_TCP_DEBUG_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 54836a6b81d6..dd682f60c7cb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
}
EXPORT_SYMBOL_GPL(tcp_abort);
+void tcp_sock_warn(const struct tcp_sock *tp)
+{
+ const struct sock *sk = (const struct sock *)tp;
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ WARN_ON(1);
+
+ if (!tp)
+ return;
+
+ pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
+ sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
+ ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
+ &inet->inet_daddr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
+ &sk->sk_v6_daddr);
+ break;
+#endif
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_sock_warn);
+
extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
--
2.30.2
From: Breno Leitao <[email protected]>
Date: Thu, 24 Nov 2022 03:22:29 -0800
> There are cases where we need information about the socket during a
> warning, so, it could help us to find bugs that happens and do not have
> an easy repro.
>
> This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
> dumps more information about the TCP socket.
>
> This new warning is not only useful to give more insight about kernel bugs, but,
> it is also helpful to expose information that might be coming from buggy
> BPF applications, such as BPF applications that sets invalid
> tcp_sock->snd_cwnd values.
Have you finally found a root cause on BPF or TCP side ?
> Signed-off-by: Breno Leitao <[email protected]>
> ---
> include/net/tcp.h | 3 ++-
> include/net/tcp_debug.h | 10 ++++++++++
> net/ipv4/tcp.c | 30 ++++++++++++++++++++++++++++++
> 3 files changed, 42 insertions(+), 1 deletion(-)
> create mode 100644 include/net/tcp_debug.h
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 14d45661a84d..e490af8e6fdc 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -40,6 +40,7 @@
> #include <net/inet_ecn.h>
> #include <net/dst.h>
> #include <net/mptcp.h>
> +#include <net/tcp_debug.h>
>
> #include <linux/seq_file.h>
> #include <linux/memcontrol.h>
> @@ -1229,7 +1230,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
>
> static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
> {
> - WARN_ON_ONCE((int)val <= 0);
> + TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
> tp->snd_cwnd = val;
> }
>
> diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
> new file mode 100644
> index 000000000000..50e96d87d335
> --- /dev/null
> +++ b/include/net/tcp_debug.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_TCP_DEBUG_H
> +#define _LINUX_TCP_DEBUG_H
> +
> +void tcp_sock_warn(const struct tcp_sock *tp);
> +
> +#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
> + DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
> +
> +#endif /* _LINUX_TCP_DEBUG_H */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 54836a6b81d6..dd682f60c7cb 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
> }
> EXPORT_SYMBOL_GPL(tcp_abort);
>
> +void tcp_sock_warn(const struct tcp_sock *tp)
> +{
> + const struct sock *sk = (const struct sock *)tp;
> + struct inet_sock *inet = inet_sk(sk);
> + struct inet_connection_sock *icsk = inet_csk(sk);
> +
> + WARN_ON(1);
> +
> + if (!tp)
Is this needed ?
> + return;
> +
> + pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
> + sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
> + ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
> +
> + switch (sk->sk_family) {
> + case AF_INET:
> + pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
> + &inet->inet_daddr);
As with tcp_syn_flood_action(), wouldn't the [address]:port format be
easier to read and more consistent with the rest of the kernel?
> + break;
> +#if IS_ENABLED(CONFIG_IPV6)
> + case AF_INET6:
> + pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
> + &sk->sk_v6_daddr);
> + break;
> +#endif
> + }
> +}
> +EXPORT_SYMBOL_GPL(tcp_sock_warn);
> +
> extern struct tcp_congestion_ops tcp_reno;
>
> static __initdata unsigned long thash_entries;
> --
> 2.30.2
Hello,
On Thu, 2022-11-24 at 03:22 -0800, Breno Leitao wrote:
> There are cases where we need information about the socket during a
> warning, so, it could help us to find bugs that happens and do not have
> an easy repro.
>
> This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
> dumps more information about the TCP socket.
>
> This new warning is not only useful to give more insight about kernel bugs, but,
> it is also helpful to expose information that might be coming from buggy
> BPF applications, such as BPF applications that sets invalid
> tcp_sock->snd_cwnd values.
I personally find this use-case a little too narrow; you could likely
fetch the same information with a perf probe or something similar.
> Signed-off-by: Breno Leitao <[email protected]>
> ---
> include/net/tcp.h | 3 ++-
> include/net/tcp_debug.h | 10 ++++++++++
> net/ipv4/tcp.c | 30 ++++++++++++++++++++++++++++++
> 3 files changed, 42 insertions(+), 1 deletion(-)
> create mode 100644 include/net/tcp_debug.h
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 14d45661a84d..e490af8e6fdc 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -40,6 +40,7 @@
> #include <net/inet_ecn.h>
> #include <net/dst.h>
> #include <net/mptcp.h>
> +#include <net/tcp_debug.h>
>
> #include <linux/seq_file.h>
> #include <linux/memcontrol.h>
> @@ -1229,7 +1230,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
>
> static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
> {
> - WARN_ON_ONCE((int)val <= 0);
> + TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
> tp->snd_cwnd = val;
> }
>
> diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
> new file mode 100644
> index 000000000000..50e96d87d335
> --- /dev/null
> +++ b/include/net/tcp_debug.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_TCP_DEBUG_H
> +#define _LINUX_TCP_DEBUG_H
> +
> +void tcp_sock_warn(const struct tcp_sock *tp);
> +
> +#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
> + DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
> +
> +#endif /* _LINUX_TCP_DEBUG_H */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 54836a6b81d6..dd682f60c7cb 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
> }
> EXPORT_SYMBOL_GPL(tcp_abort);
>
> +void tcp_sock_warn(const struct tcp_sock *tp)
> +{
> + const struct sock *sk = (const struct sock *)tp;
> + struct inet_sock *inet = inet_sk(sk);
> + struct inet_connection_sock *icsk = inet_csk(sk);
> +
> + WARN_ON(1);
> +
> + if (!tp)
> + return;
> +
> + pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
> + sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
> + ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
> +
> + switch (sk->sk_family) {
> + case AF_INET:
> + pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
> + &inet->inet_daddr);
> + break;
> +#if IS_ENABLED(CONFIG_IPV6)
> + case AF_INET6:
> + pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
> + &sk->sk_v6_daddr);
> + break;
> +#endif
Please, adjust the output format as suggested by Kuniyuki,
thanks!
Paolo
On Tue, Nov 29, 2022 at 10:00:55AM +0900, Kuniyuki Iwashima wrote:
> From: Breno Leitao <[email protected]>
> Date: Thu, 24 Nov 2022 03:22:29 -0800
> > There are cases where we need information about the socket during a
> > warning, so, it could help us to find bugs that happens and do not have
> > an easy repro.
> >
> > This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
> > dumps more information about the TCP socket.
> >
> > This new warning is not only useful to give more insight about kernel bugs, but,
> > it is also helpful to expose information that might be coming from buggy
> > BPF applications, such as BPF applications that sets invalid
> > tcp_sock->snd_cwnd values.
>
> Have you finally found a root cause on BPF or TCP side ?
Yes, this has proven to be very useful for finding BPF applications
that are doing nasty things with the congestion window.
We currently have this patch applied to Meta's infrastructure to track
BPF applications that are misbehaving, and to easily track down which
BPF application is responsible.
> > +#endif /* _LINUX_TCP_DEBUG_H */
> > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> > index 54836a6b81d6..dd682f60c7cb 100644
> > --- a/net/ipv4/tcp.c
> > +++ b/net/ipv4/tcp.c
> > @@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
> > }
> > EXPORT_SYMBOL_GPL(tcp_abort);
> >
> > +void tcp_sock_warn(const struct tcp_sock *tp)
> > +{
> > + const struct sock *sk = (const struct sock *)tp;
> > + struct inet_sock *inet = inet_sk(sk);
> > + struct inet_connection_sock *icsk = inet_csk(sk);
> > +
> > + WARN_ON(1);
> > +
> > + if (!tp)
>
> Is this needed ?
We are dereferencing tp/sk in the lines below, so I think it is safer to
check that they are not NULL before dereferencing them.
Should I check "sk" instead of "tp" to make the code a bit
cleaner to read?
> > + pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
> > + sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
> > + ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
> > +
> > + switch (sk->sk_family) {
> > + case AF_INET:
> > + pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
> > + &inet->inet_daddr);
>
> As with tcp_syn_flood_action(), [address]:port format is easy
> to read and consistent in kernel ?
Absolutely. I am going to fix it in v2. Thanks!
> On Nov 29, 2022, at 21:48, Breno Leitao <[email protected]> wrote:
>> On Tue, Nov 29, 2022 at 10:00:55AM +0900, Kuniyuki Iwashima wrote:
>> From: Breno Leitao <[email protected]>
>> Date: Thu, 24 Nov 2022 03:22:29 -0800
>>> There are cases where we need information about the socket during a
>>> warning, so, it could help us to find bugs that happens and do not have
>>> an easy repro.
>>>
>>> This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
>>> dumps more information about the TCP socket.
>>>
>>> This new warning is not only useful to give more insight about kernel bugs, but,
>>> it is also helpful to expose information that might be coming from buggy
>>> BPF applications, such as BPF applications that sets invalid
>>> tcp_sock->snd_cwnd values.
>>
>> Have you finally found a root cause on BPF or TCP side ?
>
> Yes, this demonstrated to be very useful to find out BPF applications
> that are doing nasty things with the congestion window.
>
> We currently have this patch applied to Meta's infrastructure to track
> BPF applications that are misbehaving, and easily track down to which
> BPF application is the responsible one.
If you have a fix merged on the BPF side,
it would be helpful to mention the commit, to
better understand the issue, the background,
and why other tooling is not enough, as Paolo wondered.
>>> +#endif /* _LINUX_TCP_DEBUG_H */
>>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>>> index 54836a6b81d6..dd682f60c7cb 100644
>>> --- a/net/ipv4/tcp.c
>>> +++ b/net/ipv4/tcp.c
>>> @@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
>>> }
>>> EXPORT_SYMBOL_GPL(tcp_abort);
>>>
>>> +void tcp_sock_warn(const struct tcp_sock *tp)
>>> +{
>>> + const struct sock *sk = (const struct sock *)tp;
>>> + struct inet_sock *inet = inet_sk(sk);
>>> + struct inet_connection_sock *icsk = inet_csk(sk);
>>> +
>>> + WARN_ON(1);
>>> +
>>> + if (!tp)
>>
>> Is this needed ?
>
> We are de-referencing tp/sk in the lines below, so, I think it is safe to
> check if they are not NULL before the de-refencing it.
tp->snd_cwnd is accessed just after this WARN,
so I thought there were no cases where tp is NULL.
If it exists, KASAN should be complaining.
I think this additional `if` could confuse future readers, and I
want to make sure whether such a case actually exists.
Thank you!
>
> Should I do check for "ck" instead of "tp" to make the code a bit
> cleaner to read?
>
>>> + pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
>>> + sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
>>> + ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
>>> +
>>> + switch (sk->sk_family) {
>>> + case AF_INET:
>>> + pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
>>> + &inet->inet_daddr);
>>
>> As with tcp_syn_flood_action(), [address]:port format is easy
>> to read and consistent in kernel ?
>
> Absolutely. I am going to fix it in v2. Thanks!
On Tue, 29 Nov 2022 11:18:27 +0100 Paolo Abeni wrote:
> On Thu, 2022-11-24 at 03:22 -0800, Breno Leitao wrote:
> > There are cases where we need information about the socket during a
> > warning, so, it could help us to find bugs that happens and do not have
> > an easy repro.
> >
> > This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
> > dumps more information about the TCP socket.
> >
> > This new warning is not only useful to give more insight about kernel bugs, but,
> > it is also helpful to expose information that might be coming from buggy
> > BPF applications, such as BPF applications that sets invalid
> > tcp_sock->snd_cwnd values.
>
> I personally find this use-case a little too tight, you could likelly
> fetch the same information with a perf probe or something similar.
It's just the initial case, to keep the patch small.
The intent is to convert all TCP warnings to this helper.
As Breno says in the first sentence this is about having enough
relevant information to zero in on the cause of the rare crashes /
warnings (which are hit quite a lot on our "millions of machines").
On Tue, Nov 29, 2022 at 09:16:16PM +0000, Iwashima, Kuniyuki wrote:
> > On Nov 29, 2022, at 21:48, Breno Leitao <[email protected]> wrote:
> >> On Tue, Nov 29, 2022 at 10:00:55AM +0900, Kuniyuki Iwashima wrote:
<snip>
> >>> +void tcp_sock_warn(const struct tcp_sock *tp)
> >>> +{
> >>> + const struct sock *sk = (const struct sock *)tp;
> >>> + struct inet_sock *inet = inet_sk(sk);
> >>> + struct inet_connection_sock *icsk = inet_csk(sk);
> >>> +
> >>> + WARN_ON(1);
> >>> +
> >>> + if (!tp)
> >>
> >> Is this needed ?
> >
> > We are de-referencing tp/sk in the lines below, so, I think it is safe to
> > check if they are not NULL before the de-refencing it.
>
> tp->snd_cwnd is accessed just after this WARN,
> so I thought there were no cases where tp is NULL.
Oh, it is important to say that we want to reuse this macro in other places
as well. This initial usage (in tcp_snd_cwnd_set()) is just for the
initial patch. I see value in replacing some WARN_ON_*() calls with
TCP_SOCK_WARN_ON_ONCE() in other parts of the code, so this check is there to
protect this warning when TCP_SOCK_WARN_ON_ONCE() is called from
different places.
Anyway, I can definitely remove the check here, but we might want to
re-add it later, as we replace some WARN_ON_*() calls with TCP_SOCK_WARN_ON_*().
> I think this additional if could confuse future readers and
> want to make sure if there is such a case.
How could checking that a pointer is valid before dereferencing it
confuse readers?
Thank you for the review!