SOCK_DEBUG is a very ancient debugging interface, and it's not very useful
for debugging.
This patchset cleans up SOCK_DEBUG() and replaces it with a new method
based on BPF.
I clean up SOCK_DEBUG() only for TCP, and other protocols are kept as is.
After this patchset, the SO_DEBUG interface will not take any effect for
TCP, but I still keep it in sock_{s,g}etsockopt() for TCP to avoid breaking
applications.
In the future we may extend tcp_stats() as below or something else to
cover all the LINUX_MIB_* and TCP_MIB_* proposed[0] in the netconf2018.
now:
tcp_stats(struct sock *sk, int mib_idx)
future:
tcp_stats(struct sock *sk, int mib_idx, int packets)
The argument packets can be 1 to indicate this is an event only;
and skb_shinfo(skb)->gso_segs to indicate the number of packets
are also concerned.
[0] page 14,
http://vger.kernel.org/netconf2018_files/BrendanGregg_netconf2018.pdf
Yafang Shao (2):
tcp: replace SOCK_DEBUG() with tcp_stats()
bpf: add BPF_SOCK_OPS_STATS_CB for tcp_stats()
include/uapi/linux/bpf.h | 5 +++++
include/uapi/linux/snmp.h | 3 +++
net/ipv4/proc.c | 3 +++
net/ipv4/tcp_input.c | 27 ++++++++++++---------------
net/ipv6/tcp_ipv6.c | 2 --
5 files changed, 23 insertions(+), 17 deletions(-)
--
1.8.3.1
SOCK_DEBUG is a very ancient debugging interface, and it's not very useful
for debugging.
So this patch removes the SOCK_DEBUG() and introduces a new function
tcp_stats() to trace this kind of events.
Some MIBs are added for these events.
Regarding the SO_DEBUG in sock_{s,g}etsockopt, I think it is better to
keep as-is, because if we return an errno to tell the application that
this optname isn't supported for TCP, it may break the application.
The application can still use this option, but it won't take any effect
for TCP.
Signed-off-by: Yafang Shao <[email protected]>
---
include/uapi/linux/snmp.h | 3 +++
net/ipv4/proc.c | 3 +++
net/ipv4/tcp_input.c | 26 +++++++++++---------------
net/ipv6/tcp_ipv6.c | 2 --
4 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 86dc24a..fd5c09c 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -283,6 +283,9 @@ enum
LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */
LINUX_MIB_TCPZEROWINDOWDROP, /* TCPZeroWindowDrop */
LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */
+ LINUX_MIB_TCPINVALIDACK, /* TCPInvalidAck */
+ LINUX_MIB_TCPOLDACK, /* TCPOldAck */
+ LINUX_MIB_TCPPARTIALPACKET, /* TCPPartialPacket */
__LINUX_MIB_MAX
};
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index c3610b3..1b0320a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -291,6 +291,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
+ SNMP_MIB_ITEM("TCPInvalidAck", LINUX_MIB_TCPINVALIDACK),
+ SNMP_MIB_ITEM("TCPOldAck", LINUX_MIB_TCPOLDACK),
+ SNMP_MIB_ITEM("TCPPartialPacket", LINUX_MIB_TCPPARTIALPACKET),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7a027dec..88deb1f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3554,6 +3554,11 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
return delivered;
}
+static void tcp_stats(struct sock *sk, int mib_idx)
+{
+ NET_INC_STATS(sock_net(sk), mib_idx);
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3715,7 +3720,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
invalid_ack:
- SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ tcp_stats(sk, LINUX_MIB_TCPINVALIDACK);
return -1;
old_ack:
@@ -3731,7 +3736,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_xmit_recovery(sk, rexmit);
}
- SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ tcp_stats(sk, LINUX_MIB_TCPOLDACK);
return 0;
}
@@ -4432,13 +4437,10 @@ static void tcp_ofo_queue(struct sock *sk)
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
- SOCK_DEBUG(sk, "ofo packet was already received\n");
+ tcp_stats(sk, LINUX_MIB_TCPOFODROP);
tcp_drop(sk, skb);
continue;
}
- SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
@@ -4499,11 +4501,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, seq, end_seq);
+ tcp_stats(sk, LINUX_MIB_TCPOFOQUEUE);
p = &tp->out_of_order_queue.rb_node;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
@@ -4779,9 +4779,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
- SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
+ tcp_stats(sk, LINUX_MIB_TCPPARTIALPACKET);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
@@ -5061,9 +5059,7 @@ static int tcp_prune_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
-
- NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
+ tcp_stats(sk, LINUX_MIB_PRUNECALLED);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e51cda7..57ef69a1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -220,8 +220,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
- SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
-
if (__ipv6_only_sock(sk))
return -ENETUNREACH;
--
1.8.3.1
Introduce this new op BPF_SOCK_OPS_STATS_CB for tcp_stats() such that it
can be traced via BPF on a per socket basis.
There's one argument in BPF_SOCK_OPS_STATS_CB, which is Linux MIB index
LINUX_MIB_* to indicate the TCP event.
All these Linux MIBs are defined in include/uapi/linux/snmp.h.
Signed-off-by: Yafang Shao <[email protected]>
---
include/uapi/linux/bpf.h | 5 +++++
net/ipv4/tcp_input.c | 1 +
2 files changed, 6 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1777fa0..0314ddd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2894,6 +2894,11 @@ enum {
BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
* socket transition to LISTEN state.
*/
+ BPF_SOCK_OPS_STATS_CB, /*
+ * Called on tcp_stats().
+ * Arg1: Linux MIB index
+ * LINUX_MIB_*
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 88deb1f..4acf458 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3557,6 +3557,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
static void tcp_stats(struct sock *sk, int mib_idx)
{
NET_INC_STATS(sock_net(sk), mib_idx);
+ tcp_call_bpf(sk, BPF_SOCK_OPS_STATS_CB, 1, &mib_idx);
}
/* This routine deals with incoming acks, but not outgoing ones. */
--
1.8.3.1
On 02/12/2019 03:31 AM, Yafang Shao wrote:
> SOCK_DEBUG is a very ancient debugging interface, and it's not very useful
> for debugging.
> So this patch removes the SOCK_DEBUG() and introduce a new function
> tcp_stats() to trace this kind of events.
> Some MIBs are added for these events.
>
> Regarding the SO_DEBUG in sock_{s,g}etsockopt, I think it is better to
> keep as-is, because if we return an errno to tell the application that
> this optname isn't supported for TCP, it may break the application.
> The application still can use this option but don't take any effect for
> TCP.
>
> Signed-off-by: Yafang Shao <[email protected]>
> ---
> include/uapi/linux/snmp.h | 3 +++
> net/ipv4/proc.c | 3 +++
> net/ipv4/tcp_input.c | 26 +++++++++++---------------
> net/ipv6/tcp_ipv6.c | 2 --
> 4 files changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
> index 86dc24a..fd5c09c 100644
> --- a/include/uapi/linux/snmp.h
> +++ b/include/uapi/linux/snmp.h
> @@ -283,6 +283,9 @@ enum
> LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */
> LINUX_MIB_TCPZEROWINDOWDROP, /* TCPZeroWindowDrop */
> LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */
> + LINUX_MIB_TCPINVALIDACK, /* TCPInvalidAck */
> + LINUX_MIB_TCPOLDACK, /* TCPOldAck */
> + LINUX_MIB_TCPPARTIALPACKET, /* TCPPartialPacket */
> __LINUX_MIB_MAX
> };
>
> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
> index c3610b3..1b0320a 100644
> --- a/net/ipv4/proc.c
> +++ b/net/ipv4/proc.c
> @@ -291,6 +291,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
> SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
> SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
> SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
> + SNMP_MIB_ITEM("TCPInvalidAck", LINUX_MIB_TCPINVALIDACK),
> + SNMP_MIB_ITEM("TCPOldAck", LINUX_MIB_TCPOLDACK),
> + SNMP_MIB_ITEM("TCPPartialPacket", LINUX_MIB_TCPPARTIALPACKET),
> SNMP_MIB_SENTINEL
> };
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 7a027dec..88deb1f 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3554,6 +3554,11 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
> return delivered;
> }
>
> +static void tcp_stats(struct sock *sk, int mib_idx)
> +{
> + NET_INC_STATS(sock_net(sk), mib_idx);
> +}
This is not a very descriptive name.
Why is it static, and in net/ipv4/tcp_input.c ???
> +
> /* This routine deals with incoming acks, but not outgoing ones. */
> static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
> {
> @@ -3715,7 +3720,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
> return 1;
>
> invalid_ack:
> - SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
> + tcp_stats(sk, LINUX_MIB_TCPINVALIDACK);
> return -1;
>
> old_ack:
> @@ -3731,7 +3736,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
> tcp_xmit_recovery(sk, rexmit);
> }
>
> - SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
> + tcp_stats(sk, LINUX_MIB_TCPOLDACK);
> return 0;
> }
>
These counters will add noise to an already crowded MIB space.
What bug do you expect to track and fix with these ?
I see many TCP patches coming adding icache pressure, enabling companies to build their own modified
TCP stack, but no real meat.
On 02/12/2019 03:31 AM, Yafang Shao wrote:
> Introuce this new op BPF_SOCK_OPS_STATS_CB for tcp_stats() such that it
> can be traced via BPF on a per socket basis.
> There's one argument in BPF_SOCK_OPS_STATS_CB, which is Linux MIB index
> LINUX_MIB_* to indicate the TCP event.
> All these Linux MIBs are defined in include/uapi/linux/snmp.h.
>
> Signed-off-by: Yafang Shao <[email protected]>
> ---
> include/uapi/linux/bpf.h | 5 +++++
> net/ipv4/tcp_input.c | 1 +
> 2 files changed, 6 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 1777fa0..0314ddd 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2894,6 +2894,11 @@ enum {
> BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
> * socket transition to LISTEN state.
> */
> + BPF_SOCK_OPS_STATS_CB, /*
> + * Called on tcp_stats().
> + * Arg1: Linux MIB index
> + * LINUX_MIB_*
> + */
> };
>
> /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 88deb1f..4acf458 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3557,6 +3557,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
> static void tcp_stats(struct sock *sk, int mib_idx)
> {
> NET_INC_STATS(sock_net(sk), mib_idx);
> + tcp_call_bpf(sk, BPF_SOCK_OPS_STATS_CB, 1, &mib_idx);
> }
>
> /* This routine deals with incoming acks, but not outgoing ones. */
>
If the plan is to add to all NET_INC_STATS() calls in TCP an additional tcp_call_bpf()
I will say no.
So far, tcp_call_bpf() has not been used in fast path.
On Tue, Feb 12, 2019 at 11:07 PM Eric Dumazet <[email protected]> wrote:
>
>
>
> On 02/12/2019 03:31 AM, Yafang Shao wrote:
> > SOCK_DEBUG is a very ancient debugging interface, and it's not very useful
> > for debugging.
> > So this patch removes the SOCK_DEBUG() and introduce a new function
> > tcp_stats() to trace this kind of events.
> > Some MIBs are added for these events.
> >
> > Regarding the SO_DEBUG in sock_{s,g}etsockopt, I think it is better to
> > keep as-is, because if we return an errno to tell the application that
> > this optname isn't supported for TCP, it may break the application.
> > The application still can use this option but don't take any effect for
> > TCP.
> >
> > Signed-off-by: Yafang Shao <[email protected]>
> > ---
> > include/uapi/linux/snmp.h | 3 +++
> > net/ipv4/proc.c | 3 +++
> > net/ipv4/tcp_input.c | 26 +++++++++++---------------
> > net/ipv6/tcp_ipv6.c | 2 --
> > 4 files changed, 17 insertions(+), 17 deletions(-)
> >
> > diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
> > index 86dc24a..fd5c09c 100644
> > --- a/include/uapi/linux/snmp.h
> > +++ b/include/uapi/linux/snmp.h
> > @@ -283,6 +283,9 @@ enum
> > LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */
> > LINUX_MIB_TCPZEROWINDOWDROP, /* TCPZeroWindowDrop */
> > LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */
> > + LINUX_MIB_TCPINVALIDACK, /* TCPInvalidAck */
> > + LINUX_MIB_TCPOLDACK, /* TCPOldAck */
> > + LINUX_MIB_TCPPARTIALPACKET, /* TCPPartialPacket */
> > __LINUX_MIB_MAX
> > };
> >
> > diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
> > index c3610b3..1b0320a 100644
> > --- a/net/ipv4/proc.c
> > +++ b/net/ipv4/proc.c
> > @@ -291,6 +291,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
> > SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
> > SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
> > SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
> > + SNMP_MIB_ITEM("TCPInvalidAck", LINUX_MIB_TCPINVALIDACK),
> > + SNMP_MIB_ITEM("TCPOldAck", LINUX_MIB_TCPOLDACK),
> > + SNMP_MIB_ITEM("TCPPartialPacket", LINUX_MIB_TCPPARTIALPACKET),
> > SNMP_MIB_SENTINEL
> > };
> >
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 7a027dec..88deb1f 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -3554,6 +3554,11 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
> > return delivered;
> > }
> >
> > +static void tcp_stats(struct sock *sk, int mib_idx)
> > +{
> > + NET_INC_STATS(sock_net(sk), mib_idx);
> > +}
>
> This is not a very descriptive name.
>
> Why is it static, and in net/ipv4/tcp_input.c ???
>
Because it is only called in net/ipv4/tcp_input.c currently, so I
define it as static in this file,
the reason I don't define it as 'static inline' is that I think the
compiler can make a better decision than me.
In the future it may be called in other files, then we can put it into
a more proper file.
> > +
> > /* This routine deals with incoming acks, but not outgoing ones. */
> > static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
> > {
> > @@ -3715,7 +3720,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
> > return 1;
> >
> > invalid_ack:
> > - SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
> > + tcp_stats(sk, LINUX_MIB_TCPINVALIDACK);
> > return -1;
> >
> > old_ack:
> > @@ -3731,7 +3736,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
> > tcp_xmit_recovery(sk, rexmit);
> > }
> >
> > - SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
> > + tcp_stats(sk, LINUX_MIB_TCPOLDACK);
> > return 0;
> > }
> >
>
>
> These counters will add noise to an already crowded MIB space.
>
I have another idea that we can define some tcp bpf events to replace
these MIB counters, something like,
#define BPF_EVENT_TCP_OLDACK 1
#define BPF_EVENT_TCP_PARTIALPACKET 2
...
Maybe we could also cleanup some MIBs to make it less crowded.
> What bug do you expect to track and fix with these ?
>
Let me explain the background for you.
I want to track some TCP abnormal behavior in TCP/IP stack. But I
find there's no good way to do it.
The current MIBs are per net, rather than per socket, which makes it not
very powerful.
And the ancient SOCK_DEBUG is not good as well.
So we think why not cleanup this ancient SOCK_DEBUG() and introduce a
more powerful method.
> I see many TCP patches coming adding icache pressure, enabling companies to build their own modified
> TCP stack, but no real meat.
>
Thanks
Yafang
On Tue, Feb 12, 2019 at 11:11 PM Eric Dumazet <[email protected]> wrote:
>
>
>
> On 02/12/2019 03:31 AM, Yafang Shao wrote:
> > Introuce this new op BPF_SOCK_OPS_STATS_CB for tcp_stats() such that it
> > can be traced via BPF on a per socket basis.
> > There's one argument in BPF_SOCK_OPS_STATS_CB, which is Linux MIB index
> > LINUX_MIB_* to indicate the TCP event.
> > All these Linux MIBs are defined in include/uapi/linux/snmp.h.
> >
> > Signed-off-by: Yafang Shao <[email protected]>
> > ---
> > include/uapi/linux/bpf.h | 5 +++++
> > net/ipv4/tcp_input.c | 1 +
> > 2 files changed, 6 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 1777fa0..0314ddd 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2894,6 +2894,11 @@ enum {
> > BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
> > * socket transition to LISTEN state.
> > */
> > + BPF_SOCK_OPS_STATS_CB, /*
> > + * Called on tcp_stats().
> > + * Arg1: Linux MIB index
> > + * LINUX_MIB_*
> > + */
> > };
> >
> > /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 88deb1f..4acf458 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -3557,6 +3557,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
> > static void tcp_stats(struct sock *sk, int mib_idx)
> > {
> > NET_INC_STATS(sock_net(sk), mib_idx);
> > + tcp_call_bpf(sk, BPF_SOCK_OPS_STATS_CB, 1, &mib_idx);
> > }
> >
> > /* This routine deals with incoming acks, but not outgoing ones. */
> >
>
> If the plan is to add to all NET_INC_STATS() calls in TCP an additional tcp_call_bpf()
> I will say no.
>
I have no plan to place it in fast path.
Because what I am concerned with is the TCP abnormal events, which should
not be in the fast path.
However, if we want to place it in the fast path, the code should be as below,
if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATS_CB_FLAG))
tcp_call_bpf(sk, BPF_SOCK_OPS_STATS_CB, 1, &mib_idx);
> So far, tcp_call_bpf() has not been used in fast path.
>
That's why I do it like this.
Thanks
Yafang
On Tue, Feb 12, 2019 at 6:07 PM Yafang Shao <[email protected]> wrote:
>
> Let me explain the background for you.
> I want to track some TCP abnormal behavior in TCP/IP stack. But I
> find there's no good way to do it.
> The current MIBs are per net, other than per socket, that makes it not
> very powerful.
> And the ancient SOCK_DEBUG is not good as well.
> So we think why not cleanup this ancient SOCK_DEBUG() and introduce a
> more powerful method.
I am all for it, but this more powerful method does nothing at all in
the current patches.
I can not accept patches just because they seem to be harmless,
knowing that the next patches
will be pushed later changing more stuff, just because the new
infrastructure is there "and can be used"
Just remove all SOCK_DEBUG() calls, there are leftovers of very ancient times.
Do not add more debugging stuff unless you can demonstrate
they actually allowed you to find a real bug and that you sent a
public fix for it.
Just adding "cool stuff" in TCP stack does not please me, it is only
more complexity for unproven gain.
Otherwise, I am tempted to think that these BPF hooks are there only
so that a company can more
easily build a private variant of TCP, yet letting the community
maintaining the hard part of TCP stack.
Thank you.
On Wed, Feb 13, 2019 at 10:15 AM Eric Dumazet <[email protected]> wrote:
>
> On Tue, Feb 12, 2019 at 6:07 PM Yafang Shao <[email protected]> wrote:
> >
>
> > Let me explain the background for you.
> > I want to track some TCP abnormal behavior in TCP/IP stack. But I
> > find there's no good way to do it.
> > The current MIBs are per net, other than per socket, that makes it not
> > very powerful.
> > And the ancient SOCK_DEBUG is not good as well.
> > So we think why not cleanup this ancient SOCK_DEBUG() and introduce a
> > more powerful method.
>
>
> I am all for it, but this more powerful method does nothing at all in
> the current patches.
>
> I can not accept patches just because they seem to be harmless,
> knowing that the next patches
> will be pushed later changing more stuff, just because the new
> infrastructure is there "and can be used"
>
> Just remove all SOCK_DEBUG() calls, there are leftovers of very ancient times.
>
OK. I will send a patch for it.
> Do not add more debugging stuff unless you can demonstrate
> they actually allowed you to find a real bug and that you sent a
> public fix for it.
>
Sure.
> Just adding "cool stuff" in TCP stack does not please me, it is only
> more complexity for unproven gain.
>
> Otherwise, I am tempted to think that these BPF hooks are there only
> so that a company can more
> easily build a private variant of TCP, yet letting the community
> maintaining the hard part of TCP stack.
:-)
>
> Thank you.
Thanks
Yafang
On Tue, Feb 12, 2019 at 6:15 PM Eric Dumazet <[email protected]> wrote:
>
> Do not add more debugging stuff unless you can demonstrate
> they actually allowed you to find a real bug and that you sent a
> public fix for it.
>
> Just adding "cool stuff" in TCP stack does not please me, it is only
> more complexity for unproven gain.
I agree.
I don't see why this debugging of 'abnormal TCP' cannot be done
with kprobes and tracepoints.
Instrumenting every tcp counter increment is overkill.
On Wed, Feb 13, 2019 at 10:49 AM Alexei Starovoitov
<[email protected]> wrote:
>
> On Tue, Feb 12, 2019 at 6:15 PM Eric Dumazet <[email protected]> wrote:
> >
> > Do not add more debugging stuff unless you can demonstrate
> > they actually allowed you to find a real bug and that you sent a
> > public fix for it.
> >
> > Just adding "cool stuff" in TCP stack does not please me, it is only
> > more complexity for unproven gain.
>
> I agree.
> I don't see why this debugging of 'abnormal TCP' cannot be done
> with kprobes and tracepoints.
kprobe is not suitable, because we have to use the line number, which is
troublesome across all kernel versions.
Regarding tracepoints, I don't think it is a good idea to introduce
more and more tcp tracepoints and make them crowded as well.
> Instrumenting every tcp counter increment is overkill.