When this SKB is dropped, we should add the counter sk_drops.
That could help us better track this behavior.
Signed-off-by: Yafang Shao <[email protected]>
---
net/ipv4/tcp_input.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d51fa35..90f83eb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4802,7 +4802,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
else
rb_erase(&skb->rbnode, root);
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
--
1.8.3.1
Currently the collapsed SKB doesn't propagate the GSO information to the
new SKB.
The GSO should be propagated for better tracking, i.e. when this SKB is
dropped we could know how many network segments are dropped.
Signed-off-by: Yafang Shao <[email protected]>
---
net/ipv4/tcp_input.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 90f83eb..af52e4e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4893,6 +4893,8 @@ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
if (!nskb)
break;
+ skb_shinfo(nskb)->gso_size = skb_shinfo(skb)->gso_size;
+ skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
#ifdef CONFIG_TLS_DEVICE
nskb->decrypted = skb->decrypted;
@@ -4906,18 +4908,24 @@ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
- int offset = start - TCP_SKB_CB(skb)->seq;
int size = TCP_SKB_CB(skb)->end_seq - start;
+ int offset = start - TCP_SKB_CB(skb)->seq;
BUG_ON(offset < 0);
if (size > 0) {
- size = min(copy, size);
+ if (copy >= size)
+ skb_shinfo(nskb)->gso_segs +=
+ max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ else
+ size = copy;
+
if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
BUG();
TCP_SKB_CB(nskb)->end_seq += size;
copy -= size;
start += size;
}
+
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
--
1.8.3.1
On Fri, Jul 27, 2018 at 8:02 PM Yafang Shao <[email protected]> wrote:
>
> When this SKB is dropped, we should add the counter sk_drops.
> That could help us better tracking this behavior.
>
> Signed-off-by: Yafang Shao <[email protected]>
> ---
> net/ipv4/tcp_input.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index d51fa35..90f83eb 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4802,7 +4802,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
> else
> rb_erase(&skb->rbnode, root);
>
> - __kfree_skb(skb);
> + tcp_drop(sk, skb);
Absolutely not.
We do not drop the packet, we have simply lowered the memory overhead.
On Fri, Jul 27, 2018 at 8:02 PM Yafang Shao <[email protected]> wrote:
>
> Currently the collapsed SKB doesn't propagate the GSO information to the
> new SKB.
> The GSO should be propagated for better tracking, i.e. when this SKB is
> dropped we could know how many network segments are dropped.
What is "the GSO" ?
>
> Signed-off-by: Yafang Shao <[email protected]>
> ---
> net/ipv4/tcp_input.c | 12 ++++++++++--
> 1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 90f83eb..af52e4e 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4893,6 +4893,8 @@ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
> if (!nskb)
> break;
>
> + skb_shinfo(nskb)->gso_size = skb_shinfo(skb)->gso_size;
> + skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type;
Why gso_size and gso_type are important ?
Where later in the stack these values are used ?
> memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
> #ifdef CONFIG_TLS_DEVICE
> nskb->decrypted = skb->decrypted;
> @@ -4906,18 +4908,24 @@ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
>
> /* Copy data, releasing collapsed skbs. */
> while (copy > 0) {
> - int offset = start - TCP_SKB_CB(skb)->seq;
> int size = TCP_SKB_CB(skb)->end_seq - start;
> + int offset = start - TCP_SKB_CB(skb)->seq;
>
> BUG_ON(offset < 0);
> if (size > 0) {
> - size = min(copy, size);
> + if (copy >= size)
> + skb_shinfo(nskb)->gso_segs +=
> + max_t(u16, 1, skb_shinfo(skb)->gso_segs);
> + else
> + size = copy;
> +
So... what happens if copy was partial ?
Your patch does not really fix the uncertainty, it merely shifts it a bit.
On Sat, Jul 28, 2018 at 11:13 AM, Eric Dumazet <[email protected]> wrote:
> On Fri, Jul 27, 2018 at 8:02 PM Yafang Shao <[email protected]> wrote:
>>
>> Currently the collapsed SKB doesn't propagate the GSO information to the
>> new SKB.
>> The GSO should be propagated for better tracking, i.e. when this SKB is
>> dropped we could know how many network segments are dropped.
>
> What is "the GSO" ?
>
I mean gso_segs, gso_type and gso_size, which are all set in GRO.
>>
>> Signed-off-by: Yafang Shao <[email protected]>
>> ---
>> net/ipv4/tcp_input.c | 12 ++++++++++--
>> 1 file changed, 10 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index 90f83eb..af52e4e 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -4893,6 +4893,8 @@ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
>> if (!nskb)
>> break;
>>
>> + skb_shinfo(nskb)->gso_size = skb_shinfo(skb)->gso_size;
>> + skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type;
>
> Why gso_size and gso_type are important ?
>
> Where later in the stack these values are used ?
>
I'm not sure whether it is important or not.
I just worry it may be used later.
>> memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
>> #ifdef CONFIG_TLS_DEVICE
>> nskb->decrypted = skb->decrypted;
>> @@ -4906,18 +4908,24 @@ void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
>>
>> /* Copy data, releasing collapsed skbs. */
>> while (copy > 0) {
>> - int offset = start - TCP_SKB_CB(skb)->seq;
>> int size = TCP_SKB_CB(skb)->end_seq - start;
>> + int offset = start - TCP_SKB_CB(skb)->seq;
>
>>
>> BUG_ON(offset < 0);
>> if (size > 0) {
>> - size = min(copy, size);
>> + if (copy >= size)
>> + skb_shinfo(nskb)->gso_segs +=
>> + max_t(u16, 1, skb_shinfo(skb)->gso_segs);
>> + else
>> + size = copy;
>> +
>
> So... what happens if copy was partial ?
>
In the current patch, if copy was partial, the gso_segs are in the
original SKB as it will not be freed now.
If that is not ok, what about the below change ?
else {
size = copy;
skb_shinfo(nskb)->gso_segs += DIV_ROUND_UP(size,
skb_shinfo(nskb)->gso_size);
}
> Your patch does not really fix the uncertainty, it merely shifts it a bit.
On Sat, Jul 28, 2018 at 11:06 AM, Eric Dumazet <[email protected]> wrote:
> On Fri, Jul 27, 2018 at 8:02 PM Yafang Shao <[email protected]> wrote:
>>
>> When this SKB is dropped, we should add the counter sk_drops.
>> That could help us better tracking this behavior.
>>
>> Signed-off-by: Yafang Shao <[email protected]>
>> ---
>> net/ipv4/tcp_input.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index d51fa35..90f83eb 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -4802,7 +4802,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
>> else
>> rb_erase(&skb->rbnode, root);
>>
>> - __kfree_skb(skb);
>> + tcp_drop(sk, skb);
>
>
> Absolutely not.
>
> We do not drop the packet, we have simply lowered the memory overhead.
So what about LINUX_MIB_TCPOFOMERGE ?
Regarding LINUX_MIB_TCPOFOMERGE, a skb is already covered by another
skb, is that dropping the packet or simply lowering the memory
overhead ?
Thanks
Yafang
On Fri, Jul 27, 2018 at 8:35 PM Yafang Shao <[email protected]> wrote:
> So what about LINUX_MIB_TCPOFOMERGE ?
> Regarding LINUX_MIB_TCPOFOMERGE, a skb is already covered by another
> skb, is that dropping the packet or simply lowering the memory
> overhead ?
What do you think ?
If you receive two times the same payload, don't you have to drop one
of the duplicate ?
There is a a big difference between the two cases.
On Sat, Jul 28, 2018 at 11:38 AM, Eric Dumazet <[email protected]> wrote:
> On Fri, Jul 27, 2018 at 8:35 PM Yafang Shao <[email protected]> wrote:
>
>> So what about LINUX_MIB_TCPOFOMERGE ?
>> Regarding LINUX_MIB_TCPOFOMERGE, a skb is already covered by another
>> skb, is that dropping the packet or simply lowering the memory
>> overhead ?
>
> What do you think ?
>
> If you receive two times the same payload, don't you have to drop one
> of the duplicate ?
>
> There is a a big difference between the two cases.
If the drop caused some data loss (which may then cause retransmission
or something), then this is a real DROP.
While if the drop won't cause any data loss, meaning it is a
non-harmful behavior, I think it should not be defined as a DROP.
This is my suggestion anyway.
Thanks
Yafang
On Sat, Jul 28, 2018 at 12:43 AM Yafang Shao <[email protected]> wrote:
>
> On Sat, Jul 28, 2018 at 11:38 AM, Eric Dumazet <[email protected]> wrote:
> > On Fri, Jul 27, 2018 at 8:35 PM Yafang Shao <[email protected]> wrote:
> >
> >> So what about LINUX_MIB_TCPOFOMERGE ?
> >> Regarding LINUX_MIB_TCPOFOMERGE, a skb is already covered by another
> >> skb, is that dropping the packet or simply lowering the memory
> >> overhead ?
> >
> > What do you think ?
> >
> > If you receive two times the same payload, don't you have to drop one
> > of the duplicate ?
> >
> > There is a a big difference between the two cases.
>
> If the drop caused some data lost (which may then cause retransmition
> or something), then this is a really DROP.
> While if the drop won't cause any data lost, meaning it is a
> non-harmful behavior, I think it should not be defined as DROP.
> This is my suggestion anyway.
Sigh.
We count drops, not because they are ' bad or something went wrong'.
If TCP stack receives twice the same sequence (same payload), we
_drop_ one of the duplicate, so we account for this event.
When ' collapsing' we reorganize our own storage, not because we have
to drop a payload,
but for some memory pressure reason.
We have specific SNMP counters to account for these, we do not want to
pretend a packet was ' dropped' since it was not.
If we have to _drop_ some packets, it is called Pruning, and we do
properly account for these drops.
On Sun, Jul 29, 2018 at 12:28 AM, Eric Dumazet <[email protected]> wrote:
> On Sat, Jul 28, 2018 at 12:43 AM Yafang Shao <[email protected]> wrote:
>>
>> On Sat, Jul 28, 2018 at 11:38 AM, Eric Dumazet <[email protected]> wrote:
>> > On Fri, Jul 27, 2018 at 8:35 PM Yafang Shao <[email protected]> wrote:
>> >
>> >> So what about LINUX_MIB_TCPOFOMERGE ?
>> >> Regarding LINUX_MIB_TCPOFOMERGE, a skb is already covered by another
>> >> skb, is that dropping the packet or simply lowering the memory
>> >> overhead ?
>> >
>> > What do you think ?
>> >
>> > If you receive two times the same payload, don't you have to drop one
>> > of the duplicate ?
>> >
>> > There is a a big difference between the two cases.
>>
>> If the drop caused some data lost (which may then cause retransmition
>> or something), then this is a really DROP.
>> While if the drop won't cause any data lost, meaning it is a
>> non-harmful behavior, I think it should not be defined as DROP.
>> This is my suggestion anyway.
>
> Sigh.
>
> We count drops, not because they are ' bad or something went wrong'.
>
> If TCP stack receives twice the same sequence (same payload), we
> _drop_ one of the duplicate, so we account for this event.
>
> When ' collapsing' we reorganize our own storage, not because we have
> to drop a payload,
> but for some memory pressure reason.
Thanks for your clarification.
So what about LINUX_MIB_TCPOFODROP ?
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
tcp_drop(sk, skb);
return;
}
It is also because of our own memory pressure, but we call tcp_drop() here.
I do not mean to disagree with you. I am just confused and want to
make it clear.
> We have specific SNMP counters to account for these, we do not want to
> pretend a packet was ' dropped' since it was not.
>
> If we have to _drop_ some packets, it is called Pruning, and we do
> properly account for these drops.
Agreed.
Thanks
Yafang
On Sun, Jul 29, 2018 at 7:06 PM Yafang Shao <[email protected]> wrote:
>
> On Sun, Jul 29, 2018 at 12:28 AM, Eric Dumazet <[email protected]> wrote:
> > On Sat, Jul 28, 2018 at 12:43 AM Yafang Shao <[email protected]> wrote:
> >>
> >> On Sat, Jul 28, 2018 at 11:38 AM, Eric Dumazet <[email protected]> wrote:
> >> > On Fri, Jul 27, 2018 at 8:35 PM Yafang Shao <[email protected]> wrote:
> >> >
> >> >> So what about LINUX_MIB_TCPOFOMERGE ?
> >> >> Regarding LINUX_MIB_TCPOFOMERGE, a skb is already covered by another
> >> >> skb, is that dropping the packet or simply lowering the memory
> >> >> overhead ?
> >> >
> >> > What do you think ?
> >> >
> >> > If you receive two times the same payload, don't you have to drop one
> >> > of the duplicate ?
> >> >
> >> > There is a a big difference between the two cases.
> >>
> >> If the drop caused some data lost (which may then cause retransmition
> >> or something), then this is a really DROP.
> >> While if the drop won't cause any data lost, meaning it is a
> >> non-harmful behavior, I think it should not be defined as DROP.
> >> This is my suggestion anyway.
> >
> > Sigh.
> >
> > We count drops, not because they are ' bad or something went wrong'.
> >
> > If TCP stack receives twice the same sequence (same payload), we
> > _drop_ one of the duplicate, so we account for this event.
> >
> > When ' collapsing' we reorganize our own storage, not because we have
> > to drop a payload,
> > but for some memory pressure reason.
>
> Thanks for you clarification.
> So what about LINUX_MIB_TCPOFODROP ?
>
> if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
> NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
> tcp_drop(sk, skb);
> return;
> }
>
>
> It is also because of our own memory pressure, but we call tcp_drop() here.
Yes, we _drop_ a packet.
That is pretty clear that the payload is dropped, and that the sender
will have to _retransmit_.
>
> I am not mean to disagree with you. I am just confused and want to
> make it clear.
Collapsing is :
For (a bunch of packets)
Try (to compress them in order to reduce memory overhead)
No drop of payload happens here. Sender won't have to retransmit.
On Mon, Jul 30, 2018 at 10:27 AM, Eric Dumazet <[email protected]> wrote:
> On Sun, Jul 29, 2018 at 7:06 PM Yafang Shao <[email protected]> wrote:
>>
>> On Sun, Jul 29, 2018 at 12:28 AM, Eric Dumazet <[email protected]> wrote:
>> > On Sat, Jul 28, 2018 at 12:43 AM Yafang Shao <[email protected]> wrote:
>> >>
>> >> On Sat, Jul 28, 2018 at 11:38 AM, Eric Dumazet <[email protected]> wrote:
>> >> > On Fri, Jul 27, 2018 at 8:35 PM Yafang Shao <[email protected]> wrote:
>> >> >
>> >> >> So what about LINUX_MIB_TCPOFOMERGE ?
>> >> >> Regarding LINUX_MIB_TCPOFOMERGE, a skb is already covered by another
>> >> >> skb, is that dropping the packet or simply lowering the memory
>> >> >> overhead ?
>> >> >
>> >> > What do you think ?
>> >> >
>> >> > If you receive two times the same payload, don't you have to drop one
>> >> > of the duplicate ?
>> >> >
>> >> > There is a a big difference between the two cases.
>> >>
>> >> If the drop caused some data lost (which may then cause retransmition
>> >> or something), then this is a really DROP.
>> >> While if the drop won't cause any data lost, meaning it is a
>> >> non-harmful behavior, I think it should not be defined as DROP.
>> >> This is my suggestion anyway.
>> >
>> > Sigh.
>> >
>> > We count drops, not because they are ' bad or something went wrong'.
>> >
>> > If TCP stack receives twice the same sequence (same payload), we
>> > _drop_ one of the duplicate, so we account for this event.
>> >
>> > When ' collapsing' we reorganize our own storage, not because we have
>> > to drop a payload,
>> > but for some memory pressure reason.
>>
>> Thanks for you clarification.
>> So what about LINUX_MIB_TCPOFODROP ?
>>
>> if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
>> NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
>> tcp_drop(sk, skb);
>> return;
>> }
>>
>>
>> It is also because of our own memory pressure, but we call tcp_drop() here.
>
> Yes, we _drop_ a packet.
>
> That is pretty clear that the payload is dropped, and that the sender
> will have to _retransmit_.
>
>>
>> I am not mean to disagree with you. I am just confused and want to
>> make it clear.
>
>
> Collapsing is :
>
> For (a bunch of packets)
> Try (to compress them in order to reduce memory overhead)
>
> No drop of payload happens here. Sender wont have to retransmit.
OK.
Thanks for your patience.
Should we put NET_INC_STATS(sock_net(sk), mib_idx) into the function
tcp_drop() ?
Then we could easily relate the sk_drops with the SNMP counters.
Something like that,
static void tcp_drop(struct sock *sk, struct sk_buff *skb, int mib_idx)
{
int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
atomic_add(segs, &sk->sk_drops);
NET_ADD_STATS(sock_net(sk), mib_idx, segs);
__kfree_skb(skb);
}
Thanks
Yafang
On Sun, Jul 29, 2018 at 10:40 PM Yafang Shao <[email protected]> wrote:
> Should we put NET_INC_STATS(sock_net(sk), mib_idx) into the funtion
> tcp_drop() ?
> Then we could easily relate the sk_drops with the SNMP counters.
>
> Something like that,
>
> static void tcp_drop(struct sock *sk, struct sk_buff *skb, int mib_idx)
> {
> int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
>
> atomic_add(segs, &sk->sk_drops);
> NET_ADD_STATS(sock_net(sk), mib_idx, segs);
> __kfree_skb(skb);
> }
We had a discussion during netconf, and Brendan Gregg was working on
an idea like that,
so that distinct events could be traced/reported.
I prefer letting Brendan submit his patch, which not only refactors
things, but add new functionality.
Thanks.
On Mon, Jul 30, 2018 at 11:56 PM, Eric Dumazet <[email protected]> wrote:
> On Sun, Jul 29, 2018 at 10:40 PM Yafang Shao <[email protected]> wrote:
>
>> Should we put NET_INC_STATS(sock_net(sk), mib_idx) into the funtion
>> tcp_drop() ?
>> Then we could easily relate the sk_drops with the SNMP counters.
>>
>> Something like that,
>>
>> static void tcp_drop(struct sock *sk, struct sk_buff *skb, int mib_idx)
>> {
>> int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
>>
>> atomic_add(segs, &sk->sk_drops);
>> NET_ADD_STATS(sock_net(sk), mib_idx, segs);
>> __kfree_skb(skb);
>> }
>
> We had a discussion during netconf, and Brendan Gregg was working on
> an idea like that,
> so that distinct events could be traced/reported.
>
Oh yes, introducing a new tracepoint for it should be better.
trace_tcp_probe(sk, skb, mib_idx);
> I prefer letting Brendan submit his patch, which not only refactors
> things, but add new functionality.
>
OK.