Unlike skb_realloc_headroom, new helper skb_expand_head
does not allocate a new skb if possible.
Additionally this patch replaces commonly used dereferencing with variables.
Signed-off-by: Vasily Averin <[email protected]>
---
net/ipv6/ip6_output.c | 27 +++++++++++----------------
1 file changed, 11 insertions(+), 16 deletions(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7d2ec25..f91d13a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -249,6 +249,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst->dev;
+ struct inet6_dev *idev = ip6_dst_idev(dst);
unsigned int head_room;
struct ipv6hdr *hdr;
u8 proto = fl6->flowi6_proto;
@@ -256,22 +258,16 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
int hlimit = -1;
u32 mtu;
- head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
+ head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
if (opt)
head_room += opt->opt_nflen + opt->opt_flen;
- if (unlikely(skb_headroom(skb) < head_room)) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
- if (!skb2) {
- IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
+ if (unlikely(head_room > skb_headroom(skb))) {
+ skb = skb_expand_head(skb, head_room);
+ if (!skb) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
return -ENOBUFS;
}
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
- consume_skb(skb);
- skb = skb2;
}
if (opt) {
@@ -313,8 +309,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
mtu = dst_mtu(dst);
if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
- IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUT, skb->len);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
/* if egress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -327,17 +322,17 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
* we promote our socket to non const
*/
return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
- net, (struct sock *)sk, skb, NULL, dst->dev,
+ net, (struct sock *)sk, skb, NULL, dev,
dst_output);
}
- skb->dev = dst->dev;
+ skb->dev = dev;
/* ipv6_local_error() does not require socket lock,
* we promote our socket to non const
*/
ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
- IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
return -EMSGSIZE;
}
--
1.8.3.1
(resend without html - thanks gmail web-interface...)
On Fri, Aug 20, 2021 at 3:41 PM Christoph Paasch
<[email protected]> wrote:
>
> Hello,
>
> On Fri, Aug 6, 2021 at 1:18 AM Vasily Averin <[email protected]> wrote:
> >
> > Unlike skb_realloc_headroom, new helper skb_expand_head
> > does not allocate a new skb if possible.
> >
> > Additionally this patch replaces commonly used dereferencing with variables.
> >
> > Signed-off-by: Vasily Averin <[email protected]>
> > ---
> > net/ipv6/ip6_output.c | 27 +++++++++++----------------
> > 1 file changed, 11 insertions(+), 16 deletions(-)
> >
> > diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> > index 7d2ec25..f91d13a 100644
> > --- a/net/ipv6/ip6_output.c
> > +++ b/net/ipv6/ip6_output.c
> > @@ -249,6 +249,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
> > const struct ipv6_pinfo *np = inet6_sk(sk);
> > struct in6_addr *first_hop = &fl6->daddr;
> > struct dst_entry *dst = skb_dst(skb);
> > + struct net_device *dev = dst->dev;
> > + struct inet6_dev *idev = ip6_dst_idev(dst);
> > unsigned int head_room;
> > struct ipv6hdr *hdr;
> > u8 proto = fl6->flowi6_proto;
> > @@ -256,22 +258,16 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
> > int hlimit = -1;
> > u32 mtu;
> >
> > - head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
> > + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
> > if (opt)
> > head_room += opt->opt_nflen + opt->opt_flen;
> >
> > - if (unlikely(skb_headroom(skb) < head_room)) {
> > - struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
> > - if (!skb2) {
> > - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
> > - IPSTATS_MIB_OUTDISCARDS);
> > - kfree_skb(skb);
> > + if (unlikely(head_room > skb_headroom(skb))) {
> > + skb = skb_expand_head(skb, head_room);
> > + if (!skb) {
> > + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
>
>
> this change introduces a panic on my syzkaller instance:
>
> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 1832 at net/core/skbuff.c:5412 skb_try_coalesce+0x1019/0x12c0 net/core/skbuff.c:5412
> Modules linked in:
> CPU: 0 PID: 1832 Comm: syz-executor.0 Not tainted 5.14.0-rc4ab492b0cda378661ae004e2fd66cfd1be474438d #102
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
> RIP: 0010:skb_try_coalesce+0x1019/0x12c0 net/core/skbuff.c:5412
> Code: 24 20 bf 01 00 00 00 8b 40 20 44 0f b7 f0 44 89 f6 e8 ab 41 c0 fe 41 83 ee 01 0f 85 01 f3 ff ff e9 42 f6 ff ff e8 07 3c c0 fe <0f> 0b e9 7b f9 ff ff e8 fb 3b c0 fe 48 8b 44 24 40 48 8d 70 ff 4c
> RSP: 0018:ffffc90002d97530 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000e00 RCX: 0000000000000000
> RDX: ffff88810a27bc00 RSI: ffffffff8276b6c9 RDI: 0000000000000003
> RBP: ffff88810a17f9e0 R08: 0000000000000e00 R09: 0000000000000000
> R10: ffffffff8276b042 R11: 0000000000000000 R12: ffff88810a17f760
> R13: ffff888108fc6ac0 R14: 0000000000001000 R15: ffff88810a17f7d6
> FS: 00007f6be8546700(0000) GS:ffff88811b400000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000000 CR3: 000000010a4f0005 CR4: 0000000000170ef0
> Call Trace:
> tcp_try_coalesce net/ipv4/tcp_input.c:4642 [inline]
> tcp_try_coalesce+0x312/0x870 net/ipv4/tcp_input.c:4621
> tcp_queue_rcv+0x73/0x670 net/ipv4/tcp_input.c:4905
> tcp_data_queue+0x11e5/0x4af0 net/ipv4/tcp_input.c:5016
> tcp_rcv_established+0x83a/0x1d30 net/ipv4/tcp_input.c:5928
> tcp_v6_do_rcv+0x438/0x1380 net/ipv6/tcp_ipv6.c:1517
> sk_backlog_rcv include/net/sock.h:1024 [inline]
> __release_sock+0x1ad/0x310 net/core/sock.c:2669
> release_sock+0x54/0x1a0 net/core/sock.c:3193
> tcp_sendmsg+0x36/0x40 net/ipv4/tcp.c:1462
> inet6_sendmsg+0xb5/0x140 net/ipv6/af_inet6.c:646
> sock_sendmsg_nosec net/socket.c:704 [inline]
> sock_sendmsg net/socket.c:724 [inline]
> ____sys_sendmsg+0x3b5/0x970 net/socket.c:2403
> ___sys_sendmsg+0xff/0x170 net/socket.c:2457
> __sys_sendmmsg+0x192/0x440 net/socket.c:2543
> __do_sys_sendmmsg net/socket.c:2572 [inline]
> __se_sys_sendmmsg net/socket.c:2569 [inline]
> __x64_sys_sendmmsg+0x98/0x100 net/socket.c:2569
> do_syscall_x64 arch/x86/entry/common.c:50 [inline]
> do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80
> entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x7f6be7e55469
> Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ff 49 2b 00 f7 d8 64 89 01 48
> RSP: 002b:00007f6be8545da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
> RAX: ffffffffffffffda RBX: 0000000000000133 RCX: 00007f6be7e55469
> RDX: 0000000000000003 RSI: 00000000200008c0 RDI: 0000000000000003
> RBP: 0000000000000133 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000040044040 R11: 0000000000000246 R12: 000000000069bf8c
> R13: 00007ffe013f968f R14: 00007f6be8526000 R15: 0000000000000003
> ---[ end trace 60453d9d261151ca ]---
>
> (syzkaller-reproducer at the end of this email)
>
> AFAICS, this is because pskb_expand_head (called from skb_expand_head) is not adjusting skb->truesize when skb->sk is set (which I guess is the case in this particular scenario). I'm not sure what the proper fix would be though...
>
>
> Reproducer:
>
> # {Threaded:true Collide:true Repeat:true RepeatTimes:0 Procs:1 Slowdown:1 Sandbox:none Fault:false FaultCall:-1 FaultNth:0 Leak:false NetInjection:true NetDevices:true NetReset:true Cgroups:true BinfmtMisc:true CloseFDs:true KCSAN:false DevlinkPCI:false USB:false VhciInjection:false Wifi:false IEEE802154:false Sysctl:true UseTmpDir:true HandleSegv:true Repro:false Trace:false}
> r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
> bind$inet6(r0, &(0x7f0000000080)={0xa, 0x4e22, 0x0, @loopback}, 0x1c)
> sendmmsg$inet6(r0, &(0x7f0000002940)=[{{&(0x7f00000000c0)={0xa, 0x4e22, 0x0, @empty}, 0x1c, 0x0}}], 0x36, 0x20000145)
> r1 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
> r2 = socket$inet_tcp(0x2, 0x1, 0x0)
> ioctl$sock_SIOCGIFINDEX(r2, 0x8933, 0x0)
> setsockopt$inet6_mreq(r1, 0x29, 0x1b, 0x0, 0x0)
> sendmmsg$inet6(r0, &(0x7f00000008c0)=[{{0x0, 0x0, &(0x7f0000000240)=[{&(0x7f0000000040)="11c2e854", 0x4}, {0x0}], 0x2}}, {{0x0, 0x0, &(0x7f0000000580)=[{0x0}, {&(0x7f0000001800)="bcfad31884cb1c6226004dd6ed929a7fa308da79249cdfcf5447732df714b21f9fa725d49453be964002a469aff676404855809cf7d7f8fad8ff26c30a9aa692c5ba1c3bf622d795da6efd5425e52f43e8a01e98fec8c4079d0f711b7b02666cc4046eed001f62377a14142ee1004708bc6b57ed028e4a0f8af459fa89ec7cd3098c32cbe69625cd654a3aca8814c9426a817da1b631828f40c0ba2227e8030aabba4eeaeea5617df6fbc54918905137032bb60cb86a7ebd4b8b1f8428085bfc5749a3a986dfca6f0e40b2cc32121cdd53ab4815cdf32dca5d1ee64d0e11894d07ad78f5ee5e7701f4803c49d6fdf927a0c103cf5ba9293e232ebd5661fb950f1cacef864673b79c7a180010889f0a8c9d69a97e3f6a88a9180fcc61631ffa42332f68a8ae78982b5e232decca2a4259a05e96c7fae309468a364798b7343354bdddfe1d81c3393556ced79b92f7c8f9455c1e4deb7cac7d81fdc3d72f201b711a253c2bf4df9f9bcb0ebe51edd3bb62a9440c0c88dc0b7ed6aaa2fdadd868845865cf6c40ba21222123894323fdd0ae8f8fc896a1c4b77431655d77750578db7e6d01380a7b5f5326cd8dd0091ac0a5526b9d1e9318dbfd2ac7227b1779674167c23fa8c59fcc44ab0c117788aa6a1c2baa0fa314093cfa79474e26b09d305c4b20d8f20150feb861ab750f1dbd7c4081ede2b245c14f7152f46352db5b0e4a3e676d23a76ea6363f97a35b3df2ced972f9daec0a5f6f13d5093250a93b31f4fafb1fa783cd99504ba8ac5fbfb4af9ff108a891e3786964ae5f00bc52496db9cf162b0efaded2459a3c1c1fbc60182b62a430a8eef93c6673db2596256f2b85f41975a31908dfd27eced4b169c99fbe79f59e61c8d3aef4e23f3bb665a0f25a09ce19a15f47734b817b1451f32418b1d9d801c64339b4fa2147e794dec9909569abcabeb9785991e32eb9835da449617c9c050000cf83fc6297e908d3708784a9c0599d3df55055123ef6037c5f8223acbeb0413610af573904069b64057d73a95e76f53939d6a144b411d73b7ae09bddc8c28e2484e03f781cce9df821c727facfd7d158a63f466d183e7be32c921176f574258d252bf2c15f1bcb6498ef252315a360ccd4ce7e5388d66462c51885a04211ed9ab097f66e31f6bb704902b74d6afd96f05b7a5de5eaa9d6ba4c700e7ad2e686b7241cf57358937e1b526f34d8522b761ca596b77e74ebae352ffbc9ee9d98b40ea1b1f7e24b3037d0630cd3ede29d4cd960fe0c17102c06e0a7f993b99f961384698dfd82c813cb862779d68fd8f6828a7c6e0bc8f9e3798ca7c3d5f287f8b63b65f6db8fa706e8a378166486b8c8fadfa06a4f0a21e8d6dc8ed230c8b5cde27a48742700b43ba4dce5e3da897b2bf1fc5832d07ac3482a9834b30c0371f517b9f0ae0706a12b87dcb3555e08df00cc0a83d7060c06edd7b8b8c6199398ee6bea33a4690e8f9455f54a6ef9d4b69d39c55a69d6fed3d7a376ceb51bd032b9b76a08ce48c5a2af8702922163a57835133e9968f8a1e4401479ec8a83a42f1c425840cdcd52483d8432c897ae75e8f68df55e5a5c8c16b4d071e92be22eb23f35d58cb86cc1b0c494e96723f5c50e441b81a7f1fca08faaefec8003fddb967381c542c5ac6c18cc7e4f4e61c8aae4ae90f157ae283a9fe8d02b9f52dc7bbf80e47cc6e3e2e6ee9008e8b82cb7ee672b11be1a3874aa63586fb242378f59251990cd8d944d8e46d84e76cfe9dbffe2562d29ed94f5a314062fc8f496bdb38a426f2ab0525418625727f329ca1cb8cd30ae9013829136c025892c2614d1fa0edf019c8e7f3a9ef50ec0483f3d70daec362b603a89b6a7049310b4776e1bb884f8d44a74b2a3da056e3ba8f3425f7a4628f3f140aa12fcd779ea01e22eff2c670759971ed8e11570813835401484537c0075e897863ae690c420b1c4d90ea4e90d84b220511bbba8e21869610e4ff8d2130b09a40a0cf49715275a01ecc438d6fcdfcb189d602d964a1a39ee375067d82c63dc409fab9715ee86cbbf5cdc9b906b66b40e4d2180cd1d03229bd42b2ea6359d8a127d6156d19cc638e4811b55871c39b370121bc0da9b29d3be162572f672f3f1cb9aac5dba4cb9f57062a27b95f5db96b7ea16beb7df8d5642ca2695aa0dccec99bc1b4efbdbe4ceb7febca4b1846b1486ee11383ca9c802428a9a8ac308bfa106dd8a5c87cceb4e1a700626a7ceb074d5628a6be3a96ff59851f797e375f42c7c452d66232f076f5c0045d06f5ee1e39f48b1b4285e569e574136de2d3412dc6f24e5e7024739f1fc5aa71c41e8eaa88c0094ca7ae5b96a20beaec44223d2df1cf650ef924b58e71f7ddf82b18a7d13d8eccfaea41e17ba7c65923d6dea3a7834113f446eaac407c371dc80841a85a4aa0ef2f4d49a204f626ed1c4921d61c50f6bd938359e8fec4e7d03dd6edb0b29437e3e121b65852d5cda472a99a29de0a56db2aa6aca87a9d6bcf1bbe6783735b67807c42a399b0757390d872a26e73244df25b5a6870b2da91e45e5168f8993cf7ca209df04c49cfb6eed24cfc343864cfb997490ac5b0b39d2e3bfa453d54d36546cd034d542ba38f66fe87cd4f89d08ca059e721272648c550481d952a0a47fd15d8fbe4a23303978b813011deef566317f71d9b18a7cb2283c44be2ed1a05ca9c3daf6c28f782f6c3593536d48253243a8995e8b9a30d83b7733bc166d4791bfd1555dcdf44c297ef726aaa47cbebdeae8a7288e20fae87b92fa44c78158501526afc5a7e0041b48c55881f9031d2ae4f482b8f44ce5297113bbd217d081162b26811b4d08f0fb9d76d5f179e5af344f73d62742bf871920121e26796eb67d059b49f5850475e02723c9e84a5578d5610f1cb3b1055b6333d59df391dbf67f6660f2d3aaa9a601d44cec1bb4199b1130d373a5a53ca81e16c374400976b39967bc7310ba801a225e44da319744a1e7e2b2261dedcbb31dda7105de586baf881f88bc7e0069b42fb3ebae8fa63e39609a1f596ba219da2ef61d69976d3175e47769bc1ba2b3cd064db3ff4d7ed0d87ee7054aa7e111aae93592dd6203e11e87bcaf8c43cd803282d0d25ba9b043ea8cbe26f585643d504d6f4f66848c881c066e9f10057e2c773b1e5abf2e56a4f07d35fc29f7fe98dc5fc26507406ca2901ebfae6e68715f6ca4d73555dea5f6ec77661ab9cd7335276eda1f28606e8dbd9b04fe17f53ffc3bedab5d800860f1a2c8bd4909f3b98cc7e7bda7a7e46deeff86c756e3b7d40067ca35f867bb5456ea61599e95916397f72404b2bac726dd5c1a5042eed92bd2988365405d17dde91c214c12263e976356d6131a2a5269cdfb1aefa6dc60b9f522ec2a619b2cc58baf8fadd52f43892d12d27498023e18390db603a3fe5e2f3e14491c1aed2ddaf47e717decfa6877aeed9fea0c575223cb062ceab83a20e99b49a0581e0fb25ba1eae88d5b229eb5320d888652acae4955c3bbe94de43fa99476c9058250847381ee4d85e079c190fadfa0776318ed00f7019a4010f2eb8777c7499456b8748991d4d4c6590cf903a237a81c0af16c4cec87ac49bc7787c2c05ea56501c9cddc2c4ba884ab26a51639e4b77fa5a2488ab842b8ef6d0fb1d3bea710ad1528a1e78a18e37b4a0ed803ba29d9ddf1ca7e8bd56d1588dbb9e3995493959adaa334acd611ac8d266a3199b2ecb9958366d785946722bc9507ab420d447dfadd0274a8e2a25d290c598d65145956c1c8542d07bc0a15efa0a9daed2f2fe71f60b4b657b34a3e897749a49bd85c2a0d915af6a38e8de62df1fcb4e991d04ce4c43eef97c4b3e9de11d528eb9cb923115efc681d0a2a0a714ad923086961c07335e152ec83b65ddb43b8865984c36de76f19bdb6e96566b741fb1c755d012c5377b196bed875e9a08bb7d4fd3160fddc71b9bb55a7199183529cda7a3624077a77b3a2b0367845d23e42dbc4130dcef4cc855556f833b94794b1a4d28d7503113ed4f8ec7b3578b418640d49b01e6a75b456944377a8d892eab38037705e010e32cf7074784dd42f42a92d18c0c26eb4c23a12c1697215aaa41ca92dd59f9816168bc7c9275c256cc689c03809f40125a1c17063a7696ab30dc32d8dcddf8e4e2983a883f11d64b485310c71e5533399d928da94b4ce9a8178517477345bc26b6ba838dfc0b0d9f8542f93183f68e4660e17a8bc92dea290bb718f1d0e89e385ad8ac2f3cc9ad8dfa08e62501b3bd6597574f98bfa39b901daa1e8c56dcddf87d49cf4b1345ae352b1aa72ca62fdeb39abcad462c4045fa1daaba17047d1b91bec23145d4090a7eca6f1510d88beab5a2625fdf773f16293b77eec703bdc13504550f8d74546b89ac056419fb12ded672a2cd4efee157689a704eb8511fab3c12d29e025184b6f329fe4b6226f187b0c009e1b5cfc249791a6c96bee7766276f7e90163f0c11f6c1d04a32e3128d1e3eb95e167f0bdf4b927b775d7fe01aa2174d43a8fb7b8ae4b5c1e6c7e7b7a6bc7f19a70f7ef107a6406e69cb60047aee0d3d0ed4b1a6605bc1eba9ea2664edf145d2422dcc47e26c6f000071a15cbd62c16983bf8a5fef08aeea48d6a477654aacdada917efee28a79897a7b9280025a370d2ccbadd639c3314a7e3cc12298da71e2ca40833f1b88d44e6693049baee1b0789a619815aee3707703963e296be3171854411f874a01f1c6c9f9bb6e4932e13b791280aed0cca54c3dd77033a8c3836a0306e13dabfdfa96ddcb9fa1de806419c1eaac65d7b601417b011ce50fd6257ae97f883faacfc2ce7a5fec477793f2bfa24f51d542f7d5d4828e7efc455af4f4d4e60f72c6598b72751dc058169e5d13b14b91b5849463f688c10d9d4a8f566ee18620a4541dce68411d3746b8b4ef4f891c87cbc536e8e99b828c23732fb1484fd03e23bc0affde95d7c0c6fc0407d1990b55296c4e32f8c0387b965ee20482d9e666f3caa2167218d6a48305b37430835e839b62960fab96ec61e5b49c345fe8c07f7b71ba3b82bef5701de404461a1aaa01331913dc03e27ed36c7f14f0c82057f477b73e9ee20621a5c70df091a2da470db9602aa17f281a15f644a2a85534040dba67d2a6473503453e133098205fb53a3c20fdafa508a82e9c8d172b2eed24afb1c47890ace2eb48ee9e2bc2488e47bf18e69676b32087e7abf3d1b918bd9ac24338d5ae54c0f2c95b0f969f44edac2b552b611ae2b415cfed467ec989e7a139a72b0d3c68e20dd307500bbd7f4e079f6c1486bdb31f648c50c1c4b58e6be618ac6257ddb2558af9f24be2ae415e8edc50197dda4a1178ea4de53bbd66e819173574fecaaa5540189538762f472bd562f1d87f64fc39008d4a3f85cd1b3f01508a42a3047976f6f1dcd4c1d942349967dfd633a6e56de55a5e0f2ad68847e7f1e3a08813dc7483db90cf417e7388aab6e4806628e6b9ab980a12b9ca6b345845f043d9b31cfbba9a17517cb0d421e2db467e991c2584f625dd1884126261e3f455c4f44c15380d43dc7e3e8fc69086395d9535f094432115b3d1b643178aadcf0919d85c3faddbf631ddd50c322a3e489310de21ec2c3026721f9301a34acfe9d65326f7f7ad54b6ad6eaa978b739407105d2d4eb869fff3e2de7585a7fed747493bd65537120cac03e3b48458ddbdbc5ba3382b6040d4863f4d3fd783276e01a3a9a5f05067d1d101ec424e6fb179cf9bcad8f5536b63dd63248ee411ce4b79b70fa7e8a619714646ff2fd557", 0x1000}, {0x0}, {0x0}], 0x4}}, {{0x0, 0x0, 0x0}}], 0x3, 0x40044040)
> setsockopt$inet6_IPV6_HOPOPTS(r0, 0x29, 0x36, &(0x7f0000000640)={0x0, 0x11, '\x00', [@calipso={0x7, 0x40, {0x0, 0xe, 0xb, 0x101, [0x5, 0x4ebd, 0x5d, 0x3, 0x80, 0x7, 0x8]}}, @calipso={0x7, 0x10, {0x0, 0x2, 0x6, 0x0, [0x0]}}, @calipso={0x7, 0x28, {0x2, 0x8, 0x9, 0x6, [0x0, 0x80000001, 0x5d53, 0x9]}}, @calipso={0x7, 0x8, {0x2, 0x0, 0x3, 0x5}}]}, 0x90)
>
>
>
> Christoph
>
On 8/21/21 1:44 AM, Christoph Paasch wrote:
> (resend without html - thanks gmail web-interface...)
> On Fri, Aug 20, 2021 at 3:41 PM Christoph Paasch
>> AFAICS, this is because pskb_expand_head (called from
>> skb_expand_head) is not adjusting skb->truesize when skb->sk is set
>> (which I guess is the case in this particular scenario). I'm not
>> sure what the proper fix would be though...
Could you please elaborate?
it seems to me skb_realloc_headroom used before my patch called pskb_expand_head() too
and did not adjusted skb->truesize too. Am I missed something perhaps?
The only difference in my patch is that skb_clone can be not called,
though I do not understand how this can affect skb->truesize.
Thank you,
Vasily Averin
Hello Vasily,
On Fri, Aug 20, 2021 at 11:21 PM Vasily Averin <[email protected]> wrote:
>
> On 8/21/21 1:44 AM, Christoph Paasch wrote:
> > (resend without html - thanks gmail web-interface...)
> > On Fri, Aug 20, 2021 at 3:41 PM Christoph Paasch
> >> AFAICS, this is because pskb_expand_head (called from
> >> skb_expand_head) is not adjusting skb->truesize when skb->sk is set
> >> (which I guess is the case in this particular scenario). I'm not
> >> sure what the proper fix would be though...
>
> Could you please elaborate?
> it seems to me skb_realloc_headroom used before my patch called pskb_expand_head() too
> and did not adjusted skb->truesize too. Am I missed something perhaps?
>
> The only difference in my patch is that skb_clone can be not called,
> though I do not understand how this can affect skb->truesize.
I *believe* that the difference is that after skb_clone() skb->sk is
NULL and thus truesize will be adjusted.
I will try to confirm that with some more debugging.
Christoph
>
> Thank you,
> Vasily Averin
On Sun, Aug 22, 2021 at 10:04 AM Christoph Paasch
<[email protected]> wrote:
>
> Hello Vasily,
>
> On Fri, Aug 20, 2021 at 11:21 PM Vasily Averin <[email protected]> wrote:
> >
> > On 8/21/21 1:44 AM, Christoph Paasch wrote:
> > > (resend without html - thanks gmail web-interface...)
> > > On Fri, Aug 20, 2021 at 3:41 PM Christoph Paasch
> > >> AFAICS, this is because pskb_expand_head (called from
> > >> skb_expand_head) is not adjusting skb->truesize when skb->sk is set
> > >> (which I guess is the case in this particular scenario). I'm not
> > >> sure what the proper fix would be though...
> >
> > Could you please elaborate?
> > it seems to me skb_realloc_headroom used before my patch called pskb_expand_head() too
> > and did not adjusted skb->truesize too. Am I missed something perhaps?
> >
> > The only difference in my patch is that skb_clone can be not called,
> > though I do not understand how this can affect skb->truesize.
>
> I *believe* that the difference is that after skb_clone() skb->sk is
> NULL and thus truesize will be adjusted.
>
> I will try to confirm that with some more debugging.
Yes indeed.
Before your patch:
[ 19.154039] ip6_xmit before realloc truesize 4864 sk? 000000002ccd6868
[ 19.155230] ip6_xmit after realloc truesize 5376 sk? 0000000000000000
skb->sk is not set and thus truesize will be adjusted.
With your change:
[ 15.092933] ip6_xmit before realloc truesize 4864 sk? 00000000072930fd
[ 15.094131] ip6_xmit after realloc truesize 4864 sk? 00000000072930fd
skb->sk is set and thus truesize is not adjusted.
Christoph
>
>
> Christoph
>
> >
> > Thank you,
> > Vasily Averin
On 8/22/21 8:13 PM, Christoph Paasch wrote:
> On Sun, Aug 22, 2021 at 10:04 AM Christoph Paasch
> <[email protected]> wrote:
>>
>> Hello Vasily,
>>
>> On Fri, Aug 20, 2021 at 11:21 PM Vasily Averin <[email protected]> wrote:
>>>
>>> On 8/21/21 1:44 AM, Christoph Paasch wrote:
>>>> (resend without html - thanks gmail web-interface...)
>>>> On Fri, Aug 20, 2021 at 3:41 PM Christoph Paasch
>>>>> AFAICS, this is because pskb_expand_head (called from
>>>>> skb_expand_head) is not adjusting skb->truesize when skb->sk is set
>>>>> (which I guess is the case in this particular scenario). I'm not
>>>>> sure what the proper fix would be though...
>>>
>>> Could you please elaborate?
>>> it seems to me skb_realloc_headroom used before my patch called pskb_expand_head() too
>>> and did not adjusted skb->truesize too. Am I missed something perhaps?
>>>
>>> The only difference in my patch is that skb_clone can be not called,
>>> though I do not understand how this can affect skb->truesize.
>>
>> I *believe* that the difference is that after skb_clone() skb->sk is
>> NULL and thus truesize will be adjusted.
>>
>> I will try to confirm that with some more debugging.
>
> Yes indeed.
>
> Before your patch:
> [ 19.154039] ip6_xmit before realloc truesize 4864 sk? 000000002ccd6868
> [ 19.155230] ip6_xmit after realloc truesize 5376 sk? 0000000000000000
>
> skb->sk is not set and thus truesize will be adjusted.
This looks strange for me. skb should not lost sk reference.
Could you please clarify where exactly you cheked it?
sk on newly allocated skb is set on line 291
net/ipv6/ip6_output.c::ip6_xmit()
282 if (unlikely(skb_headroom(skb) < head_room)) {
283 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
284 if (!skb2) {
285 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
286 IPSTATS_MIB_OUTDISCARDS);
287 kfree_skb(skb);
288 return -ENOBUFS;
289 }
290 if (skb->sk)
291 skb_set_owner_w(skb2, skb->sk); <<<<< here
292 consume_skb(skb);
293 skb = skb2;
294 }
> With your change:
> [ 15.092933] ip6_xmit before realloc truesize 4864 sk? 00000000072930fd
> [ 15.094131] ip6_xmit after realloc truesize 4864 sk? 00000000072930fd
>
> skb->sk is set and thus truesize is not adjusted.
In this case skb_set_owner_w() is called inside skb_expand_head()
net/ipv6/ip6_output.c::ip6_xmit()
265 if (unlikely(head_room > skb_headroom(skb))) {
266 skb = skb_expand_head(skb, head_room);
267 if (!skb) {
268 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
269 return -ENOBUFS;
270 }
271 }
net/core/skbuff.c::skb_expand_head()
1813 if (skb_shared(skb)) {
1814 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1815
1816 if (likely(nskb)) {
1817 if (skb->sk)
1818 skb_set_owner_w(nskb, skb->sk); <<<< here
1819 consume_skb(skb);
1820 } else {
1821 kfree_skb(skb);
1822 }
1823 skb = nskb;
1824 }
So I do not understand how this can happen.
With my patch:
a) if skb is not shared -- it should keep original skb->sk
b) if skb is shared -- new skb should set sk if it was set on original skb.
Your results can be explained if you looked and skb->sk and truesize right after skb_realloc_headroom() call
but before following skb_set_owner_w(). Could you please check it?
Thank you,
Vasily Averin
On 8/23/21 8:44 AM, Vasily Averin wrote:
> On 8/22/21 8:13 PM, Christoph Paasch wrote:
>> On Sun, Aug 22, 2021 at 10:04 AM Christoph Paasch
>> <[email protected]> wrote:
>>>
>>> Hello Vasily,
>>>
>>> On Fri, Aug 20, 2021 at 11:21 PM Vasily Averin <[email protected]> wrote:
>>>>
>>>> On 8/21/21 1:44 AM, Christoph Paasch wrote:
>>>>> (resend without html - thanks gmail web-interface...)
>>>>> On Fri, Aug 20, 2021 at 3:41 PM Christoph Paasch
>>>>>> AFAICS, this is because pskb_expand_head (called from
>>>>>> skb_expand_head) is not adjusting skb->truesize when skb->sk is set
>>>>>> (which I guess is the case in this particular scenario). I'm not
>>>>>> sure what the proper fix would be though...
>>>>
>>>> Could you please elaborate?
>>>> it seems to me skb_realloc_headroom used before my patch called pskb_expand_head() too
>>>> and did not adjusted skb->truesize too. Am I missed something perhaps?
>>>>
>>>> The only difference in my patch is that skb_clone can be not called,
>>>> though I do not understand how this can affect skb->truesize.
>>>
>>> I *believe* that the difference is that after skb_clone() skb->sk is
>>> NULL and thus truesize will be adjusted.
>>>
>>> I will try to confirm that with some more debugging.
>>
>> Yes indeed.
>>
>> Before your patch:
>> [ 19.154039] ip6_xmit before realloc truesize 4864 sk? 000000002ccd6868
>> [ 19.155230] ip6_xmit after realloc truesize 5376 sk? 0000000000000000
>>
>> skb->sk is not set and thus truesize will be adjusted.
>
> This looks strange for me. skb should not lost sk reference.
>
> Could you please clarify where exactly you cheked it?
> sk on newly allocated skb is set on line 291
>
> net/ipv6/ip6_output.c::ip6_xmit()
> 282 if (unlikely(skb_headroom(skb) < head_room)) {
> 283 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
> 284 if (!skb2) {
> 285 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
> 286 IPSTATS_MIB_OUTDISCARDS);
> 287 kfree_skb(skb);
> 288 return -ENOBUFS;
> 289 }
> 290 if (skb->sk)
> 291 skb_set_owner_w(skb2, skb->sk); <<<<< here
> 292 consume_skb(skb);
> 293 skb = skb2;
> 294 }
>
>> With your change:
>> [ 15.092933] ip6_xmit before realloc truesize 4864 sk? 00000000072930fd
>> [ 15.094131] ip6_xmit after realloc truesize 4864 sk? 00000000072930fd
>>
>> skb->sk is set and thus truesize is not adjusted.
>
> In this case skb_set_owner_w() is called inside skb_expand_head()
>
> net/ipv6/ip6_output.c::ip6_xmit()
> 265 if (unlikely(head_room > skb_headroom(skb))) {
> 266 skb = skb_expand_head(skb, head_room);
> 267 if (!skb) {
> 268 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
> 269 return -ENOBUFS;
> 270 }
> 271 }
>
> net/core/skbuff.c::skb_expand_head()
> 1813 if (skb_shared(skb)) {
> 1814 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
> 1815
> 1816 if (likely(nskb)) {
> 1817 if (skb->sk)
> 1818 skb_set_owner_w(nskb, skb->sk); <<<< here
> 1819 consume_skb(skb);
> 1820 } else {
> 1821 kfree_skb(skb);
> 1822 }
> 1823 skb = nskb;
> 1824 }
>
> So I do not understand how this can happen.
> With my patch:
> a) if skb is not shared -- it should keep original skb->sk
> b) if skb is shared -- new skb should set sk if it was set on original skb.
>
> Your results can be explained if you looked and skb->sk and truesize right after skb_realloc_headroom() call
> but before following skb_set_owner_w(). Could you please check it?
It seems I've found the reason:
before my change pskb_expand_head() is called for newly cloned skb where sk was not set.
after my change skb->sk is set before following pskb_expand_head() call
On own turn pskb_expand_head() adjust truesize:
net/core/skbuff.c::pskb_expand_head()
1751 /* It is not generally safe to change skb->truesize.
1752 * For the moment, we really care of rx path, or
1753 * when skb is orphaned (not attached to a socket).
1754 */
1755 if (!skb->sk || skb->destructor == sock_edemux)
1756 skb->truesize += size - osize;
1757
1758 return 0;
Could you please confirm it?
Thank you,
Vasily Averin
Christoph Paasch reports [1] about incorrect skb->truesize
after skb_expand_head() call in ip6_xmit.
This happen because skb_set_owner_w() for newly clone skb is called
too early, before pskb_expand_head() where truesize is adjusted for
(!skb-sk) case.
[1] https://lkml.org/lkml/2021/8/20/1082
Reported-by: Christoph Paasch <[email protected]>
Signed-off-by: Vasily Averin <[email protected]>
---
net/core/skbuff.c | 24 +++++++++++++-----------
1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f931176..508d5c4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
+ struct sk_buff *oskb = skb;
+ struct sk_buff *nskb = NULL;
int delta = headroom - skb_headroom(skb);
if (WARN_ONCE(delta <= 0,
@@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
/* pskb_expand_head() might crash, if skb is shared */
if (skb_shared(skb)) {
- struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
-
- if (likely(nskb)) {
- if (skb->sk)
- skb_set_owner_w(nskb, skb->sk);
- consume_skb(skb);
- } else {
- kfree_skb(skb);
- }
+ nskb = skb_clone(skb, GFP_ATOMIC);
skb = nskb;
}
if (skb &&
- pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
- kfree_skb(skb);
+ pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
skb = NULL;
+
+ if (!skb) {
+ kfree_skb(oskb);
+ if (nskb)
+ kfree_skb(nskb);
+ } else if (nskb) {
+ if (oskb->sk)
+ skb_set_owner_w(nskb, oskb->sk);
+ consume_skb(oskb);
}
return skb;
}
--
1.8.3.1
Hello,
On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <[email protected]> wrote:
>
> Christoph Paasch reports [1] about incorrect skb->truesize
> after skb_expand_head() call in ip6_xmit.
> This happen because skb_set_owner_w() for newly clone skb is called
> too early, before pskb_expand_head() where truesize is adjusted for
> (!skb-sk) case.
>
> [1] https://lkml.org/lkml/2021/8/20/1082
>
> Reported-by: Christoph Paasch <[email protected]>
> Signed-off-by: Vasily Averin <[email protected]>
> ---
> net/core/skbuff.c | 24 +++++++++++++-----------
> 1 file changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f931176..508d5c4 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>
> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> {
> + struct sk_buff *oskb = skb;
> + struct sk_buff *nskb = NULL;
> int delta = headroom - skb_headroom(skb);
>
> if (WARN_ONCE(delta <= 0,
> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>
> /* pskb_expand_head() might crash, if skb is shared */
> if (skb_shared(skb)) {
> - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
> -
> - if (likely(nskb)) {
> - if (skb->sk)
> - skb_set_owner_w(nskb, skb->sk);
> - consume_skb(skb);
> - } else {
> - kfree_skb(skb);
> - }
> + nskb = skb_clone(skb, GFP_ATOMIC);
> skb = nskb;
> }
> if (skb &&
> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> - kfree_skb(skb);
> + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
> skb = NULL;
> +
> + if (!skb) {
> + kfree_skb(oskb);
> + if (nskb)
> + kfree_skb(nskb);
> + } else if (nskb) {
> + if (oskb->sk)
> + skb_set_owner_w(nskb, oskb->sk);
> + consume_skb(oskb);
sorry, this does not fix the problem. The syzkaller repro still
triggers the WARN.
When it happens, the skb in ip6_xmit() is not shared as it comes from
__tcp_transmit_skb, where it is skb_clone()'d.
Christoph
> }
> return skb;
> }
> --
> 1.8.3.1
>
On 8/23/21 10:25 AM, Christoph Paasch wrote:
> Hello,
>
> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <[email protected]> wrote:
>>
>> Christoph Paasch reports [1] about incorrect skb->truesize
>> after skb_expand_head() call in ip6_xmit.
>> This happen because skb_set_owner_w() for newly clone skb is called
>> too early, before pskb_expand_head() where truesize is adjusted for
>> (!skb-sk) case.
>>
>> [1] https://lkml.org/lkml/2021/8/20/1082
>>
>> Reported-by: Christoph Paasch <[email protected]>
>> Signed-off-by: Vasily Averin <[email protected]>
>> ---
>> net/core/skbuff.c | 24 +++++++++++++-----------
>> 1 file changed, 13 insertions(+), 11 deletions(-)
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f931176..508d5c4 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>
>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>> {
>> + struct sk_buff *oskb = skb;
>> + struct sk_buff *nskb = NULL;
>> int delta = headroom - skb_headroom(skb);
>>
>> if (WARN_ONCE(delta <= 0,
>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>
>> /* pskb_expand_head() might crash, if skb is shared */
>> if (skb_shared(skb)) {
>> - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>> -
>> - if (likely(nskb)) {
>> - if (skb->sk)
>> - skb_set_owner_w(nskb, skb->sk);
>> - consume_skb(skb);
>> - } else {
>> - kfree_skb(skb);
>> - }
>> + nskb = skb_clone(skb, GFP_ATOMIC);
>> skb = nskb;
>> }
>> if (skb &&
>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>> - kfree_skb(skb);
>> + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>> skb = NULL;
>> +
>> + if (!skb) {
>> + kfree_skb(oskb);
>> + if (nskb)
>> + kfree_skb(nskb);
>> + } else if (nskb) {
>> + if (oskb->sk)
>> + skb_set_owner_w(nskb, oskb->sk);
>> + consume_skb(oskb);
>
> sorry, this does not fix the problem. The syzkaller repro still
> triggers the WARN.
>
> When it happens, the skb in ip6_xmit() is not shared as it comes from
> __tcp_transmit_skb, where it is skb_clone()'d.
>
>
Old code (in skb_realloc_headroom())
was first calling skb2 = skb_clone(skb, GFP_ATOMIC);
At this point, skb2->sk was NULL
So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
I would try :
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
int delta = headroom - skb_headroom(skb);
+ struct sk_buff *oskb = NULL;
if (WARN_ONCE(delta <= 0,
"%s is expecting an increase in the headroom", __func__))
@@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
if (skb_shared(skb)) {
struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
- if (likely(nskb)) {
- if (skb->sk)
- skb_set_owner_w(nskb, skb->sk);
- consume_skb(skb);
- } else {
+ if (unlikely(!nskb)) {
kfree_skb(skb);
+ return NULL;
}
+ oskb = skb;
skb = nskb;
}
- if (skb &&
- pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+ if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
kfree_skb(skb);
- skb = NULL;
+ kfree_skb(oskb);
+ return NULL;
+ }
+ if (oskb) {
+ skb_set_owner_w(skb, oskb->sk);
+ consume_skb(oskb);
}
return skb;
}
On 8/23/21 2:45 PM, Eric Dumazet wrote:
>
>
> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>> Hello,
>>
>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <[email protected]> wrote:
>>>
>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>> after skb_expand_head() call in ip6_xmit.
>>> This happen because skb_set_owner_w() for newly clone skb is called
>>> too early, before pskb_expand_head() where truesize is adjusted for
>>> (!skb-sk) case.
>>>
>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>
>>> Reported-by: Christoph Paasch <[email protected]>
>>> Signed-off-by: Vasily Averin <[email protected]>
>>> ---
>>> net/core/skbuff.c | 24 +++++++++++++-----------
>>> 1 file changed, 13 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index f931176..508d5c4 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>
>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>> {
>>> + struct sk_buff *oskb = skb;
>>> + struct sk_buff *nskb = NULL;
>>> int delta = headroom - skb_headroom(skb);
>>>
>>> if (WARN_ONCE(delta <= 0,
>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>
>>> /* pskb_expand_head() might crash, if skb is shared */
>>> if (skb_shared(skb)) {
>>> - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>> -
>>> - if (likely(nskb)) {
>>> - if (skb->sk)
>>> - skb_set_owner_w(nskb, skb->sk);
>>> - consume_skb(skb);
>>> - } else {
>>> - kfree_skb(skb);
>>> - }
>>> + nskb = skb_clone(skb, GFP_ATOMIC);
>>> skb = nskb;
>>> }
>>> if (skb &&
>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>> - kfree_skb(skb);
>>> + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>> skb = NULL;
>>> +
>>> + if (!skb) {
>>> + kfree_skb(oskb);
>>> + if (nskb)
>>> + kfree_skb(nskb);
>>> + } else if (nskb) {
>>> + if (oskb->sk)
>>> + skb_set_owner_w(nskb, oskb->sk);
>>> + consume_skb(oskb);
>>
>> sorry, this does not fix the problem. The syzkaller repro still
>> triggers the WARN.
>>
>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>> __tcp_transmit_skb, where it is skb_clone()'d.
>>
>>
>
> Old code (in skb_realloc_headroom())
> was first calling skb2 = skb_clone(skb, GFP_ATOMIC);
>
> At this point, skb2->sk was NULL
> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>
> I would try :
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> {
> int delta = headroom - skb_headroom(skb);
> + struct sk_buff *oskb = NULL;
>
> if (WARN_ONCE(delta <= 0,
> "%s is expecting an increase in the headroom", __func__))
> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> if (skb_shared(skb)) {
> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>
> - if (likely(nskb)) {
> - if (skb->sk)
> - skb_set_owner_w(nskb, skb->sk);
> - consume_skb(skb);
> - } else {
> + if (unlikely(!nskb)) {
> kfree_skb(skb);
> + return NULL;
> }
> + oskb = skb;
> skb = nskb;
> }
> - if (skb &&
> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> + if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> kfree_skb(skb);
> - skb = NULL;
> + kfree_skb(oskb);
> + return NULL;
> + }
> + if (oskb) {
> + skb_set_owner_w(skb, oskb->sk);
> + consume_skb(oskb);
> }
> return skb;
> }
Oh well, probably not going to work.
We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
On 8/23/21 2:51 PM, Eric Dumazet wrote:
>
>
> On 8/23/21 2:45 PM, Eric Dumazet wrote:
>>
>>
>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>>> Hello,
>>>
>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <[email protected]> wrote:
>>>>
>>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>>> after skb_expand_head() call in ip6_xmit.
>>>> This happen because skb_set_owner_w() for newly clone skb is called
>>>> too early, before pskb_expand_head() where truesize is adjusted for
>>>> (!skb-sk) case.
>>>>
>>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>>
>>>> Reported-by: Christoph Paasch <[email protected]>
>>>> Signed-off-by: Vasily Averin <[email protected]>
>>>> ---
>>>> net/core/skbuff.c | 24 +++++++++++++-----------
>>>> 1 file changed, 13 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index f931176..508d5c4 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>>
>>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>> {
>>>> + struct sk_buff *oskb = skb;
>>>> + struct sk_buff *nskb = NULL;
>>>> int delta = headroom - skb_headroom(skb);
>>>>
>>>> if (WARN_ONCE(delta <= 0,
>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>
>>>> /* pskb_expand_head() might crash, if skb is shared */
>>>> if (skb_shared(skb)) {
>>>> - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>> -
>>>> - if (likely(nskb)) {
>>>> - if (skb->sk)
>>>> - skb_set_owner_w(nskb, skb->sk);
>>>> - consume_skb(skb);
>>>> - } else {
>>>> - kfree_skb(skb);
>>>> - }
>>>> + nskb = skb_clone(skb, GFP_ATOMIC);
>>>> skb = nskb;
>>>> }
>>>> if (skb &&
>>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>> - kfree_skb(skb);
>>>> + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>> skb = NULL;
>>>> +
>>>> + if (!skb) {
>>>> + kfree_skb(oskb);
>>>> + if (nskb)
>>>> + kfree_skb(nskb);
>>>> + } else if (nskb) {
>>>> + if (oskb->sk)
>>>> + skb_set_owner_w(nskb, oskb->sk);
>>>> + consume_skb(oskb);
>>>
>>> sorry, this does not fix the problem. The syzkaller repro still
>>> triggers the WARN.
>>>
>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>>> __tcp_transmit_skb, where it is skb_clone()'d.
>>>
>>>
>>
>> Old code (in skb_realloc_headroom())
>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC);
>>
>> At this point, skb2->sk was NULL
>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>>
>> I would try :
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>> {
>> int delta = headroom - skb_headroom(skb);
>> + struct sk_buff *oskb = NULL;
>>
>> if (WARN_ONCE(delta <= 0,
>> "%s is expecting an increase in the headroom", __func__))
>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>> if (skb_shared(skb)) {
>> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>
>> - if (likely(nskb)) {
>> - if (skb->sk)
>> - skb_set_owner_w(nskb, skb->sk);
>> - consume_skb(skb);
>> - } else {
>> + if (unlikely(!nskb)) {
>> kfree_skb(skb);
>> + return NULL;
>> }
>> + oskb = skb;
>> skb = nskb;
>> }
>> - if (skb &&
>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>> + if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>> kfree_skb(skb);
>> - skb = NULL;
>> + kfree_skb(oskb);
>> + return NULL;
>> + }
>> + if (oskb) {
>> + skb_set_owner_w(skb, oskb->sk);
>> + consume_skb(oskb);
>> }
>> return skb;
>> }
>
>
> Oh well, probably not going to work.
>
> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
>
I also note that current use of skb_set_owner_w(), forcing skb->destructor to sock_wfree()
is probably breaking TCP Small queues, since original skb->destructor would be tcp_wfree() or __sock_wfree()
On 8/24/21 1:23 AM, Eric Dumazet wrote:
> On 8/23/21 2:51 PM, Eric Dumazet wrote:
>> On 8/23/21 2:45 PM, Eric Dumazet wrote:
>>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>>>> Hello,
>>>>
>>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <[email protected]> wrote:
>>>>>
>>>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>>>> after skb_expand_head() call in ip6_xmit.
>>>>> This happen because skb_set_owner_w() for newly clone skb is called
>>>>> too early, before pskb_expand_head() where truesize is adjusted for
>>>>> (!skb-sk) case.
>>>>>
>>>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>>>
>>>>> Reported-by: Christoph Paasch <[email protected]>
>>>>> Signed-off-by: Vasily Averin <[email protected]>
>>>>> ---
>>>>> net/core/skbuff.c | 24 +++++++++++++-----------
>>>>> 1 file changed, 13 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>> index f931176..508d5c4 100644
>>>>> --- a/net/core/skbuff.c
>>>>> +++ b/net/core/skbuff.c
>>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>>>
>>>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>> {
>>>>> + struct sk_buff *oskb = skb;
>>>>> + struct sk_buff *nskb = NULL;
>>>>> int delta = headroom - skb_headroom(skb);
>>>>>
>>>>> if (WARN_ONCE(delta <= 0,
>>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>
>>>>> /* pskb_expand_head() might crash, if skb is shared */
>>>>> if (skb_shared(skb)) {
>>>>> - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>>> -
>>>>> - if (likely(nskb)) {
>>>>> - if (skb->sk)
>>>>> - skb_set_owner_w(nskb, skb->sk);
>>>>> - consume_skb(skb);
>>>>> - } else {
>>>>> - kfree_skb(skb);
>>>>> - }
>>>>> + nskb = skb_clone(skb, GFP_ATOMIC);
>>>>> skb = nskb;
>>>>> }
>>>>> if (skb &&
>>>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>>> - kfree_skb(skb);
>>>>> + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>>> skb = NULL;
>>>>> +
>>>>> + if (!skb) {
>>>>> + kfree_skb(oskb);
>>>>> + if (nskb)
>>>>> + kfree_skb(nskb);
>>>>> + } else if (nskb) {
>>>>> + if (oskb->sk)
>>>>> + skb_set_owner_w(nskb, oskb->sk);
>>>>> + consume_skb(oskb);
>>>>
>>>> sorry, this does not fix the problem. The syzkaller repro still
>>>> triggers the WARN.
>>>>
>>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>>>> __tcp_transmit_skb, where it is skb_clone()'d.
>>>>
>>>>
>>>
>>> Old code (in skb_realloc_headroom())
>>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC);
>>>
>>> At this point, skb2->sk was NULL
>>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>>>
>>> I would try :
>>>
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>> {
>>> int delta = headroom - skb_headroom(skb);
>>> + struct sk_buff *oskb = NULL;
>>>
>>> if (WARN_ONCE(delta <= 0,
>>> "%s is expecting an increase in the headroom", __func__))
>>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>> if (skb_shared(skb)) {
>>> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>
>>> - if (likely(nskb)) {
>>> - if (skb->sk)
>>> - skb_set_owner_w(nskb, skb->sk);
>>> - consume_skb(skb);
>>> - } else {
>>> + if (unlikely(!nskb)) {
>>> kfree_skb(skb);
>>> + return NULL;
>>> }
>>> + oskb = skb;
>>> skb = nskb;
>>> }
>>> - if (skb &&
>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>> + if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>> kfree_skb(skb);
>>> - skb = NULL;
>>> + kfree_skb(oskb);
>>> + return NULL;
>>> + }
>>> + if (oskb) {
>>> + skb_set_owner_w(skb, oskb->sk);
>>> + consume_skb(oskb);
>>> }
>>> return skb;
>>> }
>>
>>
>> Oh well, probably not going to work.
>>
>> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
Can we adjust truesize outside pskb_expand_head()?
Could you please explain why it can be not safe?
> I also note that current use of skb_set_owner_w(), forcing skb->destructor to sock_wfree()
> is probably breaking TCP Small queues, since original skb->destructor would be tcp_wfree() or __sock_wfree()
I agree, however as far as I understand it is separate and more global problem.
Thank you,
Vasily Averin
On 8/24/21 11:50 AM, Vasily Averin wrote:
> On 8/24/21 1:23 AM, Eric Dumazet wrote:
>> On 8/23/21 2:51 PM, Eric Dumazet wrote:
>>> On 8/23/21 2:45 PM, Eric Dumazet wrote:
>>>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>>>>> Hello,
>>>>>
>>>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <[email protected]> wrote:
>>>>>>
>>>>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>>>>> after skb_expand_head() call in ip6_xmit.
>>>>>> This happen because skb_set_owner_w() for newly clone skb is called
>>>>>> too early, before pskb_expand_head() where truesize is adjusted for
>>>>>> (!skb-sk) case.
>>>>>>
>>>>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>>>>
>>>>>> Reported-by: Christoph Paasch <[email protected]>
>>>>>> Signed-off-by: Vasily Averin <[email protected]>
>>>>>> ---
>>>>>> net/core/skbuff.c | 24 +++++++++++++-----------
>>>>>> 1 file changed, 13 insertions(+), 11 deletions(-)
>>>>>>
>>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>>> index f931176..508d5c4 100644
>>>>>> --- a/net/core/skbuff.c
>>>>>> +++ b/net/core/skbuff.c
>>>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>>>>
>>>>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>> {
>>>>>> + struct sk_buff *oskb = skb;
>>>>>> + struct sk_buff *nskb = NULL;
>>>>>> int delta = headroom - skb_headroom(skb);
>>>>>>
>>>>>> if (WARN_ONCE(delta <= 0,
>>>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>>
>>>>>> /* pskb_expand_head() might crash, if skb is shared */
>>>>>> if (skb_shared(skb)) {
>>>>>> - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>>>> -
>>>>>> - if (likely(nskb)) {
>>>>>> - if (skb->sk)
>>>>>> - skb_set_owner_w(nskb, skb->sk);
>>>>>> - consume_skb(skb);
>>>>>> - } else {
>>>>>> - kfree_skb(skb);
>>>>>> - }
>>>>>> + nskb = skb_clone(skb, GFP_ATOMIC);
>>>>>> skb = nskb;
>>>>>> }
>>>>>> if (skb &&
>>>>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>>>> - kfree_skb(skb);
>>>>>> + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>>>> skb = NULL;
>>>>>> +
>>>>>> + if (!skb) {
>>>>>> + kfree_skb(oskb);
>>>>>> + if (nskb)
>>>>>> + kfree_skb(nskb);
>>>>>> + } else if (nskb) {
>>>>>> + if (oskb->sk)
>>>>>> + skb_set_owner_w(nskb, oskb->sk);
>>>>>> + consume_skb(oskb);
>>>>>
>>>>> sorry, this does not fix the problem. The syzkaller repro still
>>>>> triggers the WARN.
>>>>>
>>>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>>>>> __tcp_transmit_skb, where it is skb_clone()'d.
>>>>>
>>>>>
>>>>
>>>> Old code (in skb_realloc_headroom())
>>>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC);
>>>>
>>>> At this point, skb2->sk was NULL
>>>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>>>>
>>>> I would try :
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>> {
>>>> int delta = headroom - skb_headroom(skb);
>>>> + struct sk_buff *oskb = NULL;
>>>>
>>>> if (WARN_ONCE(delta <= 0,
>>>> "%s is expecting an increase in the headroom", __func__))
>>>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>> if (skb_shared(skb)) {
>>>> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>>
>>>> - if (likely(nskb)) {
>>>> - if (skb->sk)
>>>> - skb_set_owner_w(nskb, skb->sk);
>>>> - consume_skb(skb);
>>>> - } else {
>>>> + if (unlikely(!nskb)) {
>>>> kfree_skb(skb);
>>>> + return NULL;
>>>> }
>>>> + oskb = skb;
>>>> skb = nskb;
>>>> }
>>>> - if (skb &&
>>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>> + if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>> kfree_skb(skb);
>>>> - skb = NULL;
>>>> + kfree_skb(oskb);
>>>> + return NULL;
>>>> + }
>>>> + if (oskb) {
>>>> + skb_set_owner_w(skb, oskb->sk);
>>>> + consume_skb(oskb);
>>>> }
>>>> return skb;
>>>> }
>>>
>>>
>>> Oh well, probably not going to work.
>>>
>>> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
>
> Can we adjust truesize outside pskb_expand_head()?
> Could you please explain why it can be not safe?
Do you mean truesize change should not break balance of sk->sk_wmem_alloc?
>> I also note that current use of skb_set_owner_w(), forcing skb->destructor to sock_wfree()
>> is probably breaking TCP Small queues, since original skb->destructor would be tcp_wfree() or __sock_wfree()
>
> I agree, however as far as I understand it is separate and more global problem.
>
> Thank you,
> Vasily Averin
>
On Tue, Aug 24, 2021 at 10:22 AM Vasily Averin <[email protected]> wrote:
>
> On 8/24/21 11:50 AM, Vasily Averin wrote:
> > On 8/24/21 1:23 AM, Eric Dumazet wrote:
> >> On 8/23/21 2:51 PM, Eric Dumazet wrote:
> >>> On 8/23/21 2:45 PM, Eric Dumazet wrote:
> >>>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
> >>>>> Hello,
> >>>>>
> >>>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <[email protected]> wrote:
> >>>>>>
> >>>>>> Christoph Paasch reports [1] about incorrect skb->truesize
> >>>>>> after skb_expand_head() call in ip6_xmit.
> >>>>>> This happen because skb_set_owner_w() for newly clone skb is called
> >>>>>> too early, before pskb_expand_head() where truesize is adjusted for
> >>>>>> (!skb-sk) case.
> >>>>>>
> >>>>>> [1] https://lkml.org/lkml/2021/8/20/1082
> >>>>>>
> >>>>>> Reported-by: Christoph Paasch <[email protected]>
> >>>>>> Signed-off-by: Vasily Averin <[email protected]>
> >>>>>> ---
> >>>>>> net/core/skbuff.c | 24 +++++++++++++-----------
> >>>>>> 1 file changed, 13 insertions(+), 11 deletions(-)
> >>>>>>
> >>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>>>> index f931176..508d5c4 100644
> >>>>>> --- a/net/core/skbuff.c
> >>>>>> +++ b/net/core/skbuff.c
> >>>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
> >>>>>>
> >>>>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> >>>>>> {
> >>>>>> + struct sk_buff *oskb = skb;
> >>>>>> + struct sk_buff *nskb = NULL;
> >>>>>> int delta = headroom - skb_headroom(skb);
> >>>>>>
> >>>>>> if (WARN_ONCE(delta <= 0,
> >>>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> >>>>>>
> >>>>>> /* pskb_expand_head() might crash, if skb is shared */
> >>>>>> if (skb_shared(skb)) {
> >>>>>> - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
> >>>>>> -
> >>>>>> - if (likely(nskb)) {
> >>>>>> - if (skb->sk)
> >>>>>> - skb_set_owner_w(nskb, skb->sk);
> >>>>>> - consume_skb(skb);
> >>>>>> - } else {
> >>>>>> - kfree_skb(skb);
> >>>>>> - }
> >>>>>> + nskb = skb_clone(skb, GFP_ATOMIC);
> >>>>>> skb = nskb;
> >>>>>> }
> >>>>>> if (skb &&
> >>>>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> >>>>>> - kfree_skb(skb);
> >>>>>> + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
> >>>>>> skb = NULL;
> >>>>>> +
> >>>>>> + if (!skb) {
> >>>>>> + kfree_skb(oskb);
> >>>>>> + if (nskb)
> >>>>>> + kfree_skb(nskb);
> >>>>>> + } else if (nskb) {
> >>>>>> + if (oskb->sk)
> >>>>>> + skb_set_owner_w(nskb, oskb->sk);
> >>>>>> + consume_skb(oskb);
> >>>>>
> >>>>> sorry, this does not fix the problem. The syzkaller repro still
> >>>>> triggers the WARN.
> >>>>>
> >>>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
> >>>>> __tcp_transmit_skb, where it is skb_clone()'d.
> >>>>>
> >>>>>
> >>>>
> >>>> Old code (in skb_realloc_headroom())
> >>>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC);
> >>>>
> >>>> At this point, skb2->sk was NULL
> >>>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
> >>>>
> >>>> I would try :
> >>>>
> >>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
> >>>> --- a/net/core/skbuff.c
> >>>> +++ b/net/core/skbuff.c
> >>>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
> >>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> >>>> {
> >>>> int delta = headroom - skb_headroom(skb);
> >>>> + struct sk_buff *oskb = NULL;
> >>>>
> >>>> if (WARN_ONCE(delta <= 0,
> >>>> "%s is expecting an increase in the headroom", __func__))
> >>>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> >>>> if (skb_shared(skb)) {
> >>>> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
> >>>>
> >>>> - if (likely(nskb)) {
> >>>> - if (skb->sk)
> >>>> - skb_set_owner_w(nskb, skb->sk);
> >>>> - consume_skb(skb);
> >>>> - } else {
> >>>> + if (unlikely(!nskb)) {
> >>>> kfree_skb(skb);
> >>>> + return NULL;
> >>>> }
> >>>> + oskb = skb;
> >>>> skb = nskb;
> >>>> }
> >>>> - if (skb &&
> >>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> >>>> + if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> >>>> kfree_skb(skb);
> >>>> - skb = NULL;
> >>>> + kfree_skb(oskb);
> >>>> + return NULL;
> >>>> + }
> >>>> + if (oskb) {
> >>>> + skb_set_owner_w(skb, oskb->sk);
> >>>> + consume_skb(oskb);
> >>>> }
> >>>> return skb;
> >>>> }
> >>>
> >>>
> >>> Oh well, probably not going to work.
> >>>
> >>> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
> >
> > Can we adjust truesize outside pskb_expand_head()?
> > Could you please explain why it can be not safe?
>
> Do you mean truesize change should not break balance of sk->sk_wmem_alloc?
AFAICS, that's the problem around adjusting truesize. So, maybe "just"
refcount_add the increase of the truesize.
The below does fix the syzkaller bug for me and seems to do the right
thing overall. But I honestly think that this is becoming too hacky
and not worth it... and who knows what other corner-cases this now
exposes...
Maybe a revert is a better course of action?
---
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f9311762cc47..9cc18a0fdd1c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -71,6 +71,7 @@
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/page_pool.h>
+#include <net/tcp.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -1756,9 +1757,14 @@ int pskb_expand_head(struct sk_buff *skb, int
nhead, int ntail,
* For the moment, we really care of rx path, or
* when skb is orphaned (not attached to a socket).
*/
- if (!skb->sk || skb->destructor == sock_edemux)
+ if (!skb->sk || skb->destructor == sock_edemux || skb->destructor ==
tcp_wfree) {
skb->truesize += size - osize;
+ if (skb->sk && skb->destructor == tcp_wfree) {
+ refcount_add(size - osize, &skb->sk->sk_wmem_alloc);
+ }
+ }
+
return 0;
nofrags:
On 8/24/21 1:23 AM, Eric Dumazet wrote:
> On 8/23/21 2:51 PM, Eric Dumazet wrote:
>> On 8/23/21 2:45 PM, Eric Dumazet wrote:
>>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>>>> Hello,
>>>>
>>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <[email protected]> wrote:
>>>>>
>>>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>>>> after skb_expand_head() call in ip6_xmit.
>>>>> This happen because skb_set_owner_w() for newly clone skb is called
>>>>> too early, before pskb_expand_head() where truesize is adjusted for
>>>>> (!skb-sk) case.
>>>>>
>>>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>>>
>>>>> Reported-by: Christoph Paasch <[email protected]>
>>>>> Signed-off-by: Vasily Averin <[email protected]>
>>>>> ---
>>>>> net/core/skbuff.c | 24 +++++++++++++-----------
>>>>> 1 file changed, 13 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>> index f931176..508d5c4 100644
>>>>> --- a/net/core/skbuff.c
>>>>> +++ b/net/core/skbuff.c
>>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>>>
>>>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>> {
>>>>> + struct sk_buff *oskb = skb;
>>>>> + struct sk_buff *nskb = NULL;
>>>>> int delta = headroom - skb_headroom(skb);
>>>>>
>>>>> if (WARN_ONCE(delta <= 0,
>>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>
>>>>> /* pskb_expand_head() might crash, if skb is shared */
>>>>> if (skb_shared(skb)) {
>>>>> - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>>> -
>>>>> - if (likely(nskb)) {
>>>>> - if (skb->sk)
>>>>> - skb_set_owner_w(nskb, skb->sk);
>>>>> - consume_skb(skb);
>>>>> - } else {
>>>>> - kfree_skb(skb);
>>>>> - }
>>>>> + nskb = skb_clone(skb, GFP_ATOMIC);
>>>>> skb = nskb;
>>>>> }
>>>>> if (skb &&
>>>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>>> - kfree_skb(skb);
>>>>> + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>>> skb = NULL;
>>>>> +
>>>>> + if (!skb) {
>>>>> + kfree_skb(oskb);
>>>>> + if (nskb)
>>>>> + kfree_skb(nskb);
>>>>> + } else if (nskb) {
>>>>> + if (oskb->sk)
>>>>> + skb_set_owner_w(nskb, oskb->sk);
>>>>> + consume_skb(oskb);
>>>>
>>>> sorry, this does not fix the problem. The syzkaller repro still
>>>> triggers the WARN.
>>>>
>>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>>>> __tcp_transmit_skb, where it is skb_clone()'d.
>>>>
>>>>
>>>
>>> Old code (in skb_realloc_headroom())
>>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC);
>>>
>>> At this point, skb2->sk was NULL
>>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>>>
>>> I would try :
>>>
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>> {
>>> int delta = headroom - skb_headroom(skb);
>>> + struct sk_buff *oskb = NULL;
>>>
>>> if (WARN_ONCE(delta <= 0,
>>> "%s is expecting an increase in the headroom", __func__))
>>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>> if (skb_shared(skb)) {
>>> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>
>>> - if (likely(nskb)) {
>>> - if (skb->sk)
>>> - skb_set_owner_w(nskb, skb->sk);
>>> - consume_skb(skb);
>>> - } else {
>>> + if (unlikely(!nskb)) {
>>> kfree_skb(skb);
>>> + return NULL;
>>> }
>>> + oskb = skb;
>>> skb = nskb;
>>> }
>>> - if (skb &&
>>> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>> + if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>> kfree_skb(skb);
>>> - skb = NULL;
>>> + kfree_skb(oskb);
>>> + return NULL;
>>> + }
>>> + if (oskb) {
>>> + skb_set_owner_w(skb, oskb->sk);
>>> + consume_skb(oskb);
>>> }
>>> return skb;
>>> }
>> Oh well, probably not going to work.
>>
>> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
>
> I also note that current use of skb_set_owner_w(), forcing skb->destructor to sock_wfree()
> is probably breaking TCP Small queues, since original skb->destructor would be tcp_wfree() or __sock_wfree()
I asked Alexey Kuznetsov to look at this problem. Below is his answer:
"I think the current scheme is obsolete. It was created
when we had only two kinds of skb accounting (rmem & wmem)
and with more kinds of accounting it just does not work.
Even there we had ignored problems with adjusting accounting.
Logically the best solution would be replacing ->destructor,
set_owner* etc with skb_ops. Something like:
struct skb_ops
{
void init(struct sk_buff * skb, struct skb_ops * ops, struct
sock * owner);
void fini(struct sk_buff * skb);
void update(struct sk_buff * skb, int adjust);
void inherit(struct sk_buff * skb2, struct sk_buff * skb);
};
init - is replacement for skb_set_owner_r|w
fini - is replacement for skb_orphan
update - is new operation to be used in places where skb->truesize changes,
instead of awful constructions like:
if (!skb->sk || skb->destructor == sock_edemux)
skb->truesize += size - osize;
Now it will look like:
if (skb->ops)
skb->ops->update(skb, size - osize);
inherit - is replacement for also awful constructs like:
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
Now it will be:
if (skb->ops)
skb->ops->inherit(skb2, skb);
The implementation looks mostly obvious.
Some troubles can be only with new functionality:
update of accounting was never done before.
More efficient, functionally equivalent, but uglier and less flexible
alternative would be removal of ->destructor, replaced with
a small numeric indicator of ownership:
enum
{
SKB_OWNER_NONE, /* aka destructor == NULL */
SKB_OWNER_WMEM, /* aka destructor == sk_wfree */
SKB_OWNER_RMEM, /* aka destructor == sk_rfree */
SKB_OWNER_SK, /* aka destructor == sk_edemux */
SKB_OWNER_TCP, /* aka destructor == tcp_wfree */
}
And the same init,fini,inherit,update become functions
w/o any inidirect calls. Not sure it is really more efficient though."
Thank you,
Vasily Averin
On 8/27/21 8:23 AM, Vasily Averin wrote:
> I asked Alexey Kuznetsov to look at this problem. Below is his answer:
> "I think the current scheme is obsolete. It was created
> when we had only two kinds of skb accounting (rmem & wmem)
> and with more kinds of accounting it just does not work.
> Even there we had ignored problems with adjusting accounting.
>
> Logically the best solution would be replacing ->destructor,
> set_owner* etc with skb_ops. Something like:
>
> struct skb_ops
> {
> void init(struct sk_buff * skb, struct skb_ops * ops, struct
> sock * owner);
> void fini(struct sk_buff * skb);
> void update(struct sk_buff * skb, int adjust);
> void inherit(struct sk_buff * skb2, struct sk_buff * skb);
> };
>
> init - is replacement for skb_set_owner_r|w
> fini - is replacement for skb_orphan
> update - is new operation to be used in places where skb->truesize changes,
> instead of awful constructions like:
>
> if (!skb->sk || skb->destructor == sock_edemux)
> skb->truesize += size - osize;
>
> Now it will look like:
>
> if (skb->ops)
> skb->ops->update(skb, size - osize);
>
> inherit - is replacement for also awful constructs like:
>
> if (skb->sk)
> skb_set_owner_w(skb2, skb->sk);
>
> Now it will be:
>
> if (skb->ops)
> skb->ops->inherit(skb2, skb);
>
> The implementation looks mostly obvious.
> Some troubles can be only with new functionality:
> update of accounting was never done before.
>
>
> More efficient, functionally equivalent, but uglier and less flexible
> alternative would be removal of ->destructor, replaced with
> a small numeric indicator of ownership:
>
> enum
> {
> SKB_OWNER_NONE, /* aka destructor == NULL */
> SKB_OWNER_WMEM, /* aka destructor == sk_wfree */
> SKB_OWNER_RMEM, /* aka destructor == sk_rfree */
> SKB_OWNER_SK, /* aka destructor == sk_edemux */
> SKB_OWNER_TCP, /* aka destructor == tcp_wfree */
> }
>
> And the same init,fini,inherit,update become functions
> w/o any inidirect calls. Not sure it is really more efficient though."
>
Well, this does not look as stable material, and would add a bunch
of indirect calls which are quite expensive these days (CONFIG_RETPOLINE=y)
I suggest we work on a fix, using existing infra, then eventually later
try to refactor if this is really bringing improvements.
A fix could simply be a revert of 0c9f227bee119 ("ipv6: use skb_expand_head in ip6_xmit")
since only IPv6 has the problem (because of arbitrary headers size)
On 8/27/21 7:47 PM, Eric Dumazet wrote:
>
>
> On 8/27/21 8:23 AM, Vasily Averin wrote:
>
>> I asked Alexey Kuznetsov to look at this problem. Below is his answer:
>> "I think the current scheme is obsolete. It was created
>> when we had only two kinds of skb accounting (rmem & wmem)
>> and with more kinds of accounting it just does not work.
>> Even there we had ignored problems with adjusting accounting.
>>
>> Logically the best solution would be replacing ->destructor,
>> set_owner* etc with skb_ops. Something like:
>>
>> struct skb_ops
>> {
>> void init(struct sk_buff * skb, struct skb_ops * ops, struct
>> sock * owner);
>> void fini(struct sk_buff * skb);
>> void update(struct sk_buff * skb, int adjust);
>> void inherit(struct sk_buff * skb2, struct sk_buff * skb);
>> };
>>
>> init - is replacement for skb_set_owner_r|w
>> fini - is replacement for skb_orphan
>> update - is new operation to be used in places where skb->truesize changes,
>> instead of awful constructions like:
>>
>> if (!skb->sk || skb->destructor == sock_edemux)
>> skb->truesize += size - osize;
>>
>> Now it will look like:
>>
>> if (skb->ops)
>> skb->ops->update(skb, size - osize);
>>
>> inherit - is replacement for also awful constructs like:
>>
>> if (skb->sk)
>> skb_set_owner_w(skb2, skb->sk);
>>
>> Now it will be:
>>
>> if (skb->ops)
>> skb->ops->inherit(skb2, skb);
>>
>> The implementation looks mostly obvious.
>> Some troubles can be only with new functionality:
>> update of accounting was never done before.
>>
>>
>> More efficient, functionally equivalent, but uglier and less flexible
>> alternative would be removal of ->destructor, replaced with
>> a small numeric indicator of ownership:
>>
>> enum
>> {
>> SKB_OWNER_NONE, /* aka destructor == NULL */
>> SKB_OWNER_WMEM, /* aka destructor == sk_wfree */
>> SKB_OWNER_RMEM, /* aka destructor == sk_rfree */
>> SKB_OWNER_SK, /* aka destructor == sk_edemux */
>> SKB_OWNER_TCP, /* aka destructor == tcp_wfree */
>> }
>>
>> And the same init,fini,inherit,update become functions
>> w/o any inidirect calls. Not sure it is really more efficient though."
>>
>
> Well, this does not look as stable material, and would add a bunch
> of indirect calls which are quite expensive these days (CONFIG_RETPOLINE=y)
>
> I suggest we work on a fix, using existing infra, then eventually later
> try to refactor if this is really bringing improvements.
>
> A fix could simply be a revert of 0c9f227bee119 ("ipv6: use skb_expand_head in ip6_xmit")
> since only IPv6 has the problem (because of arbitrary headers size)
I think it is not enough.
Root of the problem is that skb_expand_head() works incorrectly with non-shared skb.
In this case it do not call skb_clone before pskb_expand_head() execution,
and as result pskb_expand_head() and does not adjust skb->truesize.
I think non-shared skb is more frequent case,
so all skb_expand_head() are affected.
Therefore we need to revert all my patch set in net-next:
f1260ff skbuff: introduce skb_expand_head()
e415ed3 ipv6: use skb_expand_head in ip6_finish_output2
0c9f227 ipv6: use skb_expand_head in ip6_xmit
5678a59 ipv4: use skb_expand_head in ip_finish_output2
14ee70c vrf: use skb_expand_head in vrf_finish_output
53744a4 ax25: use skb_expand_head
a1e975e bpf: use skb_expand_head in bpf_out_neigh_v4/6
07e1d6b Merge branch 'skb_expand_head'
with fixup
06669e6 vrf: fix NULL dereference in vrf_finish_output()
And then rework ip6_finish_output2() in upstream,
to call skb_realloc_headroom() like it was done in first patch version:
https://lkml.org/lkml/2021/7/7/469.
Thank you,
Vasily Averin
Christoph Paasch reports [1] about incorrect skb->truesize
after skb_expand_head() call in ip6_xmit.
This may happen because of two reasons:
- skb_set_owner_w() for newly cloned skb is called too early,
before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
- pskb_expand_head() does not adjust truesize in (skb->sk) case.
In this case sk->sk_wmem_alloc should be adjusted too.
[1] https://lkml.org/lkml/2021/8/20/1082
Reported-by: Christoph Paasch <[email protected]>
Signed-off-by: Vasily Averin <[email protected]>
---
v2: based on patch version from Eric Dumazet,
added __pskb_expand_head() function, which can be forced
to adjust skb->truesize and sk->sk_wmem_alloc.
---
net/core/skbuff.c | 43 +++++++++++++++++++++++++++++--------------
1 file changed, 29 insertions(+), 14 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f931176..4691023 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1681,10 +1681,10 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
* reloaded after call to this function.
*/
-int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
- gfp_t gfp_mask)
+static int __pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+ gfp_t gfp_mask, bool update_truesize)
{
- int i, osize = skb_end_offset(skb);
+ int delta, i, osize = skb_end_offset(skb);
int size = osize + nhead + ntail;
long off;
u8 *data;
@@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
* For the moment, we really care of rx path, or
* when skb is orphaned (not attached to a socket).
*/
- if (!skb->sk || skb->destructor == sock_edemux)
- skb->truesize += size - osize;
-
+ delta = size - osize;
+ if (!skb->sk || skb->destructor == sock_edemux) {
+ skb->truesize += delta;
+ } else if (update_truesize) {
+ refcount_add(delta, &skb->sk->sk_wmem_alloc);
+ skb->truesize += delta;
+ }
return 0;
nofrags:
@@ -1766,6 +1770,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
nodata:
return -ENOMEM;
}
+
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+ gfp_t gfp_mask)
+{
+ return __pskb_expand_head(skb, nhead, ntail, gfp_mask, false);
+}
EXPORT_SYMBOL(pskb_expand_head);
/* Make private copy of skb with writable head and some headroom */
@@ -1804,28 +1814,33 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
int delta = headroom - skb_headroom(skb);
+ struct sk_buff *oskb = NULL;
if (WARN_ONCE(delta <= 0,
"%s is expecting an increase in the headroom", __func__))
return skb;
+ delta = SKB_DATA_ALIGN(delta);
/* pskb_expand_head() might crash, if skb is shared */
if (skb_shared(skb)) {
struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
- if (likely(nskb)) {
- if (skb->sk)
- skb_set_owner_w(nskb, skb->sk);
- consume_skb(skb);
- } else {
+ if (unlikely(!nskb)) {
kfree_skb(skb);
+ return NULL;
}
+ oskb = skb;
skb = nskb;
}
- if (skb &&
- pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+ if (__pskb_expand_head(skb, delta, 0, GFP_ATOMIC, true)) {
kfree_skb(skb);
- skb = NULL;
+ kfree_skb(oskb);
+ return NULL;
+ }
+ if (oskb) {
+ if (oskb->sk)
+ skb_set_owner_w(skb, oskb->sk);
+ consume_skb(oskb);
}
return skb;
}
--
1.8.3.1
1) I forgot to specify that the patch is intended fro net-next git
2) I forgot to ad Alexey Kuznetsov in cc. I resend the patch to him
in a separate letter and received his consent.
3) I forgot to set Fixed mark
Fixes: f1260ff15a71 ("skbuff: introduce skb_expand_head()")
Thank you,
Vasily Averin
On 8/29/21 3:59 PM, Vasily Averin wrote:
> Christoph Paasch reports [1] about incorrect skb->truesize
> after skb_expand_head() call in ip6_xmit.
> This may happen because of two reasons:
> - skb_set_owner_w() for newly cloned skb is called too early,
> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
> - pskb_expand_head() does not adjust truesize in (skb->sk) case.
> In this case sk->sk_wmem_alloc should be adjusted too.
>
> [1] https://lkml.org/lkml/2021/8/20/1082
>
> Reported-by: Christoph Paasch <[email protected]>
> Signed-off-by: Vasily Averin <[email protected]>
> ---
> v2: based on patch version from Eric Dumazet,
> added __pskb_expand_head() function, which can be forced
> to adjust skb->truesize and sk->sk_wmem_alloc.
> ---
> net/core/skbuff.c | 43 +++++++++++++++++++++++++++++--------------
> 1 file changed, 29 insertions(+), 14 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f931176..4691023 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1681,10 +1681,10 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
> * reloaded after call to this function.
> */
>
> -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> - gfp_t gfp_mask)
> +static int __pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> + gfp_t gfp_mask, bool update_truesize)
> {
> - int i, osize = skb_end_offset(skb);
> + int delta, i, osize = skb_end_offset(skb);
> int size = osize + nhead + ntail;
> long off;
> u8 *data;
> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> * For the moment, we really care of rx path, or
> * when skb is orphaned (not attached to a socket).
> */
> - if (!skb->sk || skb->destructor == sock_edemux)
> - skb->truesize += size - osize;
> -
> + delta = size - osize;
> + if (!skb->sk || skb->destructor == sock_edemux) {
> + skb->truesize += delta;
> + } else if (update_truesize) {
> + refcount_add(delta, &skb->sk->sk_wmem_alloc);
> + skb->truesize += delta;
> + }
> return 0;
>
> nofrags:
> @@ -1766,6 +1770,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> nodata:
> return -ENOMEM;
> }
> +
> +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> + gfp_t gfp_mask)
> +{
> + return __pskb_expand_head(skb, nhead, ntail, gfp_mask, false);
> +}
> EXPORT_SYMBOL(pskb_expand_head);
>
> /* Make private copy of skb with writable head and some headroom */
> @@ -1804,28 +1814,33 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> {
> int delta = headroom - skb_headroom(skb);
> + struct sk_buff *oskb = NULL;
>
> if (WARN_ONCE(delta <= 0,
> "%s is expecting an increase in the headroom", __func__))
> return skb;
>
> + delta = SKB_DATA_ALIGN(delta);
> /* pskb_expand_head() might crash, if skb is shared */
> if (skb_shared(skb)) {
> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>
> - if (likely(nskb)) {
> - if (skb->sk)
> - skb_set_owner_w(nskb, skb->sk);
> - consume_skb(skb);
> - } else {
> + if (unlikely(!nskb)) {
> kfree_skb(skb);
> + return NULL;
> }
> + oskb = skb;
> skb = nskb;
> }
> - if (skb &&
> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> + if (__pskb_expand_head(skb, delta, 0, GFP_ATOMIC, true)) {
> kfree_skb(skb);
> - skb = NULL;
> + kfree_skb(oskb);
> + return NULL;
> + }
> + if (oskb) {
> + if (oskb->sk)
> + skb_set_owner_w(skb, oskb->sk);
> + consume_skb(oskb);
> }
> return skb;
> }
>
On 8/29/21 5:59 AM, Vasily Averin wrote:
> Christoph Paasch reports [1] about incorrect skb->truesize
> after skb_expand_head() call in ip6_xmit.
> This may happen because of two reasons:
> - skb_set_owner_w() for newly cloned skb is called too early,
> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
> - pskb_expand_head() does not adjust truesize in (skb->sk) case.
> In this case sk->sk_wmem_alloc should be adjusted too.
>
> [1] https://lkml.org/lkml/2021/8/20/1082
>
> Reported-by: Christoph Paasch <[email protected]>
> Signed-off-by: Vasily Averin <[email protected]>
> ---
> v2: based on patch version from Eric Dumazet,
> added __pskb_expand_head() function, which can be forced
> to adjust skb->truesize and sk->sk_wmem_alloc.
> ---
> net/core/skbuff.c | 43 +++++++++++++++++++++++++++++--------------
> 1 file changed, 29 insertions(+), 14 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f931176..4691023 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1681,10 +1681,10 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
> * reloaded after call to this function.
> */
>
> -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> - gfp_t gfp_mask)
> +static int __pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> + gfp_t gfp_mask, bool update_truesize)
> {
> - int i, osize = skb_end_offset(skb);
> + int delta, i, osize = skb_end_offset(skb);
> int size = osize + nhead + ntail;
> long off;
> u8 *data;
> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> * For the moment, we really care of rx path, or
> * when skb is orphaned (not attached to a socket).
> */
> - if (!skb->sk || skb->destructor == sock_edemux)
> - skb->truesize += size - osize;
> -
> + delta = size - osize;
> + if (!skb->sk || skb->destructor == sock_edemux) {
> + skb->truesize += delta;
> + } else if (update_truesize) {
Unfortunately we can not always do this sk_wmem_alloc change here.
Some skb have skb->sk set, but the 'reference on socket' is not through sk_wmem_alloc
It seems you need a helper to make sure skb->destructor is one of
the destructors that use skb->truesize and sk->sk_wmem_alloc
For instance, skb_orphan_partial() could have been used.
> + refcount_add(delta, &skb->sk->sk_wmem_alloc);
> + skb->truesize += delta;
> + }
> return 0;
>
> nofrags:
> @@ -1766,6 +1770,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> nodata:
> return -ENOMEM;
> }
> +
> +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> + gfp_t gfp_mask)
> +{
> + return __pskb_expand_head(skb, nhead, ntail, gfp_mask, false);
> +}
> EXPORT_SYMBOL(pskb_expand_head);
>
> /* Make private copy of skb with writable head and some headroom */
> @@ -1804,28 +1814,33 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> {
> int delta = headroom - skb_headroom(skb);
> + struct sk_buff *oskb = NULL;
>
> if (WARN_ONCE(delta <= 0,
> "%s is expecting an increase in the headroom", __func__))
> return skb;
>
> + delta = SKB_DATA_ALIGN(delta);
> /* pskb_expand_head() might crash, if skb is shared */
> if (skb_shared(skb)) {
> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>
> - if (likely(nskb)) {
> - if (skb->sk)
> - skb_set_owner_w(nskb, skb->sk);
> - consume_skb(skb);
> - } else {
> + if (unlikely(!nskb)) {
> kfree_skb(skb);
> + return NULL;
> }
> + oskb = skb;
> skb = nskb;
> }
> - if (skb &&
> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> + if (__pskb_expand_head(skb, delta, 0, GFP_ATOMIC, true)) {
> kfree_skb(skb);
> - skb = NULL;
> + kfree_skb(oskb);
> + return NULL;
> + }
> + if (oskb) {
> + if (oskb->sk)
> + skb_set_owner_w(skb, oskb->sk);
> + consume_skb(oskb);
> }
> return skb;
> }
>
On 8/30/21 9:09 PM, Vasily Averin wrote:
> On 8/30/21 7:01 PM, Eric Dumazet wrote:
>> On 8/29/21 5:59 AM, Vasily Averin wrote:
>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>> after skb_expand_head() call in ip6_xmit.
>>> This may happen because of two reasons:
>>> - skb_set_owner_w() for newly cloned skb is called too early,
>>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
>>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.
>>> In this case sk->sk_wmem_alloc should be adjusted too.
>>>
>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>>> * For the moment, we really care of rx path, or
>>> * when skb is orphaned (not attached to a socket).
>>> */
>>> - if (!skb->sk || skb->destructor == sock_edemux)
>>> - skb->truesize += size - osize;
>>> -
>>> + delta = size - osize;
>>> + if (!skb->sk || skb->destructor == sock_edemux) {
>>> + skb->truesize += delta;
>>> + } else if (update_truesize) {
>>
>> Unfortunately we can not always do this sk_wmem_alloc change here.
>>
>> Some skb have skb->sk set, but the 'reference on socket' is not through sk_wmem_alloc
>
> Could you please provide some example?
> In past in all handeled cases we have cloned original skb and then unconditionally assigned skb sock_wfree destructor.
> Do you want to say that it worked correctly somehow?
>
> I expected if we set sock_wfree, we have guarantee that old skb adjusted sk_wmem_alloc.
> Am I wrong?
> Could you please point on such case?
However if it is true -- it is not enough to adjust sk_wmem_alloc for proper destructors,
because another destructors may require to do something else.
In this case I can check destructor first and clone skb before pskb_expand_head() call,
like it was happen before.
>> It seems you need a helper to make sure skb->destructor is one of
>> the destructors that use skb->truesize and sk->sk_wmem_alloc
>>
>> For instance, skb_orphan_partial() could have been used.
>
> Thank you, will investigate.
> Vasily Averin
>
On 8/30/21 7:01 PM, Eric Dumazet wrote:
> On 8/29/21 5:59 AM, Vasily Averin wrote:
>> Christoph Paasch reports [1] about incorrect skb->truesize
>> after skb_expand_head() call in ip6_xmit.
>> This may happen because of two reasons:
>> - skb_set_owner_w() for newly cloned skb is called too early,
>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.
>> In this case sk->sk_wmem_alloc should be adjusted too.
>>
>> [1] https://lkml.org/lkml/2021/8/20/1082
>> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>> * For the moment, we really care of rx path, or
>> * when skb is orphaned (not attached to a socket).
>> */
>> - if (!skb->sk || skb->destructor == sock_edemux)
>> - skb->truesize += size - osize;
>> -
>> + delta = size - osize;
>> + if (!skb->sk || skb->destructor == sock_edemux) {
>> + skb->truesize += delta;
>> + } else if (update_truesize) {
>
> Unfortunately we can not always do this sk_wmem_alloc change here.
>
> Some skb have skb->sk set, but the 'reference on socket' is not through sk_wmem_alloc
Could you please provide some example?
In past in all handeled cases we have cloned original skb and then unconditionally assigned skb sock_wfree destructor.
Do you want to say that it worked correctly somehow?
I expected if we set sock_wfree, we have guarantee that old skb adjusted sk_wmem_alloc.
Am I wrong?
Could you please point on such case?
> It seems you need a helper to make sure skb->destructor is one of
> the destructors that use skb->truesize and sk->sk_wmem_alloc
>
> For instance, skb_orphan_partial() could have been used.
Thank you, will investigate.
Vasily Averin
On 8/30/21 11:09 AM, Vasily Averin wrote:
> On 8/30/21 7:01 PM, Eric Dumazet wrote:
>> On 8/29/21 5:59 AM, Vasily Averin wrote:
>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>> after skb_expand_head() call in ip6_xmit.
>>> This may happen because of two reasons:
>>> - skb_set_owner_w() for newly cloned skb is called too early,
>>> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
>>> - pskb_expand_head() does not adjust truesize in (skb->sk) case.
>>> In this case sk->sk_wmem_alloc should be adjusted too.
>>>
>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>> @@ -1756,9 +1756,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>>> * For the moment, we really care of rx path, or
>>> * when skb is orphaned (not attached to a socket).
>>> */
>>> - if (!skb->sk || skb->destructor == sock_edemux)
>>> - skb->truesize += size - osize;
>>> -
>>> + delta = size - osize;
>>> + if (!skb->sk || skb->destructor == sock_edemux) {
>>> + skb->truesize += delta;
>>> + } else if (update_truesize) {
>>
>> Unfortunately we can not always do this sk_wmem_alloc change here.
>>
>> Some skb have skb->sk set, but the 'reference on socket' is not through sk_wmem_alloc
>
> Could you please provide some example?
> In past in all handeled cases we have cloned original skb and then unconditionally assigned skb sock_wfree destructor.
In the past we ignored old value of skb->destructor,
since the clone got a NULL destructor.
In your patch you assumes it is sock_wfree, or other destructors changing sk_wmem_alloc
You need to make sure skb->destructor is one of the known destructors which
will basically remove skb->truesize from sk->sk_wmem_alloc.
This will also make sure skb->sk is a 'full socket'
If not, you should not change sk->sk_wmem_alloc
> Do you want to say that it worked correctly somehow?
I am simply saying your patch adds a wrong assumption.
>
> I expected if we set sock_wfree, we have guarantee that old skb adjusted sk_wmem_alloc.
> Am I wrong?
> Could you please point on such case?
>
>> It seems you need a helper to make sure skb->destructor is one of
>> the destructors that use skb->truesize and sk->sk_wmem_alloc
>>
>> For instance, skb_orphan_partial() could have been used.
>
> Thank you, will investigate.
> Vasily Averin
>
RFC because it have an extra changes:
new is_skb_wmem() helper can be called
- either before pskb_expand_head(), to create skb clones
for skb with destructors that does not change sk->sk_wmem_alloc
- or after pskb_expand_head(), to change owner in skb_set_owner_w()
In current patch I've added both these ways,
we need to keep one of them.
---
Christoph Paasch reports [1] about incorrect skb->truesize
after skb_expand_head() call in ip6_xmit.
This may happen because of two reasons:
- skb_set_owner_w() for newly cloned skb is called too early,
before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
- pskb_expand_head() does not adjust truesize in (skb->sk) case.
In this case sk->sk_wmem_alloc should be adjusted too.
[1] https://lkml.org/lkml/2021/8/20/1082
Reported-by: Christoph Paasch <[email protected]>
Signed-off-by: Vasily Averin <[email protected]>
---
v3: removed __pskb_expand_head(),
added is_skb_wmem() helper for skb with wmem-compatible destructors
there are 2 ways to use it:
- before pskb_expand_head(), to create skb clones
- after pskb_expand_head(), to change owner on extended skb.
v2: based on patch version from Eric Dumazet,
added __pskb_expand_head() function, which can be forced
to adjust skb->truesize and sk->sk_wmem_alloc.
---
include/net/sock.h | 1 +
net/core/skbuff.c | 39 ++++++++++++++++++++++++++++-----------
net/core/sock.c | 8 ++++++++
3 files changed, 37 insertions(+), 11 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 95b2577..173d58c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority);
void __sock_wfree(struct sk_buff *skb);
void sock_wfree(struct sk_buff *skb);
+bool is_skb_wmem(const struct sk_buff *skb);
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
gfp_t priority);
void skb_orphan_partial(struct sk_buff *skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f931176..3ce33f2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1804,30 +1804,47 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
int delta = headroom - skb_headroom(skb);
+ int osize = skb_end_offset(skb);
+ struct sk_buff *oskb = NULL;
+ struct sock *sk = skb->sk;
if (WARN_ONCE(delta <= 0,
"%s is expecting an increase in the headroom", __func__))
return skb;
- /* pskb_expand_head() might crash, if skb is shared */
- if (skb_shared(skb)) {
+ delta = SKB_DATA_ALIGN(delta);
+ /* pskb_expand_head() might crash, if skb is shared.
+ * Also we should clone skb if its destructor does
+ * not adjust skb->truesize and sk->sk_wmem_alloc
+ */
+ if (skb_shared(skb) ||
+ (sk && (!sk_fullsock(sk) || !is_skb_wmem(skb)))) {
struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
- if (likely(nskb)) {
- if (skb->sk)
- skb_set_owner_w(nskb, skb->sk);
- consume_skb(skb);
- } else {
+ if (unlikely(!nskb)) {
kfree_skb(skb);
+ return NULL;
}
+ oskb = skb;
skb = nskb;
}
- if (skb &&
- pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+ if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {
kfree_skb(skb);
- skb = NULL;
+ kfree_skb(oskb);
+ return NULL;
}
- return skb;
+ if (oskb) {
+ if (sk)
+ skb_set_owner_w(skb, sk);
+ consume_skb(oskb);
+ } else if (sk) {
+ delta = osize - skb_end_offset(skb);
+ if (!is_skb_wmem(skb))
+ skb_set_owner_w(skb, sk);
+ skb->truesize += delta;
+ if (sk_fullsock(sk))
+ refcount_add(delta, &sk->sk_wmem_alloc);
+ } return skb;
}
EXPORT_SYMBOL(skb_expand_head);
diff --git a/net/core/sock.c b/net/core/sock.c
index 950f1e7..0315dcb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2227,6 +2227,14 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
}
EXPORT_SYMBOL(skb_set_owner_w);
+bool is_skb_wmem(const struct sk_buff *skb)
+{
+ return (skb->destructor == sock_wfree ||
+ skb->destructor == __sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
+}
+EXPORT_SYMBOL(is_skb_wmem);
+
static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
#ifdef CONFIG_TLS_DEVICE
--
1.8.3.1
On 8/31/21 7:34 AM, Vasily Averin wrote:
> RFC because it have an extra changes:
> new is_skb_wmem() helper can be called
> - either before pskb_expand_head(), to create skb clones
> for skb with destructors that does not change sk->sk_wmem_alloc
> - or after pskb_expand_head(), to change owner in skb_set_owner_w()
>
> In current patch I've added both these ways,
> we need to keep one of them.
> ---
> Christoph Paasch reports [1] about incorrect skb->truesize
> after skb_expand_head() call in ip6_xmit.
> This may happen because of two reasons:
> - skb_set_owner_w() for newly cloned skb is called too early,
> before pskb_expand_head() where truesize is adjusted for (!skb-sk) case.
> - pskb_expand_head() does not adjust truesize in (skb->sk) case.
> In this case sk->sk_wmem_alloc should be adjusted too.
>
> [1] https://lkml.org/lkml/2021/8/20/1082
>
> Reported-by: Christoph Paasch <[email protected]>
> Signed-off-by: Vasily Averin <[email protected]>
> ---
> v3: removed __pskb_expand_head(),
> added is_skb_wmem() helper for skb with wmem-compatible destructors
> there are 2 ways to use it:
> - before pskb_expand_head(), to create skb clones
> - after pskb_expand_head(), to change owner on extended skb.
> v2: based on patch version from Eric Dumazet,
> added __pskb_expand_head() function, which can be forced
> to adjust skb->truesize and sk->sk_wmem_alloc.
> ---
> include/net/sock.h | 1 +
> net/core/skbuff.c | 39 ++++++++++++++++++++++++++++-----------
> net/core/sock.c | 8 ++++++++
> 3 files changed, 37 insertions(+), 11 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 95b2577..173d58c 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1695,6 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
> gfp_t priority);
> void __sock_wfree(struct sk_buff *skb);
> void sock_wfree(struct sk_buff *skb);
> +bool is_skb_wmem(const struct sk_buff *skb);
> struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
> gfp_t priority);
> void skb_orphan_partial(struct sk_buff *skb);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f931176..3ce33f2 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1804,30 +1804,47 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> {
> int delta = headroom - skb_headroom(skb);
> + int osize = skb_end_offset(skb);
> + struct sk_buff *oskb = NULL;
> + struct sock *sk = skb->sk;
>
> if (WARN_ONCE(delta <= 0,
> "%s is expecting an increase in the headroom", __func__))
> return skb;
>
> - /* pskb_expand_head() might crash, if skb is shared */
> - if (skb_shared(skb)) {
> + delta = SKB_DATA_ALIGN(delta);
> + /* pskb_expand_head() might crash, if skb is shared.
> + * Also we should clone skb if its destructor does
> + * not adjust skb->truesize and sk->sk_wmem_alloc
> + */
> + if (skb_shared(skb) ||
> + (sk && (!sk_fullsock(sk) || !is_skb_wmem(skb)))) {
is_skb_wmem() is only possibly true for full sockets by definition.
So the (sk_fullsock(sk) && is_skb_wmem(skb)) can be reduced to is_skb_wmem(skb)
> struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>
> - if (likely(nskb)) {
> - if (skb->sk)
> - skb_set_owner_w(nskb, skb->sk);
> - consume_skb(skb);
> - } else {
> + if (unlikely(!nskb)) {
> kfree_skb(skb);
> + return NULL;
> }
> + oskb = skb;
> skb = nskb;
> }
> - if (skb &&
> - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> + if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {
> kfree_skb(skb);
> - skb = NULL;
> + kfree_skb(oskb);
> + return NULL;
> }
> - return skb;
> + if (oskb) {
> + if (sk)
> + skb_set_owner_w(skb, sk);
Broken for non full sockets.
Calling skb_set_owner_w(skb, sk) for them is a bug.
> + consume_skb(oskb);
> + } else if (sk) {
> + delta = osize - skb_end_offset(skb);
> + if (!is_skb_wmem(skb))
> + skb_set_owner_w(skb, sk);
This would be broken for non full sockets.
Calling skb_set_owner_w(skb, sk) for them is a bug.
> + skb->truesize += delta;
> + if (sk_fullsock(sk))
> + refcount_add(delta, &sk->sk_wmem_alloc);
> + } return skb;
> }
> EXPORT_SYMBOL(skb_expand_head);
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 950f1e7..0315dcb 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2227,6 +2227,14 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
> }
> EXPORT_SYMBOL(skb_set_owner_w);
>
> +bool is_skb_wmem(const struct sk_buff *skb)
> +{
> + return (skb->destructor == sock_wfree ||
> + skb->destructor == __sock_wfree ||
> + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
No need for return (EXPRESSION);
You can simply : return EXPRESSION;
ie
return skb->destructor == sock_wfree ||
skb->destructor == __sock_wfree ||
(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);
> +}
> +EXPORT_SYMBOL(is_skb_wmem);
> +
> static bool can_skb_orphan_partial(const struct sk_buff *skb)
> {
> #ifdef CONFIG_TLS_DEVICE
>
On 8/31/21 10:38 PM, Eric Dumazet wrote:
> On 8/31/21 7:34 AM, Vasily Averin wrote:
>> RFC because it have an extra changes:
>> new is_skb_wmem() helper can be called
>> - either before pskb_expand_head(), to create skb clones
>> for skb with destructors that does not change sk->sk_wmem_alloc
>> - or after pskb_expand_head(), to change owner in skb_set_owner_w()
>>
>> In current patch I've added both these ways,
>> we need to keep one of them.
If nobody object I vote for 2nd way:
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f931176..3ce33f2 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -1804,30 +1804,47 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>> struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>> {
... skipped ...
>> - return skb;
>> + if (oskb) {
>> + if (sk)
>> + skb_set_owner_w(skb, sk);
>
> Broken for non full sockets.
> Calling skb_set_owner_w(skb, sk) for them is a bug.
I think you're wrong here.
It is 100% equivalent of old code,
skb_set_owner_w() handles sk_fullsock(sk) inside and does not adjust sk->sk_wmem_alloc.
Please explain if I'm wrong.
>> + consume_skb(oskb);
>> + } else if (sk) {
>> + delta = osize - skb_end_offset(skb);
>> + if (!is_skb_wmem(skb))
>> + skb_set_owner_w(skb, sk);
>
> This would be broken for non full sockets.
> Calling skb_set_owner_w(skb, sk) for them is a bug.
See my comment above.
>> + skb->truesize += delta;
>> + if (sk_fullsock(sk))
>> + refcount_add(delta, &sk->sk_wmem_alloc);
>
>
>> + } return skb;
Strange line, will fix it.
>> }
>> EXPORT_SYMBOL(skb_expand_head);