Existing L2TP kernel code does not derive the optimal MTU for Ethernet
pseudowires and instead leaves this to a userspace L2TP daemon or
operator. If an MTU is not specified, the existing kernel code chooses
an MTU that does not take account of all tunnel header overheads, which
can lead to unwanted IP fragmentation. When L2TP is used without a
control plane (userspace daemon), we would prefer that the kernel does a
better job of choosing a default pseudowire MTU, taking account of all
tunnel header overheads, including IP header options, if any. This patch
addresses this.
Change-set here uses the new kernel function, kernel_sock_ip_overhead(),
to factor the outer IP overhead on the L2TP tunnel socket (including
IP Options, if any) when calculating the default MTU for an Ethernet
pseudowire, along with consideration of the inner Ethernet header.
Signed-off-by: R. Parameswaran <[email protected]>
---
net/l2tp/l2tp_eth.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 51 insertions(+), 4 deletions(-)
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 8bf18a5..9c18a4e 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -30,6 +30,9 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
#include "l2tp_core.h"
@@ -204,6 +207,53 @@ static void l2tp_eth_show(struct seq_file *m, void *arg)
}
#endif
+static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
+ struct l2tp_session *session,
+ struct net_device *dev)
+{
+ unsigned int overhead = 0;
+ struct dst_entry *dst;
+ u32 l3_overhead = 0;
+
+ /* if the encap is UDP, account for UDP header size */
+ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
+ overhead += sizeof(struct udphdr);
+ dev->needed_headroom += sizeof(struct udphdr);
+ }
+ if (session->mtu != 0) {
+ dev->mtu = session->mtu;
+ dev->needed_headroom += session->hdr_len;
+ return;
+ }
+ l3_overhead = kernel_sock_ip_overhead(tunnel->sock);
+ if (l3_overhead == 0) {
+ /* L3 Overhead couldn't be identified, this could be
+ * because tunnel->sock was NULL or the socket's
+ * address family was not IPv4 or IPv6,
+ * dev mtu stays at 1500.
+ */
+ return;
+ }
+ /* Adjust MTU, factor overhead - underlay L3, overlay L2 hdr
+ * UDP overhead, if any, was already factored in above.
+ */
+ overhead += session->hdr_len + ETH_HLEN + l3_overhead;
+
+ /* If PMTU discovery was enabled, use discovered MTU on L2TP device */
+ dst = sk_dst_get(tunnel->sock);
+ if (dst) {
+ /* dst_mtu will use PMTU if found, else fallback to intf MTU */
+ u32 pmtu = dst_mtu(dst);
+
+ if (pmtu != 0)
+ dev->mtu = pmtu;
+ dst_release(dst);
+ }
+ session->mtu = dev->mtu - overhead;
+ dev->mtu = session->mtu;
+ dev->needed_headroom += session->hdr_len;
+}
+
static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
{
struct net_device *dev;
@@ -253,12 +303,9 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
}
dev_net_set(dev, net);
- if (session->mtu == 0)
- session->mtu = dev->mtu - session->hdr_len;
- dev->mtu = session->mtu;
- dev->needed_headroom += session->hdr_len;
dev->min_mtu = 0;
dev->max_mtu = ETH_MAX_MTU;
+ l2tp_eth_adjust_mtu(tunnel, session, dev);
priv = netdev_priv(dev);
priv->dev = dev;
--
2.1.4
On Wed, Apr 05, 2017 at 05:00:07PM -0700, R. Parameswaran wrote:
>
> Change-set here uses the new kernel function, kernel_sock_ip_overhead(),
> to factor the outer IP overhead on the L2TP tunnel socket (including
> IP Options, if any) when calculating the default MTU for an Ethernet
> pseudowire, along with consideration of the inner Ethernet header.
>
I get the following warning with CONFIG_LOCKDEP when creating a new
session:
# ip l2tp add tunnel local 10.1.8.64 remote 10.1.8.32 udp_sport 1701 udp_dport 1701 tunnel_id 1 peer_tunnel_id 1
# ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1
...
[ 45.524535] ------------[ cut here ]------------
[ 45.524570] WARNING: CPU: 3 PID: 732 at ./include/net/sock.h:1509 kernel_sock_ip_overhead+0x54/0x1a1
[ 45.524574] Modules linked in: l2tp_eth l2tp_netlink l2tp_core ip6_udp_tunnel udp_tunnel crc32c_intel ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev acpi_cpufreq processor button serio_raw ext4 crc16 jbd2 mbcache virtio_blk virtio_net virtio_pci virtio_ring virtio
[ 45.524696] CPU: 3 PID: 732 Comm: ip Not tainted 4.11.0-rc5 #1
[ 45.524700] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 45.524704] Call Trace:
[ 45.524714] dump_stack+0x67/0x90
[ 45.524725] __warn+0xfd/0x118
[ 45.524739] warn_slowpath_null+0x18/0x1a
[ 45.524747] kernel_sock_ip_overhead+0x54/0x1a1
[ 45.524761] l2tp_eth_create+0x1eb/0x557 [l2tp_eth]
[ 45.524768] ? __mutex_unlock_slowpath+0xb5/0x2c2
[ 45.524787] ? l2tp_eth_dev_uninit+0xd9/0xd9 [l2tp_eth]
[ 45.524800] l2tp_nl_cmd_session_create+0x521/0x56b [l2tp_netlink]
[ 45.524827] genl_family_rcv_msg+0x445/0x4b3
[ 45.524857] genl_rcv_msg+0x60/0x84
[ 45.524867] ? genl_family_rcv_msg+0x4b3/0x4b3
[ 45.524875] netlink_rcv_skb+0x95/0x102
[ 45.524881] ? down_read+0x41/0x62
[ 45.524893] genl_rcv+0x23/0x32
[ 45.524901] netlink_unicast+0x1b0/0x23b
[ 45.524915] netlink_sendmsg+0x46f/0x48f
[ 45.524933] ? netlink_unicast+0x23b/0x23b
[ 45.524942] sock_sendmsg_nosec+0x41/0x51
[ 45.524953] sock_sendmsg+0x33/0x38
[ 45.524962] ___sys_sendmsg+0x2a0/0x374
[ 45.524991] ? do_raw_spin_unlock+0xc2/0xcc
[ 45.525002] ? _raw_spin_unlock+0x22/0x25
[ 45.525014] ? match_held_lock+0x20/0x113
[ 45.525027] ? __fget_light+0x89/0xae
[ 45.525045] __sys_sendmsg+0x40/0x6b
[ 45.525052] ? __sys_sendmsg+0x40/0x6b
[ 45.525075] SyS_sendmsg+0x9/0xb
[ 45.525083] entry_SYSCALL_64_fastpath+0x18/0xad
[ 45.525089] RIP: 0033:0x7fb224391690
[ 45.525094] RSP: 002b:00007ffe53943dd8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[ 45.525104] RAX: ffffffffffffffda RBX: 00007ffe539480f0 RCX: 00007fb224391690
[ 45.525108] RDX: 0000000000000000 RSI: 00007ffe53943e20 RDI: 0000000000000004
[ 45.525113] RBP: ffffffff810a1e49 R08: 0000000000000000 R09: 0000000000000005
[ 45.525119] R10: 0000000000000000 R11: 0000000000000246 R12: ffff88003436ff98
[ 45.525124] R13: 0000000000000046 R14: 00007ffe539486a0 R15: 00007ffe53947ea0
[ 45.525136] ? trace_hardirqs_off_caller+0x121/0x12f
[ 45.525157] ---[ end trace 0834023e7b30e761 ]---
I guess you neet to lock_sock(tunnel->socket) before calling
kernel_sock_ip_overhead().
Hi Guillaume,
On Tue, Apr 11, 2017 at 3:40 AM, Guillaume Nault <[email protected]> wrote:
> On Wed, Apr 05, 2017 at 05:00:07PM -0700, R. Parameswaran wrote:
>>
>> Change-set here uses the new kernel function, kernel_sock_ip_overhead(),
>> to factor the outer IP overhead on the L2TP tunnel socket (including
>> IP Options, if any) when calculating the default MTU for an Ethernet
>> pseudowire, along with consideration of the inner Ethernet header.
>>
> I get the following warning with CONFIG_LOCKDEP when creating a new
> session:
> # ip l2tp add tunnel local 10.1.8.64 remote 10.1.8.32 udp_sport 1701 udp_dport 1701 tunnel_id 1 peer_tunnel_id 1
> # ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1
> ...
Thanks for reporting this - I'll try and put up a patch soon,
hopefully the patch can stay in while I add this. One Q - how many CPU
cores do you have? Can you give me some idea of how many tunnels and
sessions when you saw this?
I did not see this warning in my testing, possibly because
CONFIG_LOCKDEP_SUPPORT is turned off on the product build? Will
re-test with this turned on.
thanks,
Ramkumar
> [ 45.524535] ------------[ cut here ]------------
> [ 45.524570] WARNING: CPU: 3 PID: 732 at ./include/net/sock.h:1509 kernel_sock_ip_overhead+0x54/0x1a1
> [ 45.524574] Modules linked in: l2tp_eth l2tp_netlink l2tp_core ip6_udp_tunnel udp_tunnel crc32c_intel ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev acpi_cpufreq processor button serio_raw ext4 crc16 jbd2 mbcache virtio_blk virtio_net virtio_pci virtio_ring virtio
> [ 45.524696] CPU: 3 PID: 732 Comm: ip Not tainted 4.11.0-rc5 #1
> [ 45.524700] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> [ 45.524704] Call Trace:
> [ 45.524714] dump_stack+0x67/0x90
> [ 45.524725] __warn+0xfd/0x118
> [ 45.524739] warn_slowpath_null+0x18/0x1a
> [ 45.524747] kernel_sock_ip_overhead+0x54/0x1a1
> [ 45.524761] l2tp_eth_create+0x1eb/0x557 [l2tp_eth]
> [ 45.524768] ? __mutex_unlock_slowpath+0xb5/0x2c2
> [ 45.524787] ? l2tp_eth_dev_uninit+0xd9/0xd9 [l2tp_eth]
> [ 45.524800] l2tp_nl_cmd_session_create+0x521/0x56b [l2tp_netlink]
> [ 45.524827] genl_family_rcv_msg+0x445/0x4b3
> [ 45.524857] genl_rcv_msg+0x60/0x84
> [ 45.524867] ? genl_family_rcv_msg+0x4b3/0x4b3
> [ 45.524875] netlink_rcv_skb+0x95/0x102
> [ 45.524881] ? down_read+0x41/0x62
> [ 45.524893] genl_rcv+0x23/0x32
> [ 45.524901] netlink_unicast+0x1b0/0x23b
> [ 45.524915] netlink_sendmsg+0x46f/0x48f
> [ 45.524933] ? netlink_unicast+0x23b/0x23b
> [ 45.524942] sock_sendmsg_nosec+0x41/0x51
> [ 45.524953] sock_sendmsg+0x33/0x38
> [ 45.524962] ___sys_sendmsg+0x2a0/0x374
> [ 45.524991] ? do_raw_spin_unlock+0xc2/0xcc
> [ 45.525002] ? _raw_spin_unlock+0x22/0x25
> [ 45.525014] ? match_held_lock+0x20/0x113
> [ 45.525027] ? __fget_light+0x89/0xae
> [ 45.525045] __sys_sendmsg+0x40/0x6b
> [ 45.525052] ? __sys_sendmsg+0x40/0x6b
> [ 45.525075] SyS_sendmsg+0x9/0xb
> [ 45.525083] entry_SYSCALL_64_fastpath+0x18/0xad
> [ 45.525089] RIP: 0033:0x7fb224391690
> [ 45.525094] RSP: 002b:00007ffe53943dd8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
> [ 45.525104] RAX: ffffffffffffffda RBX: 00007ffe539480f0 RCX: 00007fb224391690
> [ 45.525108] RDX: 0000000000000000 RSI: 00007ffe53943e20 RDI: 0000000000000004
> [ 45.525113] RBP: ffffffff810a1e49 R08: 0000000000000000 R09: 0000000000000005
> [ 45.525119] R10: 0000000000000000 R11: 0000000000000246 R12: ffff88003436ff98
> [ 45.525124] R13: 0000000000000046 R14: 00007ffe539486a0 R15: 00007ffe53947ea0
> [ 45.525136] ? trace_hardirqs_off_caller+0x121/0x12f
> [ 45.525157] ---[ end trace 0834023e7b30e761 ]---
>
> I guess you neet to lock_sock(tunnel->socket) before calling
> kernel_sock_ip_overhead().
On Tue, Apr 11, 2017 at 09:39:58AM -0700, R Parameswaran wrote:
> Hi Guillaume,
>
> On Tue, Apr 11, 2017 at 3:40 AM, Guillaume Nault <[email protected]> wrote:
> > On Wed, Apr 05, 2017 at 05:00:07PM -0700, R. Parameswaran wrote:
> >>
> >> Change-set here uses the new kernel function, kernel_sock_ip_overhead(),
> >> to factor the outer IP overhead on the L2TP tunnel socket (including
> >> IP Options, if any) when calculating the default MTU for an Ethernet
> >> pseudowire, along with consideration of the inner Ethernet header.
> >>
> > I get the following warning with CONFIG_LOCKDEP when creating a new
> > session:
> > # ip l2tp add tunnel local 10.1.8.64 remote 10.1.8.32 udp_sport 1701 udp_dport 1701 tunnel_id 1 peer_tunnel_id 1
> > # ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1
> > ...
>
> Thanks for reporting this - I'll try and put up a patch soon,
> hopefully the patch can stay in while I add this. One Q - how many CPU
> cores do you have?
This is a virtual machine with 4 vcores, but that shouldn't matter.
> Can you give me some idea of how many tunnels and
> sessions when you saw this?
>
Creating one session is enough. I simply used the following command:
# ip l2tp add tunnel local 10.1.8.64 remote 10.1.8.32 udp_sport 1701 udp_dport 1701 tunnel_id 1 peer_tunnel_id 1
# ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1
> I did not see this warning in my testing, possibly because
> CONFIG_LOCKDEP_SUPPORT is turned off on the product build? Will
> re-test with this turned on.
>
Yes, enabling lockdep should let you reproduce the problem.
The issue goes away if the tunnel's socket is locked while calling
kernel_sock_ip_overhead():
+ lock_sock(tunnel->sock);
kernel_sock_ip_overhead(tunnel->sock);
+ release_sock(tunnel->sock);
Hi Guillaume,
Please see inline:
On Tue, Apr 11, 2017 at 10:05 AM, Guillaume Nault <[email protected]> wrote:
> On Tue, Apr 11, 2017 at 09:39:58AM -0700, R Parameswaran wrote:
>> Hi Guillaume,
>>
>> On Tue, Apr 11, 2017 at 3:40 AM, Guillaume Nault <[email protected]> wrote:
>> > On Wed, Apr 05, 2017 at 05:00:07PM -0700, R. Parameswaran wrote:
>> >>
>> >> Change-set here uses the new kernel function, kernel_sock_ip_overhead(),
>> >> to factor the outer IP overhead on the L2TP tunnel socket (including
>> >> IP Options, if any) when calculating the default MTU for an Ethernet
>> >> pseudowire, along with consideration of the inner Ethernet header.
>> >>
>> > I get the following warning with CONFIG_LOCKDEP when creating a new
>> > session:
>> > # ip l2tp add tunnel local 10.1.8.64 remote 10.1.8.32 udp_sport 1701 udp_dport 1701 tunnel_id 1 peer_tunnel_id 1
>> > # ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1
>> > ...
>>
>> Thanks for reporting this - I'll try and put up a patch soon,
>> hopefully the patch can stay in while I add this. One Q - how many CPU
>> cores do you have?
> This is a virtual machine with 4 vcores, but that shouldn't matter.
>
>> Can you give me some idea of how many tunnels and
>> sessions when you saw this?
>>
> Creating one session is enough. I simply used the following command:
> # ip l2tp add tunnel local 10.1.8.64 remote 10.1.8.32 udp_sport 1701 udp_dport 1701 tunnel_id 1 peer_tunnel_id 1
> # ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1
>
>> I did not see this warning in my testing, possibly because
>> CONFIG_LOCKDEP_SUPPORT is turned off on the product build? Will
>> re-test with this turned on.
>>
> Yes, enabling lockdep should let you reproduce the problem.
>
> The issue goes away if the tunnel's socket is locked while calling
> kernel_sock_ip_overhead():
> + lock_sock(tunnel->sock);
> kernel_sock_ip_overhead(tunnel->sock);
> + release_sock(tunnel->sock);
Ack, thanks - was thinking along this line, since I see similar code
at other places in L2TP. I'll try and have a preliminary
patch out by tonight.
regards,
Ramkumar