LinuxLists.cc - [PATCH v5] ipvs: allow tunneling with gue encapsulation

2019-03-19 05:28:50

Subject: [PATCH v5] ipvs: allow tunneling with gue encapsulation

ipip packets are blocked in some public cloud environments. This patch
allows gue encapsulation with the tunneling method, which makes
tunneling work in those environments.

Signed-off-by: Jacky Hu <[email protected]>
---
include/net/ip_vs.h | 5 ++
include/uapi/linux/ip_vs.h | 11 +++
net/netfilter/ipvs/ip_vs_ctl.c | 43 ++++++++++-
net/netfilter/ipvs/ip_vs_xmit.c | 124 ++++++++++++++++++++++++++++++--
4 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a0d2e0bb9a94..cdc7b621930d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -603,6 +603,9 @@ struct ip_vs_dest_user_kern {

/* Address family of addr */
u16 af;
+
+ u16 tun_type; /* tunnel type */
+ __be16 tun_port; /* tunnel port */
};

@@ -663,6 +666,8 @@ struct ip_vs_dest {
atomic_t conn_flags; /* flags to copy to conn */
atomic_t weight; /* server weight */
atomic_t last_weight; /* server latest weight */
+ __u16 tun_type; /* tunnel type */
+ __be16 tun_port; /* tunnel port */

refcount_t refcnt; /* reference counter */
struct ip_vs_stats stats; /* statistics */
diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 1c916b2f89dc..e34f436fc79d 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -124,6 +124,13 @@

#define IP_VS_PEDATA_MAXLEN 255

+/* Tunnel types */
+enum {
+ IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */
+ IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */
+ IP_VS_CONN_F_TUNNEL_TYPE_MAX,
+};
+
/*
* The struct ip_vs_service_user and struct ip_vs_dest_user are
* used to set IPVS rules through setsockopt.
@@ -392,6 +399,10 @@ enum {

IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */

+ IPVS_DEST_ATTR_TUN_TYPE, /* tunnel type */
+
+ IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */
+
__IPVS_DEST_ATTR_MAX,
};

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 432141f04af3..ddee6266b78b 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -830,6 +830,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;

+ /* set the tunnel info */
+ dest->tun_type = udest->tun_type;
+ dest->tun_port = udest->tun_port;
+
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
@@ -980,6 +984,17 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ERANGE;
}

+ switch (udest->tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ if (udest->tun_port == 0) {
+ pr_err("%s(): tunnel port is zero\n", __func__);
+ return -EINVAL;
+ }
+ break;
+ default:
+ break;
+ }
+
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);

/* We use function that requires RCU lock */
@@ -1044,6 +1059,17 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ERANGE;
}

+ switch (udest->tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ if (udest->tun_port == 0) {
+ pr_err("%s(): tunnel port is zero\n", __func__);
+ return -EINVAL;
+ }
+ break;
+ default:
+ break;
+ }
+
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);

/* We use function that requires RCU lock */
@@ -2310,6 +2336,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
udest->u_threshold = udest_compat->u_threshold;
udest->l_threshold = udest_compat->l_threshold;
udest->af = AF_INET;
+ udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
}

static int
@@ -2869,6 +2896,8 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
[IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
[IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
+ [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
+ [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
};

static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
@@ -3172,6 +3201,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
IP_VS_CONN_F_FWD_MASK)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
atomic_read(&dest->weight)) ||
+ nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
+ dest->tun_type) ||
+ nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
+ dest->tun_port) ||
nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
@@ -3294,12 +3327,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
- *nla_l_thresh;
+ *nla_l_thresh, *nla_tun_type, *nla_tun_port;

nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
+ nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
+ nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];

if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
return -EINVAL;
@@ -3309,6 +3344,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
udest->weight = nla_get_u32(nla_weight);
udest->u_threshold = nla_get_u32(nla_u_thresh);
udest->l_threshold = nla_get_u32(nla_l_thresh);
+
+ if (nla_tun_type)
+ udest->tun_type = nla_get_u8(nla_tun_type);
+
+ if (nla_tun_port)
+ udest->tun_port = nla_get_be16(nla_tun_port);
}

return 0;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 473cce2a5231..730e108b9f36 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -32,6 +32,7 @@
#include <linux/slab.h>
#include <linux/tcp.h> /* for tcphdr */
#include <net/ip.h>
+#include <net/gue.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
@@ -382,6 +383,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst);
} else {
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (!dest)
+ goto err_put;
+ switch (dest->tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+ break;
+ default:
+ break;
+ }
if (mtu < 68) {
IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
goto err_put;
@@ -533,6 +543,15 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst);
else {
mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (!dest)
+ goto err_put;
+ switch (dest->tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+ break;
+ default:
+ break;
+ }
if (mtu < IPV6_MIN_MTU) {
IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
IPV6_MIN_MTU);
@@ -989,6 +1008,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
}
}

+static int
+ipvs_gue_encap(struct net *net, struct sk_buff *skb,
+ struct ip_vs_conn *cp, __u8 *next_protocol)
+{
+ __be16 dport;
+ __be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
+ struct udphdr *udph; /* Our new UDP header */
+ struct guehdr *gueh; /* Our new GUE header */
+
+ skb_push(skb, sizeof(struct guehdr));
+
+ gueh = (struct guehdr *)skb->data;
+
+ gueh->control = 0;
+ gueh->version = 0;
+ gueh->hlen = 0;
+ gueh->flags = 0;
+ gueh->proto_ctype = *next_protocol;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ udph = udp_hdr(skb);
+
+ dport = cp->dest->tun_port;
+ udph->dest = dport;
+ udph->source = sport;
+ udph->len = htons(skb->len);
+ udph->check = 0;
+
+ *next_protocol = IPPROTO_UDP;
+
+ return 0;
+}
+
/*
* IP Tunneling transmitter
*
@@ -1025,6 +1079,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
+ int tun_type, gso_type;

EnterFunction(10);

@@ -1046,6 +1101,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

+ tun_type = cp->dest->tun_type;
+
+ switch (tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+ break;
+ default:
+ break;
+ }
+
/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
@@ -1054,11 +1119,30 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (IS_ERR(skb))
goto tx_error;

- if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
+ switch (tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ gso_type = SKB_GSO_UDP_TUNNEL;
+ break;
+ default:
+ gso_type = __tun_gso_type_mask(AF_INET, cp->af);
+ break;
+ }
+
+ if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;

skb->transport_header = skb->network_header;

+ skb_set_inner_ipproto(skb, next_protocol);
+
+ switch (tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ ipvs_gue_encap(net, skb, cp, &next_protocol);
+ break;
+ default:
+ break;
+ }
+
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1102,6 +1186,8 @@ int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
+ struct netns_ipvs *ipvs = cp->ipvs;
+ struct net *net = ipvs->net;
struct rt6_info *rt; /* Route to the other host */
struct in6_addr saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
@@ -1112,10 +1198,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
+ int tun_type, gso_type;

EnterFunction(10);

- local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
&cp->daddr.in6,
&saddr, ipvsh, 1,
IP_VS_RT_MODE_LOCAL |
@@ -1134,17 +1221,46 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

+ tun_type = cp->dest->tun_type;
+
+ switch (tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+ break;
+ default:
+ break;
+ }
+
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, &payload_len,
&dsfield, &ttl, NULL);
if (IS_ERR(skb))
goto tx_error;

- if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
+ switch (tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ gso_type = SKB_GSO_UDP_TUNNEL;
+ break;
+ default:
+ gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
+ break;
+ }
+
+ if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;

skb->transport_header = skb->network_header;

+ skb_set_inner_ipproto(skb, next_protocol);
+
+ switch (tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ ipvs_gue_encap(net, skb, cp, &next_protocol);
+ break;
+ default:
+ break;
+ }
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1167,7 +1283,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,

ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
- ip6_local_out(cp->ipvs->net, skb->sk, skb);
+ ip6_local_out(net, skb->sk, skb);
else if (ret == NF_DROP)
kfree_skb(skb);

--
2.21.0

2019-03-21 20:32:10

by Julian Anastasov

[permalink] [raw]

Subject: Re: [PATCH v5] ipvs: allow tunneling with gue encapsulation

Hello,

On Tue, 19 Mar 2019, Jacky Hu wrote:

> ipip packets are blocked in some public cloud environments, this patch
> allows gue encapsulation with the tunneling method, which would make
> tunneling working in those environments.
>
> Signed-off-by: Jacky Hu <[email protected]>

The patch looks good to me, thanks!

Acked-by: Julian Anastasov <[email protected]>

I plan to add the needed ICMP-error support for the new tunnel
type in the following days, as a followup patch.

> ---
> include/net/ip_vs.h | 5 ++
> include/uapi/linux/ip_vs.h | 11 +++
> net/netfilter/ipvs/ip_vs_ctl.c | 43 ++++++++++-
> net/netfilter/ipvs/ip_vs_xmit.c | 124 ++++++++++++++++++++++++++++++--
> 4 files changed, 178 insertions(+), 5 deletions(-)
>
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index a0d2e0bb9a94..cdc7b621930d 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -603,6 +603,9 @@ struct ip_vs_dest_user_kern {
>
> /* Address family of addr */
> u16 af;
> +
> + u16 tun_type; /* tunnel type */
> + __be16 tun_port; /* tunnel port */
> };
>
>
> @@ -663,6 +666,8 @@ struct ip_vs_dest {
> atomic_t conn_flags; /* flags to copy to conn */
> atomic_t weight; /* server weight */
> atomic_t last_weight; /* server latest weight */
> + __u16 tun_type; /* tunnel type */
> + __be16 tun_port; /* tunnel port */
>
> refcount_t refcnt; /* reference counter */
> struct ip_vs_stats stats; /* statistics */
> diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> index 1c916b2f89dc..e34f436fc79d 100644
> --- a/include/uapi/linux/ip_vs.h
> +++ b/include/uapi/linux/ip_vs.h
> @@ -124,6 +124,13 @@
>
> #define IP_VS_PEDATA_MAXLEN 255
>
> +/* Tunnel types */
> +enum {
> + IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */
> + IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */
> + IP_VS_CONN_F_TUNNEL_TYPE_MAX,
> +};
> +
> /*
> * The struct ip_vs_service_user and struct ip_vs_dest_user are
> * used to set IPVS rules through setsockopt.
> @@ -392,6 +399,10 @@ enum {
>
> IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */
>
> + IPVS_DEST_ATTR_TUN_TYPE, /* tunnel type */
> +
> + IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */
> +
> __IPVS_DEST_ATTR_MAX,
> };
>
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index 432141f04af3..ddee6266b78b 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -830,6 +830,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
> conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
> conn_flags |= IP_VS_CONN_F_INACTIVE;
>
> + /* set the tunnel info */
> + dest->tun_type = udest->tun_type;
> + dest->tun_port = udest->tun_port;
> +
> /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
> if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
> conn_flags |= IP_VS_CONN_F_NOOUTPUT;
> @@ -980,6 +984,17 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
> return -ERANGE;
> }
>
> + switch (udest->tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + if (udest->tun_port == 0) {
> + pr_err("%s(): tunnel port is zero\n", __func__);
> + return -EINVAL;
> + }
> + break;
> + default:
> + break;
> + }
> +
> ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
>
> /* We use function that requires RCU lock */
> @@ -1044,6 +1059,17 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
> return -ERANGE;
> }
>
> + switch (udest->tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + if (udest->tun_port == 0) {
> + pr_err("%s(): tunnel port is zero\n", __func__);
> + return -EINVAL;
> + }
> + break;
> + default:
> + break;
> + }
> +
> ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
>
> /* We use function that requires RCU lock */
> @@ -2310,6 +2336,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
> udest->u_threshold = udest_compat->u_threshold;
> udest->l_threshold = udest_compat->l_threshold;
> udest->af = AF_INET;
> + udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
> }
>
> static int
> @@ -2869,6 +2896,8 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
> [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
> [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
> [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
> + [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
> + [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
> };
>
> static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
> @@ -3172,6 +3201,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
> IP_VS_CONN_F_FWD_MASK)) ||
> nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
> atomic_read(&dest->weight)) ||
> + nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
> + dest->tun_type) ||
> + nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
> + dest->tun_port) ||
> nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
> nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
> nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
> @@ -3294,12 +3327,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
> /* If a full entry was requested, check for the additional fields */
> if (full_entry) {
> struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
> - *nla_l_thresh;
> + *nla_l_thresh, *nla_tun_type, *nla_tun_port;
>
> nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
> nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
> nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
> nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
> + nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
> + nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
>
> if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
> return -EINVAL;
> @@ -3309,6 +3344,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
> udest->weight = nla_get_u32(nla_weight);
> udest->u_threshold = nla_get_u32(nla_u_thresh);
> udest->l_threshold = nla_get_u32(nla_l_thresh);
> +
> + if (nla_tun_type)
> + udest->tun_type = nla_get_u8(nla_tun_type);
> +
> + if (nla_tun_port)
> + udest->tun_port = nla_get_be16(nla_tun_port);
> }
>
> return 0;
> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
> index 473cce2a5231..730e108b9f36 100644
> --- a/net/netfilter/ipvs/ip_vs_xmit.c
> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> @@ -32,6 +32,7 @@
> #include <linux/slab.h>
> #include <linux/tcp.h> /* for tcphdr */
> #include <net/ip.h>
> +#include <net/gue.h>
> #include <net/tcp.h> /* for csum_tcpudp_magic */
> #include <net/udp.h>
> #include <net/icmp.h> /* for icmp_send */
> @@ -382,6 +383,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
> mtu = dst_mtu(&rt->dst);
> } else {
> mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
> + if (!dest)
> + goto err_put;
> + switch (dest->tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
> + break;
> + default:
> + break;
> + }
> if (mtu < 68) {
> IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
> goto err_put;
> @@ -533,6 +543,15 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
> mtu = dst_mtu(&rt->dst);
> else {
> mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> + if (!dest)
> + goto err_put;
> + switch (dest->tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
> + break;
> + default:
> + break;
> + }
> if (mtu < IPV6_MIN_MTU) {
> IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> IPV6_MIN_MTU);
> @@ -989,6 +1008,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
> }
> }
>
> +static int
> +ipvs_gue_encap(struct net *net, struct sk_buff *skb,
> + struct ip_vs_conn *cp, __u8 *next_protocol)
> +{
> + __be16 dport;
> + __be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
> + struct udphdr *udph; /* Our new UDP header */
> + struct guehdr *gueh; /* Our new GUE header */
> +
> + skb_push(skb, sizeof(struct guehdr));
> +
> + gueh = (struct guehdr *)skb->data;
> +
> + gueh->control = 0;
> + gueh->version = 0;
> + gueh->hlen = 0;
> + gueh->flags = 0;
> + gueh->proto_ctype = *next_protocol;
> +
> + skb_push(skb, sizeof(struct udphdr));
> + skb_reset_transport_header(skb);
> +
> + udph = udp_hdr(skb);
> +
> + dport = cp->dest->tun_port;
> + udph->dest = dport;
> + udph->source = sport;
> + udph->len = htons(skb->len);
> + udph->check = 0;
> +
> + *next_protocol = IPPROTO_UDP;
> +
> + return 0;
> +}
> +
> /*
> * IP Tunneling transmitter
> *
> @@ -1025,6 +1079,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> struct iphdr *iph; /* Our new IP header */
> unsigned int max_headroom; /* The extra header space needed */
> int ret, local;
> + int tun_type, gso_type;
>
> EnterFunction(10);
>
> @@ -1046,6 +1101,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> */
> max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
>
> + tun_type = cp->dest->tun_type;
> +
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
> + break;
> + default:
> + break;
> + }
> +
> /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
> dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
> skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
> @@ -1054,11 +1119,30 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> if (IS_ERR(skb))
> goto tx_error;
>
> - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + gso_type = SKB_GSO_UDP_TUNNEL;
> + break;
> + default:
> + gso_type = __tun_gso_type_mask(AF_INET, cp->af);
> + break;
> + }
> +
> + if (iptunnel_handle_offloads(skb, gso_type))
> goto tx_error;
>
> skb->transport_header = skb->network_header;
>
> + skb_set_inner_ipproto(skb, next_protocol);
> +
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + ipvs_gue_encap(net, skb, cp, &next_protocol);
> + break;
> + default:
> + break;
> + }
> +
> skb_push(skb, sizeof(struct iphdr));
> skb_reset_network_header(skb);
> memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> @@ -1102,6 +1186,8 @@ int
> ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
> {
> + struct netns_ipvs *ipvs = cp->ipvs;
> + struct net *net = ipvs->net;
> struct rt6_info *rt; /* Route to the other host */
> struct in6_addr saddr; /* Source for tunnel */
> struct net_device *tdev; /* Device to other host */
> @@ -1112,10 +1198,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> struct ipv6hdr *iph; /* Our new IP header */
> unsigned int max_headroom; /* The extra header space needed */
> int ret, local;
> + int tun_type, gso_type;
>
> EnterFunction(10);
>
> - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
> + local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
> &cp->daddr.in6,
> &saddr, ipvsh, 1,
> IP_VS_RT_MODE_LOCAL |
> @@ -1134,17 +1221,46 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> */
> max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
>
> + tun_type = cp->dest->tun_type;
> +
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
> + break;
> + default:
> + break;
> + }
> +
> skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
> &next_protocol, &payload_len,
> &dsfield, &ttl, NULL);
> if (IS_ERR(skb))
> goto tx_error;
>
> - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + gso_type = SKB_GSO_UDP_TUNNEL;
> + break;
> + default:
> + gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
> + break;
> + }
> +
> + if (iptunnel_handle_offloads(skb, gso_type))
> goto tx_error;
>
> skb->transport_header = skb->network_header;
>
> + skb_set_inner_ipproto(skb, next_protocol);
> +
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + ipvs_gue_encap(net, skb, cp, &next_protocol);
> + break;
> + default:
> + break;
> + }
> +
> skb_push(skb, sizeof(struct ipv6hdr));
> skb_reset_network_header(skb);
> memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> @@ -1167,7 +1283,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
>
> ret = ip_vs_tunnel_xmit_prepare(skb, cp);
> if (ret == NF_ACCEPT)
> - ip6_local_out(cp->ipvs->net, skb->sk, skb);
> + ip6_local_out(net, skb->sk, skb);
> else if (ret == NF_DROP)
> kfree_skb(skb);
>
> --
> 2.21.0

Regards

--
Julian Anastasov <[email protected]>

2019-03-25 10:47:41

by Simon Horman

[permalink] [raw]

Subject: Re: [PATCH v5] ipvs: allow tunneling with gue encapsulation

On Tue, Mar 19, 2019 at 01:26:55PM +0800, Jacky Hu wrote:
> ipip packets are blocked in some public cloud environments, this patch
> allows gue encapsulation with the tunneling method, which would make
> tunneling working in those environments.
>
> Signed-off-by: Jacky Hu <[email protected]>
> ---
> include/net/ip_vs.h | 5 ++
> include/uapi/linux/ip_vs.h | 11 +++
> net/netfilter/ipvs/ip_vs_ctl.c | 43 ++++++++++-
> net/netfilter/ipvs/ip_vs_xmit.c | 124 ++++++++++++++++++++++++++++++--
> 4 files changed, 178 insertions(+), 5 deletions(-)
>
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index a0d2e0bb9a94..cdc7b621930d 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -603,6 +603,9 @@ struct ip_vs_dest_user_kern {
>
> /* Address family of addr */
> u16 af;
> +
> + u16 tun_type; /* tunnel type */
> + __be16 tun_port; /* tunnel port */
> };
>
>
> @@ -663,6 +666,8 @@ struct ip_vs_dest {
> atomic_t conn_flags; /* flags to copy to conn */
> atomic_t weight; /* server weight */
> atomic_t last_weight; /* server latest weight */
> + __u16 tun_type; /* tunnel type */
> + __be16 tun_port; /* tunnel port */
>
> refcount_t refcnt; /* reference counter */
> struct ip_vs_stats stats; /* statistics */
> diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> index 1c916b2f89dc..e34f436fc79d 100644
> --- a/include/uapi/linux/ip_vs.h
> +++ b/include/uapi/linux/ip_vs.h
> @@ -124,6 +124,13 @@
>
> #define IP_VS_PEDATA_MAXLEN 255
>
> +/* Tunnel types */
> +enum {
> + IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */
> + IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */
> + IP_VS_CONN_F_TUNNEL_TYPE_MAX,
> +};
> +
> /*
> * The struct ip_vs_service_user and struct ip_vs_dest_user are
> * used to set IPVS rules through setsockopt.
> @@ -392,6 +399,10 @@ enum {
>
> IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */
>
> + IPVS_DEST_ATTR_TUN_TYPE, /* tunnel type */
> +
> + IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */
> +
> __IPVS_DEST_ATTR_MAX,
> };
>
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index 432141f04af3..ddee6266b78b 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -830,6 +830,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
> conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
> conn_flags |= IP_VS_CONN_F_INACTIVE;
>
> + /* set the tunnel info */
> + dest->tun_type = udest->tun_type;
> + dest->tun_port = udest->tun_port;
> +
> /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
> if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
> conn_flags |= IP_VS_CONN_F_NOOUTPUT;
> @@ -980,6 +984,17 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
> return -ERANGE;
> }
>
> + switch (udest->tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:

Given that the tun_type check acts on an integer, as opposed to an enum,
I don't see that using a switch statement gives us much, here or
elsewhere in the patch, other than extra verbosity.

Did you consider simply using an if statement?

> + if (udest->tun_port == 0) {
> + pr_err("%s(): tunnel port is zero\n", __func__);
> + return -EINVAL;
> + }
> + break;
> + default:
> + break;
> + }
> +
> ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
>
> /* We use function that requires RCU lock */
> @@ -1044,6 +1059,17 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
> return -ERANGE;
> }
>
> + switch (udest->tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + if (udest->tun_port == 0) {
> + pr_err("%s(): tunnel port is zero\n", __func__);
> + return -EINVAL;
> + }
> + break;
> + default:
> + break;
> + }
> +
> ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
>
> /* We use function that requires RCU lock */
> @@ -2310,6 +2336,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
> udest->u_threshold = udest_compat->u_threshold;
> udest->l_threshold = udest_compat->l_threshold;
> udest->af = AF_INET;
> + udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
> }
>
> static int
> @@ -2869,6 +2896,8 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
> [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
> [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
> [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
> + [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
> + [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
> };
>
> static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
> @@ -3172,6 +3201,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
> IP_VS_CONN_F_FWD_MASK)) ||
> nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
> atomic_read(&dest->weight)) ||
> + nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
> + dest->tun_type) ||
> + nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
> + dest->tun_port) ||
> nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
> nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
> nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
> @@ -3294,12 +3327,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
> /* If a full entry was requested, check for the additional fields */
> if (full_entry) {
> struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
> - *nla_l_thresh;
> + *nla_l_thresh, *nla_tun_type, *nla_tun_port;
>
> nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
> nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
> nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
> nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
> + nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
> + nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
>
> if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
> return -EINVAL;
> @@ -3309,6 +3344,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
> udest->weight = nla_get_u32(nla_weight);
> udest->u_threshold = nla_get_u32(nla_u_thresh);
> udest->l_threshold = nla_get_u32(nla_l_thresh);
> +
> + if (nla_tun_type)
> + udest->tun_type = nla_get_u8(nla_tun_type);
> +
> + if (nla_tun_port)
> + udest->tun_port = nla_get_be16(nla_tun_port);
> }
>
> return 0;
> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
> index 473cce2a5231..730e108b9f36 100644
> --- a/net/netfilter/ipvs/ip_vs_xmit.c
> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> @@ -32,6 +32,7 @@
> #include <linux/slab.h>
> #include <linux/tcp.h> /* for tcphdr */
> #include <net/ip.h>
> +#include <net/gue.h>
> #include <net/tcp.h> /* for csum_tcpudp_magic */
> #include <net/udp.h>
> #include <net/icmp.h> /* for icmp_send */
> @@ -382,6 +383,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
> mtu = dst_mtu(&rt->dst);
> } else {
> mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
> + if (!dest)
> + goto err_put;
> + switch (dest->tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
> + break;
> + default:
> + break;
> + }
> if (mtu < 68) {
> IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
> goto err_put;
> @@ -533,6 +543,15 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
> mtu = dst_mtu(&rt->dst);
> else {
> mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> + if (!dest)
> + goto err_put;
> + switch (dest->tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
> + break;
> + default:
> + break;
> + }
> if (mtu < IPV6_MIN_MTU) {
> IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> IPV6_MIN_MTU);
> @@ -989,6 +1008,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
> }
> }
>
> +static int
> +ipvs_gue_encap(struct net *net, struct sk_buff *skb,
> + struct ip_vs_conn *cp, __u8 *next_protocol)
> +{
> + __be16 dport;
> + __be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
> + struct udphdr *udph; /* Our new UDP header */
> + struct guehdr *gueh; /* Our new GUE header */
> +
> + skb_push(skb, sizeof(struct guehdr));
> +
> + gueh = (struct guehdr *)skb->data;
> +
> + gueh->control = 0;
> + gueh->version = 0;
> + gueh->hlen = 0;
> + gueh->flags = 0;
> + gueh->proto_ctype = *next_protocol;
> +
> + skb_push(skb, sizeof(struct udphdr));
> + skb_reset_transport_header(skb);
> +
> + udph = udp_hdr(skb);
> +
> + dport = cp->dest->tun_port;
> + udph->dest = dport;
> + udph->source = sport;
> + udph->len = htons(skb->len);
> + udph->check = 0;
> +
> + *next_protocol = IPPROTO_UDP;
> +
> + return 0;
> +}
> +
> /*
> * IP Tunneling transmitter
> *
> @@ -1025,6 +1079,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> struct iphdr *iph; /* Our new IP header */
> unsigned int max_headroom; /* The extra header space needed */
> int ret, local;
> + int tun_type, gso_type;
>
> EnterFunction(10);
>
> @@ -1046,6 +1101,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> */
> max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
>
> + tun_type = cp->dest->tun_type;
> +
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
> + break;
> + default:
> + break;
> + }
> +
> /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
> dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
> skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
> @@ -1054,11 +1119,30 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> if (IS_ERR(skb))
> goto tx_error;
>
> - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + gso_type = SKB_GSO_UDP_TUNNEL;
> + break;
> + default:
> + gso_type = __tun_gso_type_mask(AF_INET, cp->af);
> + break;
> + }
> +
> + if (iptunnel_handle_offloads(skb, gso_type))
> goto tx_error;
>
> skb->transport_header = skb->network_header;
>
> + skb_set_inner_ipproto(skb, next_protocol);
> +
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + ipvs_gue_encap(net, skb, cp, &next_protocol);
> + break;
> + default:
> + break;
> + }
> +
> skb_push(skb, sizeof(struct iphdr));
> skb_reset_network_header(skb);
> memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> @@ -1102,6 +1186,8 @@ int
> ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
> {
> + struct netns_ipvs *ipvs = cp->ipvs;
> + struct net *net = ipvs->net;
> struct rt6_info *rt; /* Route to the other host */
> struct in6_addr saddr; /* Source for tunnel */
> struct net_device *tdev; /* Device to other host */
> @@ -1112,10 +1198,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> struct ipv6hdr *iph; /* Our new IP header */
> unsigned int max_headroom; /* The extra header space needed */
> int ret, local;
> + int tun_type, gso_type;
>
> EnterFunction(10);
>
> - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
> + local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
> &cp->daddr.in6,
> &saddr, ipvsh, 1,
> IP_VS_RT_MODE_LOCAL |
> @@ -1134,17 +1221,46 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> */
> max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
>
> + tun_type = cp->dest->tun_type;
> +
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
> + break;
> + default:
> + break;
> + }
> +
> skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
> &next_protocol, &payload_len,
> &dsfield, &ttl, NULL);
> if (IS_ERR(skb))
> goto tx_error;
>
> - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + gso_type = SKB_GSO_UDP_TUNNEL;
> + break;
> + default:
> + gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
> + break;
> + }
> +
> + if (iptunnel_handle_offloads(skb, gso_type))
> goto tx_error;
>
> skb->transport_header = skb->network_header;
>
> + skb_set_inner_ipproto(skb, next_protocol);
> +
> + switch (tun_type) {
> + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> + ipvs_gue_encap(net, skb, cp, &next_protocol);
> + break;
> + default:
> + break;
> + }
> +
> skb_push(skb, sizeof(struct ipv6hdr));
> skb_reset_network_header(skb);
> memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> @@ -1167,7 +1283,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
>
> ret = ip_vs_tunnel_xmit_prepare(skb, cp);
> if (ret == NF_ACCEPT)
> - ip6_local_out(cp->ipvs->net, skb->sk, skb);
> + ip6_local_out(net, skb->sk, skb);
> else if (ret == NF_DROP)
> kfree_skb(skb);
>
> --
> 2.21.0
>

2019-03-25 12:57:12

by Jacky Hu

[permalink] [raw]

Subject: Re: [PATCH v5] ipvs: allow tunneling with gue encapsulation

On Mon, Mar 25, 2019 at 11:46:40AM +0100, Simon Horman wrote:
> On Tue, Mar 19, 2019 at 01:26:55PM +0800, Jacky Hu wrote:
> > ipip packets are blocked in some public cloud environments, this patch
> > allows gue encapsulation with the tunneling method, which would make
> > tunneling working in those environments.
> >
> > Signed-off-by: Jacky Hu <[email protected]>
> > ---
> > include/net/ip_vs.h | 5 ++
> > include/uapi/linux/ip_vs.h | 11 +++
> > net/netfilter/ipvs/ip_vs_ctl.c | 43 ++++++++++-
> > net/netfilter/ipvs/ip_vs_xmit.c | 124 ++++++++++++++++++++++++++++++--
> > 4 files changed, 178 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> > index a0d2e0bb9a94..cdc7b621930d 100644
> > --- a/include/net/ip_vs.h
> > +++ b/include/net/ip_vs.h
> > @@ -603,6 +603,9 @@ struct ip_vs_dest_user_kern {
> >
> > /* Address family of addr */
> > u16 af;
> > +
> > + u16 tun_type; /* tunnel type */
> > + __be16 tun_port; /* tunnel port */
> > };
> >
> >
> > @@ -663,6 +666,8 @@ struct ip_vs_dest {
> > atomic_t conn_flags; /* flags to copy to conn */
> > atomic_t weight; /* server weight */
> > atomic_t last_weight; /* server latest weight */
> > + __u16 tun_type; /* tunnel type */
> > + __be16 tun_port; /* tunnel port */
> >
> > refcount_t refcnt; /* reference counter */
> > struct ip_vs_stats stats; /* statistics */
> > diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> > index 1c916b2f89dc..e34f436fc79d 100644
> > --- a/include/uapi/linux/ip_vs.h
> > +++ b/include/uapi/linux/ip_vs.h
> > @@ -124,6 +124,13 @@
> >
> > #define IP_VS_PEDATA_MAXLEN 255
> >
> > +/* Tunnel types */
> > +enum {
> > + IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */
> > + IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */
> > + IP_VS_CONN_F_TUNNEL_TYPE_MAX,
> > +};
> > +
> > /*
> > * The struct ip_vs_service_user and struct ip_vs_dest_user are
> > * used to set IPVS rules through setsockopt.
> > @@ -392,6 +399,10 @@ enum {
> >
> > IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */
> >
> > + IPVS_DEST_ATTR_TUN_TYPE, /* tunnel type */
> > +
> > + IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */
> > +
> > __IPVS_DEST_ATTR_MAX,
> > };
> >
> > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> > index 432141f04af3..ddee6266b78b 100644
> > --- a/net/netfilter/ipvs/ip_vs_ctl.c
> > +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> > @@ -830,6 +830,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
> > conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
> > conn_flags |= IP_VS_CONN_F_INACTIVE;
> >
> > + /* set the tunnel info */
> > + dest->tun_type = udest->tun_type;
> > + dest->tun_port = udest->tun_port;
> > +
> > /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
> > if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
> > conn_flags |= IP_VS_CONN_F_NOOUTPUT;
> > @@ -980,6 +984,17 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
> > return -ERANGE;
> > }
> >
> > + switch (udest->tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
>
> Given that tun_type check acts on an integer, as opposed to an enum,
> I don't see that using a switch statement gives us much here,
> and throughout the patch, other than extra verbosity.
>
> Did you consider simply using an if statement?
>
I was thinking that IP_VS_CONN_F_TUNNEL_TYPE_FOU might be introduced
later.
But yes, I can simply use an if statement if you think that's more
appropriate.
> > + if (udest->tun_port == 0) {
> > + pr_err("%s(): tunnel port is zero\n", __func__);
> > + return -EINVAL;
> > + }
> > + break;
> > + default:
> > + break;
> > + }
> > +
> > ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
> >
> > /* We use function that requires RCU lock */
> > @@ -1044,6 +1059,17 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
> > return -ERANGE;
> > }
> >
> > + switch (udest->tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + if (udest->tun_port == 0) {
> > + pr_err("%s(): tunnel port is zero\n", __func__);
> > + return -EINVAL;
> > + }
> > + break;
> > + default:
> > + break;
> > + }
> > +
> > ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
> >
> > /* We use function that requires RCU lock */
> > @@ -2310,6 +2336,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
> > udest->u_threshold = udest_compat->u_threshold;
> > udest->l_threshold = udest_compat->l_threshold;
> > udest->af = AF_INET;
> > + udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
> > }
> >
> > static int
> > @@ -2869,6 +2896,8 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
> > [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
> > [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
> > [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
> > + [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
> > + [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
> > };
> >
> > static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
> > @@ -3172,6 +3201,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
> > IP_VS_CONN_F_FWD_MASK)) ||
> > nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
> > atomic_read(&dest->weight)) ||
> > + nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
> > + dest->tun_type) ||
> > + nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
> > + dest->tun_port) ||
> > nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
> > nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
> > nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
> > @@ -3294,12 +3327,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
> > /* If a full entry was requested, check for the additional fields */
> > if (full_entry) {
> > struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
> > - *nla_l_thresh;
> > + *nla_l_thresh, *nla_tun_type, *nla_tun_port;
> >
> > nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
> > nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
> > nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
> > nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
> > + nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
> > + nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
> >
> > if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
> > return -EINVAL;
> > @@ -3309,6 +3344,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
> > udest->weight = nla_get_u32(nla_weight);
> > udest->u_threshold = nla_get_u32(nla_u_thresh);
> > udest->l_threshold = nla_get_u32(nla_l_thresh);
> > +
> > + if (nla_tun_type)
> > + udest->tun_type = nla_get_u8(nla_tun_type);
> > +
> > + if (nla_tun_port)
> > + udest->tun_port = nla_get_be16(nla_tun_port);
> > }
> >
> > return 0;
> > diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
> > index 473cce2a5231..730e108b9f36 100644
> > --- a/net/netfilter/ipvs/ip_vs_xmit.c
> > +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> > @@ -32,6 +32,7 @@
> > #include <linux/slab.h>
> > #include <linux/tcp.h> /* for tcphdr */
> > #include <net/ip.h>
> > +#include <net/gue.h>
> > #include <net/tcp.h> /* for csum_tcpudp_magic */
> > #include <net/udp.h>
> > #include <net/icmp.h> /* for icmp_send */
> > @@ -382,6 +383,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
> > mtu = dst_mtu(&rt->dst);
> > } else {
> > mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
> > + if (!dest)
> > + goto err_put;
> > + switch (dest->tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
> > + break;
> > + default:
> > + break;
> > + }
> > if (mtu < 68) {
> > IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
> > goto err_put;
> > @@ -533,6 +543,15 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
> > mtu = dst_mtu(&rt->dst);
> > else {
> > mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> > + if (!dest)
> > + goto err_put;
> > + switch (dest->tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
> > + break;
> > + default:
> > + break;
> > + }
> > if (mtu < IPV6_MIN_MTU) {
> > IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> > IPV6_MIN_MTU);
> > @@ -989,6 +1008,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
> > }
> > }
> >
> > +static int
> > +ipvs_gue_encap(struct net *net, struct sk_buff *skb,
> > + struct ip_vs_conn *cp, __u8 *next_protocol)
> > +{
> > + __be16 dport;
> > + __be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
> > + struct udphdr *udph; /* Our new UDP header */
> > + struct guehdr *gueh; /* Our new GUE header */
> > +
> > + skb_push(skb, sizeof(struct guehdr));
> > +
> > + gueh = (struct guehdr *)skb->data;
> > +
> > + gueh->control = 0;
> > + gueh->version = 0;
> > + gueh->hlen = 0;
> > + gueh->flags = 0;
> > + gueh->proto_ctype = *next_protocol;
> > +
> > + skb_push(skb, sizeof(struct udphdr));
> > + skb_reset_transport_header(skb);
> > +
> > + udph = udp_hdr(skb);
> > +
> > + dport = cp->dest->tun_port;
> > + udph->dest = dport;
> > + udph->source = sport;
> > + udph->len = htons(skb->len);
> > + udph->check = 0;
> > +
> > + *next_protocol = IPPROTO_UDP;
> > +
> > + return 0;
> > +}
> > +
> > /*
> > * IP Tunneling transmitter
> > *
> > @@ -1025,6 +1079,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> > struct iphdr *iph; /* Our new IP header */
> > unsigned int max_headroom; /* The extra header space needed */
> > int ret, local;
> > + int tun_type, gso_type;
> >
> > EnterFunction(10);
> >
> > @@ -1046,6 +1101,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> > */
> > max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
> >
> > + tun_type = cp->dest->tun_type;
> > +
> > + switch (tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
> > + break;
> > + default:
> > + break;
> > + }
> > +
> > /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
> > dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
> > skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
> > @@ -1054,11 +1119,30 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> > if (IS_ERR(skb))
> > goto tx_error;
> >
> > - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
> > + switch (tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + gso_type = SKB_GSO_UDP_TUNNEL;
> > + break;
> > + default:
> > + gso_type = __tun_gso_type_mask(AF_INET, cp->af);
> > + break;
> > + }
> > +
> > + if (iptunnel_handle_offloads(skb, gso_type))
> > goto tx_error;
> >
> > skb->transport_header = skb->network_header;
> >
> > + skb_set_inner_ipproto(skb, next_protocol);
> > +
> > + switch (tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + ipvs_gue_encap(net, skb, cp, &next_protocol);
> > + break;
> > + default:
> > + break;
> > + }
> > +
> > skb_push(skb, sizeof(struct iphdr));
> > skb_reset_network_header(skb);
> > memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> > @@ -1102,6 +1186,8 @@ int
> > ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> > struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
> > {
> > + struct netns_ipvs *ipvs = cp->ipvs;
> > + struct net *net = ipvs->net;
> > struct rt6_info *rt; /* Route to the other host */
> > struct in6_addr saddr; /* Source for tunnel */
> > struct net_device *tdev; /* Device to other host */
> > @@ -1112,10 +1198,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> > struct ipv6hdr *iph; /* Our new IP header */
> > unsigned int max_headroom; /* The extra header space needed */
> > int ret, local;
> > + int tun_type, gso_type;
> >
> > EnterFunction(10);
> >
> > - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
> > + local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
> > &cp->daddr.in6,
> > &saddr, ipvsh, 1,
> > IP_VS_RT_MODE_LOCAL |
> > @@ -1134,17 +1221,46 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> > */
> > max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
> >
> > + tun_type = cp->dest->tun_type;
> > +
> > + switch (tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
> > + break;
> > + default:
> > + break;
> > + }
> > +
> > skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
> > &next_protocol, &payload_len,
> > &dsfield, &ttl, NULL);
> > if (IS_ERR(skb))
> > goto tx_error;
> >
> > - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
> > + switch (tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + gso_type = SKB_GSO_UDP_TUNNEL;
> > + break;
> > + default:
> > + gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
> > + break;
> > + }
> > +
> > + if (iptunnel_handle_offloads(skb, gso_type))
> > goto tx_error;
> >
> > skb->transport_header = skb->network_header;
> >
> > + skb_set_inner_ipproto(skb, next_protocol);
> > +
> > + switch (tun_type) {
> > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > + ipvs_gue_encap(net, skb, cp, &next_protocol);
> > + break;
> > + default:
> > + break;
> > + }
> > +
> > skb_push(skb, sizeof(struct ipv6hdr));
> > skb_reset_network_header(skb);
> > memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> > @@ -1167,7 +1283,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> >
> > ret = ip_vs_tunnel_xmit_prepare(skb, cp);
> > if (ret == NF_ACCEPT)
> > - ip6_local_out(cp->ipvs->net, skb->sk, skb);
> > + ip6_local_out(net, skb->sk, skb);
> > else if (ret == NF_DROP)
> > kfree_skb(skb);
> >
> > --
> > 2.21.0
> >

2019-03-25 16:38:24

by Simon Horman

[permalink] [raw]

Subject: Re: [PATCH v5] ipvs: allow tunneling with gue encapsulation

On Mon, Mar 25, 2019 at 08:56:08PM +0800, Jacky Hu wrote:
> On Mon, Mar 25, 2019 at 11:46:40AM +0100, Simon Horman wrote:
> > On Tue, Mar 19, 2019 at 01:26:55PM +0800, Jacky Hu wrote:
> > > ipip packets are blocked in some public cloud environments, this patch
> > > allows gue encapsulation with the tunneling method, which would make
> > > tunneling working in those environments.
> > >
> > > Signed-off-by: Jacky Hu <[email protected]>
> > > ---
> > > include/net/ip_vs.h | 5 ++
> > > include/uapi/linux/ip_vs.h | 11 +++
> > > net/netfilter/ipvs/ip_vs_ctl.c | 43 ++++++++++-
> > > net/netfilter/ipvs/ip_vs_xmit.c | 124 ++++++++++++++++++++++++++++++--
> > > 4 files changed, 178 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> > > index a0d2e0bb9a94..cdc7b621930d 100644
> > > --- a/include/net/ip_vs.h
> > > +++ b/include/net/ip_vs.h
> > > @@ -603,6 +603,9 @@ struct ip_vs_dest_user_kern {
> > >
> > > /* Address family of addr */
> > > u16 af;
> > > +
> > > + u16 tun_type; /* tunnel type */
> > > + __be16 tun_port; /* tunnel port */
> > > };
> > >
> > >
> > > @@ -663,6 +666,8 @@ struct ip_vs_dest {
> > > atomic_t conn_flags; /* flags to copy to conn */
> > > atomic_t weight; /* server weight */
> > > atomic_t last_weight; /* server latest weight */
> > > + __u16 tun_type; /* tunnel type */
> > > + __be16 tun_port; /* tunnel port */
> > >
> > > refcount_t refcnt; /* reference counter */
> > > struct ip_vs_stats stats; /* statistics */
> > > diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> > > index 1c916b2f89dc..e34f436fc79d 100644
> > > --- a/include/uapi/linux/ip_vs.h
> > > +++ b/include/uapi/linux/ip_vs.h
> > > @@ -124,6 +124,13 @@
> > >
> > > #define IP_VS_PEDATA_MAXLEN 255
> > >
> > > +/* Tunnel types */
> > > +enum {
> > > + IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */
> > > + IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */
> > > + IP_VS_CONN_F_TUNNEL_TYPE_MAX,
> > > +};
> > > +
> > > /*
> > > * The struct ip_vs_service_user and struct ip_vs_dest_user are
> > > * used to set IPVS rules through setsockopt.
> > > @@ -392,6 +399,10 @@ enum {
> > >
> > > IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */
> > >
> > > + IPVS_DEST_ATTR_TUN_TYPE, /* tunnel type */
> > > +
> > > + IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */
> > > +
> > > __IPVS_DEST_ATTR_MAX,
> > > };
> > >
> > > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> > > index 432141f04af3..ddee6266b78b 100644
> > > --- a/net/netfilter/ipvs/ip_vs_ctl.c
> > > +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> > > @@ -830,6 +830,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
> > > conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
> > > conn_flags |= IP_VS_CONN_F_INACTIVE;
> > >
> > > + /* set the tunnel info */
> > > + dest->tun_type = udest->tun_type;
> > > + dest->tun_port = udest->tun_port;
> > > +
> > > /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
> > > if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
> > > conn_flags |= IP_VS_CONN_F_NOOUTPUT;
> > > @@ -980,6 +984,17 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
> > > return -ERANGE;
> > > }
> > >
> > > + switch (udest->tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> >
> > Given that tun_type check acts on an integer, as opposed to an enum,
> > I don't see that using a switch statement gives us much here,
> > and throughout the patch, other than extra verbosity.
> >
> > Did you consider simply using an if statement?
> >
> I was thinking that IP_VS_CONN_F_TUNNEL_TYPE_FOU might be introduced
> later.
> But yes, I can simply use an if statement if you think that's more
> appropriate.

Thanks, I think that would be best.

We can always change the code to use a switch statement later if
more cases need to be handled.

> > > + if (udest->tun_port == 0) {
> > > + pr_err("%s(): tunnel port is zero\n", __func__);
> > > + return -EINVAL;
> > > + }
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > +
> > > ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
> > >
> > > /* We use function that requires RCU lock */
> > > @@ -1044,6 +1059,17 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
> > > return -ERANGE;
> > > }
> > >
> > > + switch (udest->tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + if (udest->tun_port == 0) {
> > > + pr_err("%s(): tunnel port is zero\n", __func__);
> > > + return -EINVAL;
> > > + }
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > +
> > > ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
> > >
> > > /* We use function that requires RCU lock */
> > > @@ -2310,6 +2336,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
> > > udest->u_threshold = udest_compat->u_threshold;
> > > udest->l_threshold = udest_compat->l_threshold;
> > > udest->af = AF_INET;
> > > + udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
> > > }
> > >
> > > static int
> > > @@ -2869,6 +2896,8 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
> > > [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
> > > [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
> > > [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
> > > + [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
> > > + [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
> > > };
> > >
> > > static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
> > > @@ -3172,6 +3201,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
> > > IP_VS_CONN_F_FWD_MASK)) ||
> > > nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
> > > atomic_read(&dest->weight)) ||
> > > + nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
> > > + dest->tun_type) ||
> > > + nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
> > > + dest->tun_port) ||
> > > nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
> > > nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
> > > nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
> > > @@ -3294,12 +3327,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
> > > /* If a full entry was requested, check for the additional fields */
> > > if (full_entry) {
> > > struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
> > > - *nla_l_thresh;
> > > + *nla_l_thresh, *nla_tun_type, *nla_tun_port;
> > >
> > > nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
> > > nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
> > > nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
> > > nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
> > > + nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
> > > + nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
> > >
> > > if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
> > > return -EINVAL;
> > > @@ -3309,6 +3344,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
> > > udest->weight = nla_get_u32(nla_weight);
> > > udest->u_threshold = nla_get_u32(nla_u_thresh);
> > > udest->l_threshold = nla_get_u32(nla_l_thresh);
> > > +
> > > + if (nla_tun_type)
> > > + udest->tun_type = nla_get_u8(nla_tun_type);
> > > +
> > > + if (nla_tun_port)
> > > + udest->tun_port = nla_get_be16(nla_tun_port);
> > > }
> > >
> > > return 0;
> > > diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
> > > index 473cce2a5231..730e108b9f36 100644
> > > --- a/net/netfilter/ipvs/ip_vs_xmit.c
> > > +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> > > @@ -32,6 +32,7 @@
> > > #include <linux/slab.h>
> > > #include <linux/tcp.h> /* for tcphdr */
> > > #include <net/ip.h>
> > > +#include <net/gue.h>
> > > #include <net/tcp.h> /* for csum_tcpudp_magic */
> > > #include <net/udp.h>
> > > #include <net/icmp.h> /* for icmp_send */
> > > @@ -382,6 +383,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
> > > mtu = dst_mtu(&rt->dst);
> > > } else {
> > > mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
> > > + if (!dest)
> > > + goto err_put;
> > > + switch (dest->tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > if (mtu < 68) {
> > > IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
> > > goto err_put;
> > > @@ -533,6 +543,15 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
> > > mtu = dst_mtu(&rt->dst);
> > > else {
> > > mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> > > + if (!dest)
> > > + goto err_put;
> > > + switch (dest->tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > if (mtu < IPV6_MIN_MTU) {
> > > IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> > > IPV6_MIN_MTU);
> > > @@ -989,6 +1008,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
> > > }
> > > }
> > >
> > > +static int
> > > +ipvs_gue_encap(struct net *net, struct sk_buff *skb,
> > > + struct ip_vs_conn *cp, __u8 *next_protocol)
> > > +{
> > > + __be16 dport;
> > > + __be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
> > > + struct udphdr *udph; /* Our new UDP header */
> > > + struct guehdr *gueh; /* Our new GUE header */
> > > +
> > > + skb_push(skb, sizeof(struct guehdr));
> > > +
> > > + gueh = (struct guehdr *)skb->data;
> > > +
> > > + gueh->control = 0;
> > > + gueh->version = 0;
> > > + gueh->hlen = 0;
> > > + gueh->flags = 0;
> > > + gueh->proto_ctype = *next_protocol;
> > > +
> > > + skb_push(skb, sizeof(struct udphdr));
> > > + skb_reset_transport_header(skb);
> > > +
> > > + udph = udp_hdr(skb);
> > > +
> > > + dport = cp->dest->tun_port;
> > > + udph->dest = dport;
> > > + udph->source = sport;
> > > + udph->len = htons(skb->len);
> > > + udph->check = 0;
> > > +
> > > + *next_protocol = IPPROTO_UDP;
> > > +
> > > + return 0;
> > > +}
> > > +
> > > /*
> > > * IP Tunneling transmitter
> > > *
> > > @@ -1025,6 +1079,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> > > struct iphdr *iph; /* Our new IP header */
> > > unsigned int max_headroom; /* The extra header space needed */
> > > int ret, local;
> > > + int tun_type, gso_type;
> > >
> > > EnterFunction(10);
> > >
> > > @@ -1046,6 +1101,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> > > */
> > > max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
> > >
> > > + tun_type = cp->dest->tun_type;
> > > +
> > > + switch (tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > +
> > > /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
> > > dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
> > > skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
> > > @@ -1054,11 +1119,30 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> > > if (IS_ERR(skb))
> > > goto tx_error;
> > >
> > > - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
> > > + switch (tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + gso_type = SKB_GSO_UDP_TUNNEL;
> > > + break;
> > > + default:
> > > + gso_type = __tun_gso_type_mask(AF_INET, cp->af);
> > > + break;
> > > + }
> > > +
> > > + if (iptunnel_handle_offloads(skb, gso_type))
> > > goto tx_error;
> > >
> > > skb->transport_header = skb->network_header;
> > >
> > > + skb_set_inner_ipproto(skb, next_protocol);
> > > +
> > > + switch (tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + ipvs_gue_encap(net, skb, cp, &next_protocol);
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > +
> > > skb_push(skb, sizeof(struct iphdr));
> > > skb_reset_network_header(skb);
> > > memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> > > @@ -1102,6 +1186,8 @@ int
> > > ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> > > struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
> > > {
> > > + struct netns_ipvs *ipvs = cp->ipvs;
> > > + struct net *net = ipvs->net;
> > > struct rt6_info *rt; /* Route to the other host */
> > > struct in6_addr saddr; /* Source for tunnel */
> > > struct net_device *tdev; /* Device to other host */
> > > @@ -1112,10 +1198,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> > > struct ipv6hdr *iph; /* Our new IP header */
> > > unsigned int max_headroom; /* The extra header space needed */
> > > int ret, local;
> > > + int tun_type, gso_type;
> > >
> > > EnterFunction(10);
> > >
> > > - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
> > > + local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
> > > &cp->daddr.in6,
> > > &saddr, ipvsh, 1,
> > > IP_VS_RT_MODE_LOCAL |
> > > @@ -1134,17 +1221,46 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> > > */
> > > max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
> > >
> > > + tun_type = cp->dest->tun_type;
> > > +
> > > + switch (tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > +
> > > skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
> > > &next_protocol, &payload_len,
> > > &dsfield, &ttl, NULL);
> > > if (IS_ERR(skb))
> > > goto tx_error;
> > >
> > > - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
> > > + switch (tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + gso_type = SKB_GSO_UDP_TUNNEL;
> > > + break;
> > > + default:
> > > + gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
> > > + break;
> > > + }
> > > +
> > > + if (iptunnel_handle_offloads(skb, gso_type))
> > > goto tx_error;
> > >
> > > skb->transport_header = skb->network_header;
> > >
> > > + skb_set_inner_ipproto(skb, next_protocol);
> > > +
> > > + switch (tun_type) {
> > > + case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
> > > + ipvs_gue_encap(net, skb, cp, &next_protocol);
> > > + break;
> > > + default:
> > > + break;
> > > + }
> > > +
> > > skb_push(skb, sizeof(struct ipv6hdr));
> > > skb_reset_network_header(skb);
> > > memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> > > @@ -1167,7 +1283,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> > >
> > > ret = ip_vs_tunnel_xmit_prepare(skb, cp);
> > > if (ret == NF_ACCEPT)
> > > - ip6_local_out(cp->ipvs->net, skb->sk, skb);
> > > + ip6_local_out(net, skb->sk, skb);
> > > else if (ret == NF_DROP)
> > > kfree_skb(skb);
> > >
> > > --
> > > 2.21.0
> > >
>