2020-08-25 16:04:04

by Ahmed Abdelsalam

[permalink] [raw]
Subject: [net-next v5 1/2] seg6: inherit DSCP of inner IPv4 packets

This patch allows SRv6 encapsulation to inherit the DSCP value of
the inner IPv4 packet.

This allows forwarding packet across the SRv6 fabric based on their
original traffic class.

The option is controlled through a sysctl (seg6_inherit_inner_ipv4_dscp).
The sysctl has to be set to 1 to enable this feature.

Signed-off-by: Ahmed Abdelsalam <[email protected]>
---
include/net/netns/ipv6.h | 1 +
net/ipv6/seg6_iptunnel.c | 37 ++++++++++++++++++++-----------------
net/ipv6/sysctl_net_ipv6.c | 9 +++++++++
3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 5ec054473d81..6ed73951f479 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -50,6 +50,7 @@ struct netns_sysctl_ipv6 {
int max_dst_opts_len;
int max_hbh_opts_len;
int seg6_flowlabel;
+ bool seg6_inherit_inner_ipv4_dscp;
bool skip_notify_on_dev_down;
};

diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 897fa59c47de..9cc168462e11 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -104,8 +104,7 @@ static void set_tun_src(struct net *net, struct net_device *dev,
}

/* Compute flowlabel for outer IPv6 header */
-static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
- struct ipv6hdr *inner_hdr)
+static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb)
{
int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel;
__be32 flowlabel = 0;
@@ -116,7 +115,7 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
hash = rol32(hash, 16);
flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
} else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
- flowlabel = ip6_flowlabel(inner_hdr);
+ flowlabel = ip6_flowlabel(ipv6_hdr(skb));
}
return flowlabel;
}
@@ -129,6 +128,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
struct ipv6hdr *hdr, *inner_hdr;
struct ipv6_sr_hdr *isrh;
int hdrlen, tot_len, err;
+ u8 tos = 0, hop_limit;
__be32 flowlabel;

hdrlen = (osrh->hdrlen + 1) << 3;
@@ -138,30 +138,33 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
if (unlikely(err))
return err;

- inner_hdr = ipv6_hdr(skb);
- flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
-
- skb_push(skb, tot_len);
- skb_reset_network_header(skb);
- skb_mac_header_rebuild(skb);
- hdr = ipv6_hdr(skb);
-
/* inherit tc, flowlabel and hlim
* hlim will be decremented in ip6_forward() afterwards and
* decapsulation will overwrite inner hlim with outer hlim
*/

+ flowlabel = seg6_make_flowlabel(net, skb);
+ hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+
if (skb->protocol == htons(ETH_P_IPV6)) {
- ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
- flowlabel);
- hdr->hop_limit = inner_hdr->hop_limit;
+ inner_hdr = ipv6_hdr(skb);
+ hop_limit = inner_hdr->hop_limit;
+ tos = ip6_tclass(ip6_flowinfo(inner_hdr));
+ } else if (skb->protocol == htons(ETH_P_IP)) {
+ if (net->ipv6.sysctl.seg6_inherit_inner_ipv4_dscp)
+ tos = ip_hdr(skb)->tos;
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
} else {
- ip6_flow_hdr(hdr, 0, flowlabel);
- hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
-
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
}

+ skb_push(skb, tot_len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+ ip6_flow_hdr(hdr, tos, flowlabel);
+ hdr->hop_limit = hop_limit;
hdr->nexthdr = NEXTHDR_ROUTING;

isrh = (void *)hdr + sizeof(*hdr);
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index fac2135aa47b..4b2cf8764524 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -159,6 +159,15 @@ static struct ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "seg6_inherit_inner_ipv4_dscp",
+ .data = &init_net.ipv6.sysctl.seg6_inherit_inner_ipv4_dscp,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
{ }
};

--
2.17.1


2020-08-25 16:47:50

by David Ahern

[permalink] [raw]
Subject: Re: [net-next v5 1/2] seg6: inherit DSCP of inner IPv4 packets

On 8/25/20 10:02 AM, Ahmed Abdelsalam wrote:
> This patch allows SRv6 encapsulation to inherit the DSCP value of
> the inner IPv4 packet.
>
> This allows forwarding packet across the SRv6 fabric based on their
> original traffic class.
>
> The option is controlled through a sysctl (seg6_inherit_inner_ipv4_dscp).
> The sysctl has to be set to 1 to enable this feature.
>

rather than adding another sysctl, can this be done as a SEG6_LOCAL
attribute and managed via seg6_local_lwt?

2020-08-25 23:46:18

by Ahmed Abdelsalam

[permalink] [raw]
Subject: Re: [net-next v5 1/2] seg6: inherit DSCP of inner IPv4 packets

On 25/08/2020 18:45, David Ahern wrote:
> On 8/25/20 10:02 AM, Ahmed Abdelsalam wrote:
>> This patch allows SRv6 encapsulation to inherit the DSCP value of
>> the inner IPv4 packet.
>>
>> This allows forwarding packet across the SRv6 fabric based on their
>> original traffic class.
>>
>> The option is controlled through a sysctl (seg6_inherit_inner_ipv4_dscp).
>> The sysctl has to be set to 1 to enable this feature.
>>
>
> rather than adding another sysctl, can this be done as a SEG6_LOCAL
> attribute and managed via seg6_local_lwt?
>

Hi David

The seg6 encap is implemented through the seg6_lwt rather than
seg6_local_lwt.
We can add a flag(SEG6_IPTUNNEL_DSCP) in seg6_iptunnel.h if we do not
want to go the sysctl direction.
Perhaps this would require various changes to seg6 infrastructure
including seg6_iptunnel_policy, seg6_build_state, fill_encap,
get_encap_size, etc.

We have proposed a patch before to support optional parameters for SRv6
behaviors [1].
Unfortunately, this patch was rejected.

So i do not know which option is better.

[1]
https://patchwork.ozlabs.org/project/netdev/patch/[email protected]/

Ahmed

2020-08-26 00:46:43

by David Ahern

[permalink] [raw]
Subject: Re: [net-next v5 1/2] seg6: inherit DSCP of inner IPv4 packets

On 8/25/20 5:45 PM, Ahmed Abdelsalam wrote:
>
> Hi David
>
> The seg6 encap is implemented through the seg6_lwt rather than
> seg6_local_lwt.

ok. I don't know the seg6 code; just taking a guess from a quick look.

> We can add a flag(SEG6_IPTUNNEL_DSCP) in seg6_iptunnel.h if we do not
> want to go the sysctl direction.

sysctl is just a big hammer with side effects.

It struck me that the DSCP propagation is very similar to the TTL
propagation with MPLS which is per route entry (MPLS_IPTUNNEL_TTL and
stored as ttl_propagate in mpls_iptunnel_encap). Hence the question of
whether SR could make this a per route attribute. Consistency across
implementations is best.

> Perhaps this would require various changes to seg6 infrastructure
> including seg6_iptunnel_policy, seg6_build_state, fill_encap,
> get_encap_size, etc.
>
> We have proposed a patch before to support optional parameters for SRv6
> behaviors [1].
> Unfortunately, this patch was rejected.
>

not sure I follow why the patch was rejected. Does it change behavior of
existing code?

I would expect that new attributes can be added without affecting
handling of current ones. Looking at seg6_iptunnel.c the new attribute
would be ignored on older kernels but should be fine on new ones and
forward.

###

Since seg6 does not have strict attribute checking the only way to find
out if it is supported is to send down the config and then read it back.
If the attribute is missing, the kernel does not support. Ugly, but one
way to determine support. The next time an attribute is added to seg6
code, strict checking should be enabled so that going forward as new
attributes are added older kernels with strict checking would reject it.

2020-08-26 12:15:11

by Ahmed Abdelsalam

[permalink] [raw]
Subject: Re: [net-next v5 1/2] seg6: inherit DSCP of inner IPv4 packets


On 26/08/2020 02:45, David Ahern wrote:
> On 8/25/20 5:45 PM, Ahmed Abdelsalam wrote:
>>
>> Hi David
>>
>> The seg6 encap is implemented through the seg6_lwt rather than
>> seg6_local_lwt.
>
> ok. I don't know the seg6 code; just taking a guess from a quick look.
>
>> We can add a flag(SEG6_IPTUNNEL_DSCP) in seg6_iptunnel.h if we do not
>> want to go the sysctl direction.
>
> sysctl is just a big hammer with side effects.
>
> It struck me that the DSCP propagation is very similar to the TTL
> propagation with MPLS which is per route entry (MPLS_IPTUNNEL_TTL and
> stored as ttl_propagate in mpls_iptunnel_encap). Hence the question of
> whether SR could make this a per route attribute. Consistency across
> implementations is best.
>SRv6 does not have an issue of having this per route.
Actually, as SRv6 leverage IPv6 encapsulation, I would say it should
consistent with ip6_tunnel not MPLS.

In ip6_tunnel, both ttl and flowinfo (tclass and flowlabel) are provided.

Ideally, SRv6 code should have done the same with:
TTL := VLAUE | DEFAULT | inherit.
TCLASS := 0x00 .. 0xFF | inherit
FLOWLABEL := { 0x00000 .. 0xfffff | inherit | compute.

>> Perhaps this would require various changes to seg6 infrastructure
>> including seg6_iptunnel_policy, seg6_build_state, fill_encap,
>> get_encap_size, etc.
>>
>> We have proposed a patch before to support optional parameters for SRv6
>> behaviors [1].
>> Unfortunately, this patch was rejected.
>>
>
> not sure I follow why the patch was rejected. Does it change behavior of
> existing code?
>

The comment from David miller was "People taking advantage of this new
flexibility will write applications that DO NOT WORK on older kernels."

Perhaps, here we can a bit of discussion. Because also applications that
leverage SRv6 encapsulation will not work on kernels before 4.10.
Applications that leverage SRv6 VPN behvaiors will not work on kernels
before 4.14. Applications that leverages SRv6 capabilites in iptables
will not work on kernels before 4.16.

So when people write an application they have minimum requirement (e.g.,
kernel 5.x)

I would like to get David miller feedback as well as yours on how we
should proceed and I can work on these features.

> I would expect that new attributes can be added without affecting
> handling of current ones. Looking at seg6_iptunnel.c the new attribute
> would be ignored on older kernels but should be fine on new ones and
> forward.
>
> ###
>
> Since seg6 does not have strict attribute checking the only way to find
> out if it is supported is to send down the config and then read it back.
> If the attribute is missing, the kernel does not support. Ugly, but one
> way to determine support. The next time an attribute is added to seg6
> code, strict checking should be enabled so that going forward as new
> attributes are added older kernels with strict checking would reject it.
>

2020-08-26 19:45:28

by David Ahern

[permalink] [raw]
Subject: Re: [net-next v5 1/2] seg6: inherit DSCP of inner IPv4 packets

On 8/26/20 6:12 AM, Ahmed Abdelsalam wrote:
>
> On 26/08/2020 02:45, David Ahern wrote:
>> On 8/25/20 5:45 PM, Ahmed Abdelsalam wrote:
>>>
>>> Hi David
>>>
>>> The seg6 encap is implemented through the seg6_lwt rather than
>>> seg6_local_lwt.
>>
>> ok. I don't know the seg6 code; just taking a guess from a quick look.
>>
>>> We can add a flag(SEG6_IPTUNNEL_DSCP) in seg6_iptunnel.h if we do not
>>> want to go the sysctl direction.
>>
>> sysctl is just a big hammer with side effects.
>>
>> It struck me that the DSCP propagation is very similar to the TTL
>> propagation with MPLS which is per route entry (MPLS_IPTUNNEL_TTL and
>> stored as ttl_propagate in mpls_iptunnel_encap). Hence the question of
>> whether SR could make this a per route attribute. Consistency across
>> implementations is best.
>> SRv6 does not have an issue of having this per route.
> Actually, as SRv6 leverage IPv6 encapsulation, I would say it should
> consistent with ip6_tunnel not MPLS.
>
> In ip6_tunnel, both ttl and flowinfo (tclass and flowlabel) are provided.
>
> Ideally, SRv6 code should have done the same with:
> TTL       := VLAUE | DEFAULT | inherit.
> TCLASS    := 0x00 .. 0xFF | inherit
> FLOWLABEL := { 0x00000 .. 0xfffff | inherit | compute.
>

New attributes get added all the time. Why does something like this now
work for these features:

diff --git a/include/uapi/linux/seg6_iptunnel.h
b/include/uapi/linux/seg6_iptunnel.h
index eb815e0d0ac3..b628333ba100 100644
--- a/include/uapi/linux/seg6_iptunnel.h
+++ b/include/uapi/linux/seg6_iptunnel.h
@@ -20,6 +20,8 @@
enum {
SEG6_IPTUNNEL_UNSPEC,
SEG6_IPTUNNEL_SRH,
+ SEG6_IPTUNNEL_TTL, /* u8 */
+ SEG6_IPTUNNEL_TCLASS, /* u8 */
__SEG6_IPTUNNEL_MAX,
};
#define SEG6_IPTUNNEL_MAX (__SEG6_IPTUNNEL_MAX - 1)
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 897fa59c47de..7cb512b65bc3 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -46,6 +46,11 @@ static size_t seg6_lwt_headroom(struct
seg6_iptunnel_encap *tuninfo)

struct seg6_lwt {
struct dst_cache cache;
+ u8 ttl_propagate; /* propagate ttl from inner header */
+ u8 default_ttl; /* ttl value to use */
+ u8 tclass_inherit; /* inherit tclass from inner header */
+ u8 tclass; /* tclass value to use */
+
struct seg6_iptunnel_encap tuninfo[];
};

@@ -61,7 +66,10 @@ seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
}

static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX +
1] = {
- [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
+ [SEG6_IPTUNNEL_UNSPEC] = { .strict_start_type =
SEG6_IPTUNNEL_SRH + 1 },
+ [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
+ [SEG6_IPTUNNEL_TTL] = { .type = NLA_U8 },
+ [SEG6_IPTUNNEL_TCLASS] = { .type = NLA_U8 },
};

static int nla_put_srh(struct sk_buff *skb, int attrtype,
@@ -460,6 +468,22 @@ static int seg6_build_state(struct net *net, struct
nlattr *nla,

memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);

+ if (tb[SEG6_IPTUNNEL_TTL]) {
+ slwt->default_ttl = nla_get_u8(tb[SEG6_IPTUNNEL_TTL]);
+ slwt->ttl_propagate = slwt->default_ttl ? 0 : 1;
+ }
+ if (tb[SEG6_IPTUNNEL_TCLASS]) {
+ u32 tmp = nla_get_u32(tb[SEG6_IPTUNNEL_TCLASS]);
+
+ if (tmp == (u32)-1) {
+ slwt->tclass_inherit = true;
+ } else if (tmp & <some valid range mask>) {
+ error
+ } else {
+ slwt->tclass = ...
+ }
+ }
+
newts->type = LWTUNNEL_ENCAP_SEG6;
newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;


And the use the values in slwt as needed.

2020-08-27 11:12:04

by Ahmed Abdelsalam

[permalink] [raw]
Subject: Re: [net-next v5 1/2] seg6: inherit DSCP of inner IPv4 packets



On 26/08/2020 21:41, David Ahern wrote:
> On 8/26/20 6:12 AM, Ahmed Abdelsalam wrote:
>>
>> On 26/08/2020 02:45, David Ahern wrote:
>>> On 8/25/20 5:45 PM, Ahmed Abdelsalam wrote:
>>>>
>>>> Hi David
>>>>
>>>> The seg6 encap is implemented through the seg6_lwt rather than
>>>> seg6_local_lwt.
>>>
>>> ok. I don't know the seg6 code; just taking a guess from a quick look.
>>>
>>>> We can add a flag(SEG6_IPTUNNEL_DSCP) in seg6_iptunnel.h if we do not
>>>> want to go the sysctl direction.
>>>
>>> sysctl is just a big hammer with side effects.
>>>
>>> It struck me that the DSCP propagation is very similar to the TTL
>>> propagation with MPLS which is per route entry (MPLS_IPTUNNEL_TTL and
>>> stored as ttl_propagate in mpls_iptunnel_encap). Hence the question of
>>> whether SR could make this a per route attribute. Consistency across
>>> implementations is best.
>>> SRv6 does not have an issue of having this per route.
>> Actually, as SRv6 leverage IPv6 encapsulation, I would say it should
>> consistent with ip6_tunnel not MPLS.
>>
>> In ip6_tunnel, both ttl and flowinfo (tclass and flowlabel) are provided.
>>
>> Ideally, SRv6 code should have done the same with:
>> TTL       := VLAUE | DEFAULT | inherit.
>> TCLASS    := 0x00 .. 0xFF | inherit
>> FLOWLABEL := { 0x00000 .. 0xfffff | inherit | compute.
>>
>
> New attributes get added all the time. Why does something like this now
> work for these features:
>
> diff --git a/include/uapi/linux/seg6_iptunnel.h
> b/include/uapi/linux/seg6_iptunnel.h
> index eb815e0d0ac3..b628333ba100 100644
> --- a/include/uapi/linux/seg6_iptunnel.h
> +++ b/include/uapi/linux/seg6_iptunnel.h
> @@ -20,6 +20,8 @@
> enum {
> SEG6_IPTUNNEL_UNSPEC,
> SEG6_IPTUNNEL_SRH,
> + SEG6_IPTUNNEL_TTL, /* u8 */
> + SEG6_IPTUNNEL_TCLASS, /* u8 */
> __SEG6_IPTUNNEL_MAX,
> };
> #define SEG6_IPTUNNEL_MAX (__SEG6_IPTUNNEL_MAX - 1)
> diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
> index 897fa59c47de..7cb512b65bc3 100644
> --- a/net/ipv6/seg6_iptunnel.c
> +++ b/net/ipv6/seg6_iptunnel.c
> @@ -46,6 +46,11 @@ static size_t seg6_lwt_headroom(struct
> seg6_iptunnel_encap *tuninfo)
>
> struct seg6_lwt {
> struct dst_cache cache;
> + u8 ttl_propagate; /* propagate ttl from inner header */
> + u8 default_ttl; /* ttl value to use */
> + u8 tclass_inherit; /* inherit tclass from inner header */
> + u8 tclass; /* tclass value to use */
> +
> struct seg6_iptunnel_encap tuninfo[];
> };
>
> @@ -61,7 +66,10 @@ seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
> }
>
> static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX +
> 1] = {
> - [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
> + [SEG6_IPTUNNEL_UNSPEC] = { .strict_start_type =
> SEG6_IPTUNNEL_SRH + 1 },
> + [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
> + [SEG6_IPTUNNEL_TTL] = { .type = NLA_U8 },
> + [SEG6_IPTUNNEL_TCLASS] = { .type = NLA_U8 },
> };
>
> static int nla_put_srh(struct sk_buff *skb, int attrtype,
> @@ -460,6 +468,22 @@ static int seg6_build_state(struct net *net, struct
> nlattr *nla,
>
> memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
>
> + if (tb[SEG6_IPTUNNEL_TTL]) {
> + slwt->default_ttl = nla_get_u8(tb[SEG6_IPTUNNEL_TTL]);
> + slwt->ttl_propagate = slwt->default_ttl ? 0 : 1;
> + }
> + if (tb[SEG6_IPTUNNEL_TCLASS]) {
> + u32 tmp = nla_get_u32(tb[SEG6_IPTUNNEL_TCLASS]);
> +
> + if (tmp == (u32)-1) {
> + slwt->tclass_inherit = true;
> + } else if (tmp & <some valid range mask>) {
> + error
> + } else {
> + slwt->tclass = ...
> + }
> + }
> +
> newts->type = LWTUNNEL_ENCAP_SEG6;
> newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
>
>
> And the use the values in slwt as needed.
>

Thanks for the suggestions and the code example
I will write the patches and submit to net-next.