Adds support for masquerading into a smaller subset of ports -
defined by the PSID values from RFC-7597 Section 5.1. This is part of
the support for MAP-E and Lightweight 4over6, which allows multiple
devices to share an IPv4 address by splitting the L4 port / id into
ranges.
Co-developed-by: Anthony Lineham <[email protected]>
Signed-off-by: Anthony Lineham <[email protected]>
Co-developed-by: Scott Parlane <[email protected]>
Signed-off-by: Scott Parlane <[email protected]>
Signed-off-by: Blair Steven <[email protected]>
Signed-off-by: Cole Dishington <[email protected]>
Reviewed-by: Florian Westphal <[email protected]>
---
Notes:
Changes:
- Added Reviewed-by: Florian Westphal <[email protected]>.
net/netfilter/nf_nat_core.c | 39 +++++++++++++++++++++++++++----
net/netfilter/nf_nat_masquerade.c | 27 +++++++++++++++++++--
2 files changed, 60 insertions(+), 6 deletions(-)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 7de595ead06a..f07a3473aab5 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -195,13 +195,36 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype,
const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
+ const union nf_conntrack_man_proto *max,
+ const union nf_conntrack_man_proto *base,
+ bool is_psid)
{
__be16 port;
+ u16 psid, psid_mask, offset_mask;
+
+ /* In this case we are in PSID mode, avoid checking all ranges by computing bitmasks */
+ if (is_psid) {
+ u32 power_j = ntohs(max->all) - ntohs(min->all) + 1;
+ u32 offset = ntohs(base->all);
+ u16 power_a;
+
+ if (offset == 0)
+ offset = 1 << 16;
+
+ power_a = (1 << 16) / offset;
+ offset_mask = (power_a - 1) * offset;
+ psid_mask = ((offset / power_j) << 1) - 1;
+ psid = ntohs(min->all) & psid_mask;
+ }
switch (tuple->dst.protonum) {
case IPPROTO_ICMP:
case IPPROTO_ICMPV6:
+ if (is_psid) {
+ return (offset_mask == 0 ||
+ (ntohs(tuple->src.u.icmp.id) & offset_mask) != 0) &&
+ ((ntohs(tuple->src.u.icmp.id) & psid_mask) == psid);
+ }
return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
case IPPROTO_GRE: /* all fall though */
@@ -215,6 +238,10 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
else
port = tuple->dst.u.all;
+ if (is_psid) {
+ return (offset_mask == 0 || (ntohs(port) & offset_mask) != 0) &&
+ ((ntohs(port) & psid_mask) == psid);
+ }
return ntohs(port) >= ntohs(min->all) &&
ntohs(port) <= ntohs(max->all);
default:
@@ -239,7 +266,8 @@ static int in_range(const struct nf_conntrack_tuple *tuple,
return 1;
return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
- &range->min_proto, &range->max_proto);
+ &range->min_proto, &range->max_proto, &range->base_proto,
+ range->flags & NF_NAT_RANGE_PSID);
}
static inline int
@@ -532,8 +560,11 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
- &range->min_proto,
- &range->max_proto) &&
+ &range->min_proto,
+ &range->max_proto,
+ &range->base_proto,
+ range->flags &
+ NF_NAT_RANGE_PSID) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
return;
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index 8e8a65d46345..19a4754cda76 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -55,8 +55,31 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
newrange.min_addr.ip = newsrc;
newrange.max_addr.ip = newsrc;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
+
+ if (range->flags & NF_NAT_RANGE_PSID) {
+ u16 base = ntohs(range->base_proto.all);
+ u16 min = ntohs(range->min_proto.all);
+ u16 off = 0;
+
+ /* xtables should stop base > 2^15 by enforcement of
+ * 0 <= offset_len < 16 argument, with offset_len=0
+ * as a special case inwhich base=0.
+ */
+ if (WARN_ON_ONCE(base > (1 << 15)))
+ return NF_DROP;
+
+ /* If offset=0, port range is in one contiguous block */
+ if (base)
+ off = prandom_u32_max(((1 << 16) / base) - 1);
+
+ newrange.min_proto.all = htons(min + base * off);
+ newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min);
+ newrange.base_proto = range->base_proto;
+ newrange.flags = newrange.flags | NF_NAT_RANGE_PROTO_SPECIFIED;
+ } else {
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+ }
/* Hand modified range to generic setup. */
return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
--
2.32.0
Hi,
On Mon, Aug 09, 2021 at 04:10:36PM +1200, Cole Dishington wrote:
> Adds support for masquerading into a smaller subset of ports -
> defined by the PSID values from RFC-7597 Section 5.1. This is part of
> the support for MAP-E and Lightweight 4over6, which allows multiple
> devices to share an IPv4 address by splitting the L4 port / id into
> ranges.
>
> Co-developed-by: Anthony Lineham <[email protected]>
> Signed-off-by: Anthony Lineham <[email protected]>
> Co-developed-by: Scott Parlane <[email protected]>
> Signed-off-by: Scott Parlane <[email protected]>
> Signed-off-by: Blair Steven <[email protected]>
> Signed-off-by: Cole Dishington <[email protected]>
> Reviewed-by: Florian Westphal <[email protected]>
[...]
Looking at the userspace logic:
https://patchwork.ozlabs.org/project/netfilter-devel/patch/[email protected]/
Chunk extracted from void parse_psid(...)
> offset = (1 << (16 - offset_len));
Assuming offset_len = 6, then you skip 0-1023 ports, OK.
> psid = psid << (16 - offset_len - psid_len);
This psid calculation is correct? Maybe:
psid = psid << (16 - offset_len);
instead?
psid=0 => 0 << (16 - 6) = 1024
psid=1 => 1 << (16 - 6) = 2048
This is implicitly assuming that 64 PSIDs are available, each of them
taking 1024 ports, ie. psid_len is 6 bits. But why are you subtracting
the psid_len above?
> /* Handle the special case of no offset bits (a=0), so offset loops */
> min = psid;
OK, this line above is the minimal port in the range
> if (offset)
> min += offset;
... which is incremented by the offset (to skip the 0-1023 ports).
> r->min_proto.all = htons(min);
> r->max_proto.all = htons(min + ((1 << (16 - offset_len - psid_len)) - 1));
Here, you subtract psid_len again, not sure why.
> r->base_proto.all = htons(offset);
base is set to offset, ie. 1024.
> r->flags |= NF_NAT_RANGE_PSID;
> r->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
Now looking at the kernel side.
> diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
> index 8e8a65d46345..19a4754cda76 100644
> --- a/net/netfilter/nf_nat_masquerade.c
> +++ b/net/netfilter/nf_nat_masquerade.c
> @@ -55,8 +55,31 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
> newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
> newrange.min_addr.ip = newsrc;
> newrange.max_addr.ip = newsrc;
> - newrange.min_proto = range->min_proto;
> - newrange.max_proto = range->max_proto;
> +
> + if (range->flags & NF_NAT_RANGE_PSID) {
> + u16 base = ntohs(range->base_proto.all);
> + u16 min = ntohs(range->min_proto.all);
> + u16 off = 0;
> +
> + /* xtables should stop base > 2^15 by enforcement of
> + * 0 <= offset_len < 16 argument, with offset_len=0
> + * as a special case inwhich base=0.
I don't understand this comment.
> + */
> + if (WARN_ON_ONCE(base > (1 << 15)))
> + return NF_DROP;
> +
> + /* If offset=0, port range is in one contiguous block */
> + if (base)
> + off = prandom_u32_max(((1 << 16) / base) - 1);
Assuming the example above, base is set to 1024. Then, off is a random
value between UINT16_MAX (you expressed this as 1 << 16) and the base
which is 1024 minus 1.
So this is picking a random off (actually the PSID?) between 0 and 63.
What about clashes? I mean, two different machines behind the NAT
might get the same off.
> + newrange.min_proto.all = htons(min + base * off);
min could be 1024, 2048, 3072... you add base which is 1024 * off.
Is this duplicated? Both calculated in user and kernel space?
> + newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min);
I'm stopping here, I'm getting lost.
My understanding about this RFC is that you would like to split the
16-bit ports in ranges to uniquely identify the host behind the NAT.
Why don't you just you just select the port range from userspace
utilizing the existing infrastructure? I mean, why do you need this
kernel patch?
Florian already suggested:
> Is it really needed to place all of this in the nat core?
>
> The only thing that has to be done in the NAT core, afaics, is to
> suppress port reallocation attmepts when NF_NAT_RANGE_PSID is set.
>
> Is there a reason why nf_nat_masquerade_ipv4/6 can't be changed instead
> to do what you want?
>
> AFAICS its enough to set NF_NAT_RANGE_PROTO_SPECIFIED and init the
> upper/lower boundaries, i.e. change input given to nf_nat_setup_info().
extracted from:
https://patchwork.ozlabs.org/project/netfilter-devel/patch/[email protected]/
Hello,
Thanks for your time reviewing!
On Wed, 2021-08-25 at 19:05 +0200, Pablo Neira Ayuso wrote:
> Hi,
>
> On Mon, Aug 09, 2021 at 04:10:36PM +1200, Cole Dishington wrote:
> > Adds support for masquerading into a smaller subset of ports -
> > defined by the PSID values from RFC-7597 Section 5.1. This is part of
> > the support for MAP-E and Lightweight 4over6, which allows multiple
> > devices to share an IPv4 address by splitting the L4 port / id into
> > ranges.
> >
> > Co-developed-by: Anthony Lineham <[email protected]>
> > Signed-off-by: Anthony Lineham <[email protected]>
> > Co-developed-by: Scott Parlane <[email protected]>
> > Signed-off-by: Scott Parlane <[email protected]>
> > Signed-off-by: Blair Steven <[email protected]>
> > Signed-off-by: Cole Dishington <[email protected]>
> > Reviewed-by: Florian Westphal <[email protected]>
> [...]
>
> Looking at the userspace logic:
>
> https://scanmail.trustwave.com/?c=20988&d=6vim4fcVLjPkIbLUDqz3Tj2W4gXWNCkYa5llWggBjA&u=https%3a%2f%2fpatchwork%2eozlabs%2eorg%2fproject%2fnetfilter-devel%2fpatch%2f20210716002219%2e30193-1-Cole%2eDishington%40alliedtelesis%2eco%2enz%2f
>
> Chunk extracted from void parse_psid(...)
>
> > offset = (1 << (16 - offset_len));
>
> Assuming offset_len = 6, then you skip 0-1023 ports, OK.
>
> > psid = psid << (16 - offset_len - psid_len);
>
> This psid calculation is correct? Maybe:
>
> psid = psid << (16 - offset_len);
PSID port numbers have the form
[offset|PSID|j]
and
16 = offset_length + PSID_length + j_length.
The PSID calculation above is bit shifting the passed psid up j_length.
The userspace tool accepts the unshifted psid to be consistent with how RFC7597 specified it (see RFC7597 Appendix A. Examples).
>
> instead?
>
> psid=0 => 0 << (16 - 6) = 1024
> psid=1 => 1 << (16 - 6) = 2048
>
> This is implicitly assuming that 64 PSIDs are available, each of them
> taking 1024 ports, ie. psid_len is 6 bits. But why are you subtracting
> the psid_len above?
>
> > /* Handle the special case of no offset bits (a=0), so offset loops */
> > min = psid;
>
> OK, this line above is the minimal port in the range
>
> > if (offset)
> > min += offset;
>
> ... which is incremented by the offset (to skip the 0-1023 ports).
>
> > r->min_proto.all = htons(min);
> > r->max_proto.all = htons(min + ((1 << (16 - offset_len - psid_len)) - 1));
>
> Here, you subtract psid_len again, not sure why.
Each PSID port range is made up of many smaller contiguous port sub-ranges (except for the special case of offset_len = 0) e.g. for PSID=0x34,psid_length=8,psid_offset=6 the ranges are 1232-1235, 2256-2259, ..., 63696-63699, 64720-64723 (Taken from rfc7597 Appendix A. Examples).
The above calculation is selecting the first sub-range. Max is computed by finding j_length and filling it with 1's.
>
> > r->base_proto.all = htons(offset);
>
> base is set to offset, ie. 1024.
>
> > r->flags |= NF_NAT_RANGE_PSID;
> > r->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
>
> Now looking at the kernel side.
>
> > diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
> > index 8e8a65d46345..19a4754cda76 100644
> > --- a/net/netfilter/nf_nat_masquerade.c
> > +++ b/net/netfilter/nf_nat_masquerade.c
> > @@ -55,8 +55,31 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
> > newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
> > newrange.min_addr.ip = newsrc;
> > newrange.max_addr.ip = newsrc;
> > - newrange.min_proto = range->min_proto;
> > - newrange.max_proto = range->max_proto;
> > +
> > + if (range->flags & NF_NAT_RANGE_PSID) {
> > + u16 base = ntohs(range->base_proto.all);
> > + u16 min = ntohs(range->min_proto.all);
> > + u16 off = 0;
> > +
> > + /* xtables should stop base > 2^15 by enforcement of
> > + * 0 <= offset_len < 16 argument, with offset_len=0
> > + * as a special case inwhich base=0.
>
> I don't understand this comment.
This is a sanity check. The userspace tool restricts offset_len to the specified range and since base = 2^(16 - offset_len) (or base = 0 for the special case of offset_len = 16) the below condition should never be true.
However, if base greater than 1<<15 was allowed, a divide by zero error would occur on the block below.
>
> > + */
> > + if (WARN_ON_ONCE(base > (1 << 15)))
> > + return NF_DROP;
> > +
> > + /* If offset=0, port range is in one contiguous block */
> > + if (base)
> > + off = prandom_u32_max(((1 << 16) / base) - 1);
>
> Assuming the example above, base is set to 1024. Then, off is a random
> value between UINT16_MAX (you expressed this as 1 << 16) and the base
> which is 1024 minus 1.
>
> So this is picking a random off (actually the PSID?) between 0 and 63.
> What about clashes? I mean, two different machines behind the NAT
> might get the same off.
>
> > + newrange.min_proto.all = htons(min + base * off);
>
> min could be 1024, 2048, 3072... you add base which is 1024 * off.
>
> Is this duplicated? Both calculated in user and kernel space?
Each PSID value defines many contiguous port sub-ranges. The randomly chosen off selects the ith sub-range for a given PSID e.g. off=1 would select 2256-2259 for rfc7597 Appendix A. Examples.
The userspace tool calculates the min and max of the first sub-range for a given psid, whereas the above randomly selects one of the sub-ranges for a given psid.
j_length determines how large each sub-range will be, so for small j_length values there still is the risk the chosen sub-range will be exhausted.
>
> > + newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min);
>
> I'm stopping here, I'm getting lost.
>
> My understanding about this RFC is that you would like to split the
> 16-bit ports in ranges to uniquely identify the host behind the NAT.
>
> Why don't you just you just select the port range from userspace
> utilizing the existing infrastructure? I mean, why do you need this
> kernel patch?
If utilizing existing infrastruture to install PSID port ranges a lot of rules would be required as each PSID port range is made up of many smaller sub-ranges.
e.g. (from rfc7597 Appendix A. Examples)
for psid_length=8,offset_length=6 each PSID would need 63 NF_NAT_RANGE_PROTO_SPECIFIED rules, hence a total of 16128 rules if all the PSIDs were allocated.
>
> Florian already suggested:
>
> > Is it really needed to place all of this in the nat core?
> >
> > The only thing that has to be done in the NAT core, afaics, is to
> > suppress port reallocation attmepts when NF_NAT_RANGE_PSID is set.
> >
> > Is there a reason why nf_nat_masquerade_ipv4/6 can't be changed instead
> > to do what you want?
> >
> > AFAICS its enough to set NF_NAT_RANGE_PROTO_SPECIFIED and init the
> > upper/lower boundaries, i.e. change input given to nf_nat_setup_info().
>
> extracted from:
>
> https://scanmail.trustwave.com/?c=20988&d=6vim4fcVLjPkIbLUDqz3Tj2W4gXWNCkYa5s0Bg8JjA&u=https%3a%2f%2fpatchwork%2eozlabs%2eorg%2fproject%2fnetfilter-devel%2fpatch%2f20210422023506%2e4651-1-Cole%2eDishington%40alliedtelesis%2eco%2enz%2f