2021-04-22 02:37:36

by Cole Dishington

[permalink] [raw]
Subject: [PATCH] net: netfilter: Add RFC-7597 Section 5.1 PSID support

This adds support for masquerading into a smaller subset of ports -
defined by the PSID values from RFC-7597 Section 5.1. This is part of
the support for MAP-E and Lightweight 4over6, which allows multiple
devices to share an IPv4 address by splitting the L4 port / id into
ranges.

Co-developed-by: Anthony Lineham <[email protected]>
Signed-off-by: Anthony Lineham <[email protected]>
Co-developed-by: Scott Parlane <[email protected]>
Signed-off-by: Scott Parlane <[email protected]>
Signed-off-by: Blair Steven <[email protected]>
Signed-off-by: Cole Dishington <[email protected]>
---
include/net/netfilter/nf_conntrack.h | 2 +
.../netfilter/nf_conntrack_tuple_common.h | 5 +
include/uapi/linux/netfilter/nf_nat.h | 3 +-
net/netfilter/nf_nat_core.c | 101 ++++++++++++++++--
net/netfilter/nf_nat_ftp.c | 23 ++--
net/netfilter/nf_nat_helper.c | 15 ++-
6 files changed, 120 insertions(+), 29 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 439379ca9ffa..d63d38aa7188 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -92,6 +92,8 @@ struct nf_conn {
/* If we were expected by an expectation, this will be it */
struct nf_conn *master;

+ struct nf_nat_range2 *range;
+
#if defined(CONFIG_NF_CONNTRACK_MARK)
u_int32_t mark;
#endif
diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
index 64390fac6f7e..36d16d47c2b0 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
@@ -39,6 +39,11 @@ union nf_conntrack_man_proto {
struct {
__be16 key; /* GRE key is 32bit, PPtP only uses 16bit */
} gre;
+ struct {
+ unsigned char psid_length;
+ unsigned char offset;
+ __be16 psid;
+ } psid;
};

#define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h
index a64586e77b24..660e53ffdb57 100644
--- a/include/uapi/linux/netfilter/nf_nat.h
+++ b/include/uapi/linux/netfilter/nf_nat.h
@@ -12,6 +12,7 @@
#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4)
#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5)
#define NF_NAT_RANGE_NETMAP (1 << 6)
+#define NF_NAT_RANGE_PSID (1 << 7)

#define NF_NAT_RANGE_PROTO_RANDOM_ALL \
(NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
@@ -20,7 +21,7 @@
(NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \
NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \
NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \
- NF_NAT_RANGE_NETMAP)
+ NF_NAT_RANGE_NETMAP | NF_NAT_RANGE_PSID)

struct nf_nat_ipv4_range {
unsigned int flags;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index b7c3c902290f..7730ce4ca9a9 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -232,13 +232,33 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype,
const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
+ const union nf_conntrack_man_proto *max,
+ bool is_psid)
{
__be16 port;

+ int m = 0;
+ u16 offset_mask = 0;
+ u16 psid_mask = 0;
+
+ /* In this case we are in PSID mode and the rules are all different */
+ if (is_psid) {
+ /* m = number of bits in each valid range */
+ m = 16 - min->psid.psid_length - min->psid.offset;
+ offset_mask = ((1 << min->psid.offset) - 1) <<
+ (16 - min->psid.offset);
+ psid_mask = ((1 << min->psid.psid_length) - 1) << m;
+ }
+
switch (tuple->dst.protonum) {
case IPPROTO_ICMP:
case IPPROTO_ICMPV6:
+ if (is_psid) {
+ return ((ntohs(tuple->src.u.icmp.id) & offset_mask) !=
+ 0) &&
+ ((ntohs(tuple->src.u.icmp.id) & psid_mask) ==
+ min->psid.psid);
+ }
return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
case IPPROTO_GRE: /* all fall though */
@@ -252,6 +272,11 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
else
port = tuple->dst.u.all;

+ if (is_psid) {
+ return ((ntohs(port) & offset_mask) != 0) &&
+ (((ntohs(port) & psid_mask) >> m) ==
+ min->psid.psid);
+ }
return ntohs(port) >= ntohs(min->all) &&
ntohs(port) <= ntohs(max->all);
default:
@@ -274,9 +299,9 @@ static int in_range(const struct nf_conntrack_tuple *tuple,

if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
return 1;
-
return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
- &range->min_proto, &range->max_proto);
+ &range->min_proto, &range->max_proto,
+ range->flags & NF_NAT_RANGE_PSID);
}

static inline int
@@ -397,10 +422,10 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
*
* Per-protocol part of tuple is initialized to the incoming packet.
*/
-static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
+void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
{
unsigned int range_size, min, max, i, attempts;
__be16 *keyptr;
@@ -457,6 +482,50 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
return;
}

+ if (range->flags & NF_NAT_RANGE_PSID) {
+ /* Find the non-PSID parts of the port.
+ * To do this we look for an unused port that is
+ * comprised of [t_chunk|PSID|b_chunk]. The size of
+ * these pieces is defined by the psid_length and
+ * offset.
+ */
+ int m = 16 - range->min_proto.psid.psid_length -
+ range->min_proto.psid.offset;
+ int available;
+ int range_count = ((1 << range->min_proto.psid.offset) - 1);
+
+ /* Calculate the size of the bottom block */
+ range_size = (1 << m);
+
+ /* Calculate the total IDs to check */
+ available = range_size * range_count;
+ if (!available)
+ available = range_size;
+
+ off = ntohs(*keyptr);
+ for (i = 0;; ++off) {
+ int b_chunk = off % range_size;
+ int t_chunk = 0;
+
+ /* Move up to avoid the all-zeroes reserved chunk
+ * (if there is one).
+ */
+ if (range->min_proto.psid.offset > 0) {
+ t_chunk = (off >> m) % range_count;
+ ++t_chunk;
+ t_chunk <<= (m +
+ range->min_proto.psid.psid_length);
+ }
+
+ *keyptr = htons(t_chunk |
+ (range->min_proto.psid.psid << m)
+ | b_chunk);
+
+ if (++i >= available || !nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+ }
+
/* If no range specified... */
if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
/* If it's dst rewrite, can't change port */
@@ -566,11 +635,18 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,

/* Only bother mapping if it's not already in range and unique */
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
- if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+ /* Now that the PSID mode is present we always need to check
+ * to see if the source ports are in range.
+ */
+ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED ||
+ (range->flags & NF_NAT_RANGE_PSID &&
+ !in_range(orig_tuple, range))) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
- &range->min_proto,
- &range->max_proto) &&
+ &range->min_proto,
+ &range->max_proto,
+ range->flags &
+ NF_NAT_RANGE_PSID) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
return;
@@ -623,6 +699,11 @@ nf_nat_setup_info(struct nf_conn *ct,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
+ if (range) {
+ if (!ct->range)
+ ct->range = kmalloc(sizeof(*ct->range), 0);
+ memcpy(ct->range, range, sizeof(*ct->range));
+ }

if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
struct nf_conntrack_tuple reply;
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
index aace6768a64e..006b7e1836ff 100644
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -17,6 +17,10 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <linux/netfilter/nf_conntrack_ftp.h>
+void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct);

#define NAT_HELPER_NAME "ftp"

@@ -86,19 +90,12 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
* this one. */
exp->expectfn = nf_nat_follow_master;

- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp, 0);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
+ /* Find a port that matches the MASQ rule. */
+ nf_nat_l4proto_unique_tuple(&exp->tuple, ct->range,
+ dir ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST,
+ ct);
+ port = ntohs(exp->tuple.dst.u.tcp.port);
+ nf_ct_expect_related(exp, 0);

if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use");
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index a263505455fc..090153475d4d 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -184,11 +184,16 @@ void nf_nat_follow_master(struct nf_conn *ct,
/* This must be a fresh one. */
BUG_ON(ct->status & IPS_NAT_DONE_MASK);

- /* Change src to where master sends to */
- range.flags = NF_NAT_RANGE_MAP_IPS;
- range.min_addr = range.max_addr
- = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
- nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+ if (exp->master && exp->master->range && !exp->dir) {
+ range = *exp->master->range;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+ } else {
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ range.max_addr = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+ }

/* For DST manip, map port here to where it's expected. */
range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
--
2.31.1


2021-04-26 12:25:17

by Florian Westphal

[permalink] [raw]
Subject: Re: [PATCH] net: netfilter: Add RFC-7597 Section 5.1 PSID support

Cole Dishington <[email protected]> wrote:
> This adds support for masquerading into a smaller subset of ports -
> defined by the PSID values from RFC-7597 Section 5.1. This is part of
> the support for MAP-E and Lightweight 4over6, which allows multiple
> devices to share an IPv4 address by splitting the L4 port / id into
> ranges.
>
> Co-developed-by: Anthony Lineham <[email protected]>
> Signed-off-by: Anthony Lineham <[email protected]>
> Co-developed-by: Scott Parlane <[email protected]>
> Signed-off-by: Scott Parlane <[email protected]>
> Signed-off-by: Blair Steven <[email protected]>
> Signed-off-by: Cole Dishington <[email protected]>
> ---
> include/net/netfilter/nf_conntrack.h | 2 +
> .../netfilter/nf_conntrack_tuple_common.h | 5 +
> include/uapi/linux/netfilter/nf_nat.h | 3 +-
> net/netfilter/nf_nat_core.c | 101 ++++++++++++++++--
> net/netfilter/nf_nat_ftp.c | 23 ++--
> net/netfilter/nf_nat_helper.c | 15 ++-
> 6 files changed, 120 insertions(+), 29 deletions(-)
>
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 439379ca9ffa..d63d38aa7188 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -92,6 +92,8 @@ struct nf_conn {
> /* If we were expected by an expectation, this will be it */
> struct nf_conn *master;
>
> + struct nf_nat_range2 *range;

Increasing nf_conn size should be avoided unless
absolutely necessary.

> --- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
> +++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
> @@ -39,6 +39,11 @@ union nf_conntrack_man_proto {
> struct {
> __be16 key; /* GRE key is 32bit, PPtP only uses 16bit */
> } gre;
> + struct {
> + unsigned char psid_length;
> + unsigned char offset;
> + __be16 psid;
> + } psid;

This breaks the ABI, you cannot change these structures.

This is the reason there is a 'struct nf_nat_range2', it wasn't
possible to add to the existing 'struct nf_nat_range'.

> diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
> index b7c3c902290f..7730ce4ca9a9 100644
> --- a/net/netfilter/nf_nat_core.c
> +++ b/net/netfilter/nf_nat_core.c
> @@ -232,13 +232,33 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
> static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
> enum nf_nat_manip_type maniptype,
> const union nf_conntrack_man_proto *min,
> - const union nf_conntrack_man_proto *max)
> + const union nf_conntrack_man_proto *max,
> + bool is_psid)
> {

...
> __be16 port;
>
> + int m = 0;
> + u16 offset_mask = 0;
> + u16 psid_mask = 0;
> +
> + /* In this case we are in PSID mode and the rules are all different */
> + if (is_psid) {
> + /* m = number of bits in each valid range */
> + m = 16 - min->psid.psid_length - min->psid.offset;
> + offset_mask = ((1 << min->psid.offset) - 1) <<
> + (16 - min->psid.offset);
> + psid_mask = ((1 << min->psid.psid_length) - 1) << m;
> + }

...

Is it really needed to place all of this in the nat core?

The only thing that has to be done in the NAT core, afaics, is to
suppress port reallocation attempts when NF_NAT_RANGE_PSID is set.

Is there a reason why nf_nat_masquerade_ipv4/6 can't be changed instead
to do what you want?

AFAICS its enough to set NF_NAT_RANGE_PROTO_SPECIFIED and init the
upper/lower boundaries, i.e. change input given to nf_nat_setup_info().

> get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
> + if (range) {
> + if (!ct->range)
> + ct->range = kmalloc(sizeof(*ct->range), 0);

If you absolutely have to store extra data in nf_conn, please extend
struct nf_conn_nat, masquerade already stores the interface index, so
you could place the psid len/offset there as well.

> + /* Find a port that matches the MASQ rule. */
> + nf_nat_l4proto_unique_tuple(&exp->tuple, ct->range,
> + dir ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST,
> + ct);
> + port = ntohs(exp->tuple.dst.u.tcp.port);
> + nf_ct_expect_related(exp, 0);

This removes the error check for nf_ct_expect_related(), why?

Also, how is this going to be used?

I see no changes to any of the nftables or iptables modules that would
be needed for userspace to enable this feature.

2021-06-29 00:50:14

by Cole Dishington

[permalink] [raw]
Subject: [PATCH] net: netfilter: Add RFC-7597 Section 5.1 PSID support

This adds support for masquerading into a smaller subset of ports -
defined by the PSID values from RFC-7597 Section 5.1. This is part of
the support for MAP-E and Lightweight 4over6, which allows multiple
devices to share an IPv4 address by splitting the L4 port / id into
ranges.

Co-developed-by: Anthony Lineham <[email protected]>
Signed-off-by: Anthony Lineham <[email protected]>
Co-developed-by: Scott Parlane <[email protected]>
Signed-off-by: Scott Parlane <[email protected]>
Signed-off-by: Blair Steven <[email protected]>
Signed-off-by: Cole Dishington <[email protected]>
---

Notes:
Thanks for your time reviewing. I have also submitted a patch to netfilter iptables for these changes.

Comments:
Selecting the ports for psid needs to be in nf_nat_core since the PSID ranges are not a single range. e.g. offset=1024, PSID=0, psid_length=8 generates the ranges 1024-1027, 2048-2051, ..., 63488-63491, ... (example taken from RFC7597 B.2).
This is why it is enough to set NF_NAT_RANGE_PROTO_SPECIFIED and init upper/lower boundaries.

Changes in v2:
- Moved cached range2 from struct nf_conn to nf_conn_nat.
- Moved psid fields out of union nf_conntrack_man_proto. Now using range2 fields src, dst, and base to store psid parameters.
- Readded removed error check for nf_ct_expect_related()
- Added new version to masquerade iptables extension to use the range2 base field.

include/net/netfilter/nf_nat.h | 1 +
include/uapi/linux/netfilter/nf_nat.h | 3 +-
net/netfilter/nf_nat_core.c | 69 +++++++++++++++++++++++----
net/netfilter/nf_nat_ftp.c | 29 ++++++-----
net/netfilter/nf_nat_helper.c | 16 +++++--
net/netfilter/nf_nat_masquerade.c | 13 +++--
net/netfilter/xt_MASQUERADE.c | 44 +++++++++++++++--
7 files changed, 140 insertions(+), 35 deletions(-)

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 987111ae5240..67cc033f76bb 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -32,6 +32,7 @@ struct nf_conn_nat {
union nf_conntrack_nat_help help;
#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE)
int masq_index;
+ struct nf_nat_range2 *range;
#endif
};

diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h
index a64586e77b24..660e53ffdb57 100644
--- a/include/uapi/linux/netfilter/nf_nat.h
+++ b/include/uapi/linux/netfilter/nf_nat.h
@@ -12,6 +12,7 @@
#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4)
#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5)
#define NF_NAT_RANGE_NETMAP (1 << 6)
+#define NF_NAT_RANGE_PSID (1 << 7)

#define NF_NAT_RANGE_PROTO_RANDOM_ALL \
(NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
@@ -20,7 +21,7 @@
(NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \
NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \
NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \
- NF_NAT_RANGE_NETMAP)
+ NF_NAT_RANGE_NETMAP | NF_NAT_RANGE_PSID)

struct nf_nat_ipv4_range {
unsigned int flags;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 7de595ead06a..7307bb28ece2 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -195,13 +195,32 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype,
const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
+ const union nf_conntrack_man_proto *max,
+ const union nf_conntrack_man_proto *base,
+ bool is_psid)
{
__be16 port;
+ u16 offset_mask = 0;
+ u16 psid_mask = 0;
+ u16 psid = 0;
+
+ /* In this case we are in PSID mode, avoid checking all ranges by computing bitmasks */
+ if (is_psid) {
+ u16 j = ntohs(max->all) - ntohs(min->all) + 1;
+ u16 a = (1 << 16) / ntohs(base->all);
+
+ offset_mask = (a - 1) * ntohs(base->all);
+ psid_mask = ((ntohs(base->all) / j) << 1) - 1;
+ psid = ntohs(min->all) & psid_mask;
+ }

switch (tuple->dst.protonum) {
case IPPROTO_ICMP:
case IPPROTO_ICMPV6:
+ if (is_psid) {
+ return ((ntohs(tuple->src.u.icmp.id) & offset_mask) != 0) &&
+ ((ntohs(tuple->src.u.icmp.id) & psid_mask) == psid);
+ }
return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
case IPPROTO_GRE: /* all fall though */
@@ -215,6 +234,10 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
else
port = tuple->dst.u.all;

+ if (is_psid) {
+ return ((ntohs(port) & offset_mask) != 0) &&
+ ((ntohs(port) & psid_mask) == psid);
+ }
return ntohs(port) >= ntohs(min->all) &&
ntohs(port) <= ntohs(max->all);
default:
@@ -239,7 +262,8 @@ static int in_range(const struct nf_conntrack_tuple *tuple,
return 1;

return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
- &range->min_proto, &range->max_proto);
+ &range->min_proto, &range->max_proto, &range->base_proto,
+ range->flags & NF_NAT_RANGE_PSID);
}

static inline int
@@ -360,10 +384,10 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
*
* Per-protocol part of tuple is initialized to the incoming packet.
*/
-static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
+void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
{
unsigned int range_size, min, max, i, attempts;
__be16 *keyptr;
@@ -420,6 +444,25 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
return;
}

+ if (range->flags & NF_NAT_RANGE_PSID) {
+ /* PSID defines a group of port ranges, per PSID. PSID
+ * is already contained in min and max.
+ */
+ unsigned int min_to_max, base;
+
+ min = ntohs(range->min_proto.all);
+ max = ntohs(range->max_proto.all);
+ base = ntohs(range->base_proto.all);
+ min_to_max = max - min;
+ for (; max <= (1 << 16) - 1; min += base, max = min + min_to_max) {
+ for (off = 0; off <= min_to_max; off++) {
+ *keyptr = htons(min + off);
+ if (!nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+ }
+ }
+
/* If no range specified... */
if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
/* If it's dst rewrite, can't change port */
@@ -529,11 +572,19 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,

/* Only bother mapping if it's not already in range and unique */
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
- if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+ /* PSID mode is present always needs to check
+ * to see if the source ports are in range.
+ */
+ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED ||
+ (range->flags & NF_NAT_RANGE_PSID &&
+ !in_range(orig_tuple, range))) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
- &range->min_proto,
- &range->max_proto) &&
+ &range->min_proto,
+ &range->max_proto,
+ &range->base_proto,
+ range->flags &
+ NF_NAT_RANGE_PSID) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
return;
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
index aace6768a64e..f65163278db0 100644
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -17,6 +17,10 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <linux/netfilter/nf_conntrack_ftp.h>
+void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct);

#define NAT_HELPER_NAME "ftp"

@@ -72,8 +76,13 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
u_int16_t port;
int dir = CTINFO2DIR(ctinfo);
struct nf_conn *ct = exp->master;
+ struct nf_conn_nat *nat = nfct_nat(ct);
char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN];
unsigned int buflen;
+ int ret;
+
+ if (WARN_ON_ONCE(!nat))
+ return NF_DROP;

pr_debug("type %i, off %u len %u\n", type, matchoff, matchlen);

@@ -86,18 +95,14 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
* this one. */
exp->expectfn = nf_nat_follow_master;

- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp, 0);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
+ /* Find a port that matches the MASQ rule. */
+ nf_nat_l4proto_unique_tuple(&exp->tuple, nat->range,
+ dir ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST,
+ ct);
+ ret = nf_ct_expect_related(exp, 0);
+ port = ntohs(exp->tuple.dst.u.tcp.port);
+ if (ret != 0 && ret != -EBUSY) {
+ port = 0;
}

if (port == 0) {
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index a263505455fc..2d105e4eb8f8 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -179,15 +179,23 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
void nf_nat_follow_master(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
+ struct nf_conn_nat *nat = NULL;
struct nf_nat_range2 range;

/* This must be a fresh one. */
BUG_ON(ct->status & IPS_NAT_DONE_MASK);

- /* Change src to where master sends to */
- range.flags = NF_NAT_RANGE_MAP_IPS;
- range.min_addr = range.max_addr
- = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ if (exp->master && !exp->dir) {
+ nat = nfct_nat(exp->master);
+ if (nat)
+ range = *nat->range;
+ }
+ if (!nat) {
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ range.max_addr = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ }
nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);

/* For DST manip, map port here to where it's expected. */
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index 8e8a65d46345..d83cd3d8ad3f 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -45,10 +45,6 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
return NF_DROP;
}

- nat = nf_ct_nat_ext_add(ct);
- if (nat)
- nat->masq_index = out->ifindex;
-
/* Transfer from original range. */
memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
@@ -57,6 +53,15 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
newrange.max_addr.ip = newsrc;
newrange.min_proto = range->min_proto;
newrange.max_proto = range->max_proto;
+ newrange.base_proto = range->base_proto;
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat) {
+ nat->masq_index = out->ifindex;
+ if (!nat->range)
+ nat->range = kmalloc(sizeof(*nat->range), 0);
+ memcpy(nat->range, &newrange, sizeof(*nat->range));
+ }

/* Hand modified range to generic setup. */
return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
diff --git a/net/netfilter/xt_MASQUERADE.c b/net/netfilter/xt_MASQUERADE.c
index eae05c178336..dc6870ca2b71 100644
--- a/net/netfilter/xt_MASQUERADE.c
+++ b/net/netfilter/xt_MASQUERADE.c
@@ -16,7 +16,7 @@ MODULE_AUTHOR("Netfilter Core Team <[email protected]>");
MODULE_DESCRIPTION("Xtables: automatic-address SNAT");

/* FIXME: Multiple targets. --RR */
-static int masquerade_tg_check(const struct xt_tgchk_param *par)
+static int masquerade_tg_check_v0(const struct xt_tgchk_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;

@@ -31,8 +31,19 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
return nf_ct_netns_get(par->net, par->family);
}

+static int masquerade_tg_check_v1(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range2 *range = par->targinfo;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS) {
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
+ }
+ return nf_ct_netns_get(par->net, par->family);
+}
+
static unsigned int
-masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
+masquerade_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_nat_range2 range;
const struct nf_nat_ipv4_multi_range_compat *mr;
@@ -46,6 +57,15 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
xt_out(par));
}

+static unsigned int
+masquerade_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range2 *range = par->targinfo;
+
+ return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), range,
+ xt_out(par));
+}
+
static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
{
nf_ct_netns_put(par->net, par->family);
@@ -73,6 +93,7 @@ static struct xt_target masquerade_tg_reg[] __read_mostly = {
{
#if IS_ENABLED(CONFIG_IPV6)
.name = "MASQUERADE",
+ .revision = 0,
.family = NFPROTO_IPV6,
.target = masquerade_tg6,
.targetsize = sizeof(struct nf_nat_range),
@@ -84,15 +105,28 @@ static struct xt_target masquerade_tg_reg[] __read_mostly = {
}, {
#endif
.name = "MASQUERADE",
+ .revision = 0,
.family = NFPROTO_IPV4,
- .target = masquerade_tg,
+ .target = masquerade_tg_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
- .checkentry = masquerade_tg_check,
+ .checkentry = masquerade_tg_check_v0,
.destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
- }
+ },
+ {
+ .name = "MASQUERADE",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .target = masquerade_tg_v1,
+ .targetsize = sizeof(struct nf_nat_range2),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .checkentry = masquerade_tg_check_v1,
+ .destroy = masquerade_tg_destroy,
+ .me = THIS_MODULE,
+ },
};

static int __init masquerade_tg_init(void)
--
2.32.0

2021-06-30 14:24:28

by Florian Westphal

[permalink] [raw]
Subject: Re: [PATCH] net: netfilter: Add RFC-7597 Section 5.1 PSID support

Cole Dishington <[email protected]> wrote:
> Comments:
> Selecting the ports for psid needs to be in nf_nat_core since the PSID ranges are not a single range. e.g. offset=1024, PSID=0, psid_length=8 generates the ranges 1024-1027, 2048-2051, ..., 63488-63491, ... (example taken from RFC7597 B.2).
> This is why it is enough to set NF_NAT_RANGE_PROTO_SPECIFIED and init upper/lower boundaries.

I suspect this misses a NOT. But current algorithm has problems, see
below.

> + if (range->flags & NF_NAT_RANGE_PSID) {
> + /* PSID defines a group of port ranges, per PSID. PSID
> + * is already contained in min and max.
> + */
> + unsigned int min_to_max, base;
> +
> + min = ntohs(range->min_proto.all);
> + max = ntohs(range->max_proto.all);
> + base = ntohs(range->base_proto.all);
> + min_to_max = max - min;
> + for (; max <= (1 << 16) - 1; min += base, max = min + min_to_max) {
> + for (off = 0; off <= min_to_max; off++) {
> + *keyptr = htons(min + off);
> + if (!nf_nat_used_tuple(tuple, ct))
> + return;
> + }
> + }
> + }

I fear this searches way too many ports.
We had soft lockups in the past because of exhaustive searches.

See a504b703bb1da526a01593da0e4be2af9d9f5fa8
("netfilter: nat: limit port clash resolution attempts").

I suggest you try pre-selecting one of the eligible ranges in
nf_nat_masquerade_ipv4 when the 'newrange' is filled in and set
RANGE_PROTO_SPECIFIED.

Maybe even prandom-based preselection is good enough.

> /* If no range specified... */
> if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
> /* If it's dst rewrite, can't change port */
> @@ -529,11 +572,19 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
>
> /* Only bother mapping if it's not already in range and unique */
> if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
> - if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
> + /* PSID mode is present always needs to check
> + * to see if the source ports are in range.
> + */
> + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED ||
> + (range->flags & NF_NAT_RANGE_PSID &&

Why the extra check?
Can't you set NF_NAT_RANGE_PROTO_SPECIFIED in case PSID is requested by
userspace?

> diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
> index aace6768a64e..f65163278db0 100644
> --- a/net/netfilter/nf_nat_ftp.c
> +++ b/net/netfilter/nf_nat_ftp.c
> @@ -17,6 +17,10 @@
> #include <net/netfilter/nf_conntrack_helper.h>
> #include <net/netfilter/nf_conntrack_expect.h>
> #include <linux/netfilter/nf_conntrack_ftp.h>
> +void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
> + const struct nf_nat_range2 *range,
> + enum nf_nat_manip_type maniptype,
> + const struct nf_conn *ct);
>
> #define NAT_HELPER_NAME "ftp"
>
> @@ -72,8 +76,13 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
> u_int16_t port;
> int dir = CTINFO2DIR(ctinfo);
> struct nf_conn *ct = exp->master;
> + struct nf_conn_nat *nat = nfct_nat(ct);
> char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN];
> unsigned int buflen;
> + int ret;
> +
> + if (WARN_ON_ONCE(!nat))
> + return NF_DROP;
>
> pr_debug("type %i, off %u len %u\n", type, matchoff, matchlen);
>
> @@ -86,18 +95,14 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
> * this one. */
> exp->expectfn = nf_nat_follow_master;
>
> - /* Try to get same port: if not, try to change it. */
> - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
> - int ret;
> -
> - exp->tuple.dst.u.tcp.port = htons(port);
> - ret = nf_ct_expect_related(exp, 0);
> - if (ret == 0)
> - break;
> - else if (ret != -EBUSY) {
> - port = 0;
> - break;
> - }
> + /* Find a port that matches the MASQ rule. */
> + nf_nat_l4proto_unique_tuple(&exp->tuple, nat->range,
> + dir ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST,
> + ct);

Hmm, I am ignorant of the details here, but is this correct?

This could be an inbound connection, rather than outbound.

> diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
> index a263505455fc..2d105e4eb8f8 100644
> --- a/net/netfilter/nf_nat_helper.c
> +++ b/net/netfilter/nf_nat_helper.c
> @@ -179,15 +179,23 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
> void nf_nat_follow_master(struct nf_conn *ct,
> struct nf_conntrack_expect *exp)
> {
> + struct nf_conn_nat *nat = NULL;
> struct nf_nat_range2 range;
>
> /* This must be a fresh one. */
> BUG_ON(ct->status & IPS_NAT_DONE_MASK);
>
> - /* Change src to where master sends to */
> - range.flags = NF_NAT_RANGE_MAP_IPS;
> - range.min_addr = range.max_addr
> - = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
> + if (exp->master && !exp->dir) {
> + nat = nfct_nat(exp->master);
> + if (nat)
> + range = *nat->range;

Can't you store the psid-relevant parts of the range struct only?
Non-PSID doesn't need the original range, so why do you?

> diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
> index 8e8a65d46345..d83cd3d8ad3f 100644
> --- a/net/netfilter/nf_nat_masquerade.c
> +++ b/net/netfilter/nf_nat_masquerade.c
> @@ -45,10 +45,6 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
> return NF_DROP;
> }
>
> - nat = nf_ct_nat_ext_add(ct);
> - if (nat)
> - nat->masq_index = out->ifindex;
> -
> /* Transfer from original range. */
> memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
> memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
> @@ -57,6 +53,15 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
> newrange.max_addr.ip = newsrc;
> newrange.min_proto = range->min_proto;
> newrange.max_proto = range->max_proto;
> + newrange.base_proto = range->base_proto;
> +
> + nat = nf_ct_nat_ext_add(ct);
> + if (nat) {
> + nat->masq_index = out->ifindex;
> + if (!nat->range)
> + nat->range = kmalloc(sizeof(*nat->range), 0);
> + memcpy(nat->range, &newrange, sizeof(*nat->range));

Use kmemdup() here. This also misses error handling (the allocation
result is never checked), and it should use GFP_ATOMIC.
Where is this freed again?

It would be good if you could chop this up in smaller chunks.
A selftest would be nice as well (see tools/testing/selftests/netfilter).