From: "Palik, Imre" <[email protected]>
The netfilter code is made with flexibility instead of performance in mind.
So when all we want is to pass packets between different interfaces, the
performance penalty of hitting netfilter code can be considerable, even when
all the firewalling is disabled for the bridge.
This change makes it possible to disable netfilter both on a per bridge basis,
or for the whole bridging subsystem. In the case interesting to us, this can
lead to more than 10% speedup compared to the case when only bridge-iptables
are disabled.
Cc: Anthony Liguori <[email protected]>
Signed-off-by: Imre Palik <[email protected]>
---
net/bridge/br_forward.c | 3 +--
net/bridge/br_input.c | 3 +--
net/bridge/br_netfilter.c | 48 +++++++++++++++++++++++++++++++++++++++++++++
net/bridge/br_private.h | 9 +++++++++
net/bridge/br_sysfs_br.c | 23 ++++++++++++++++++++++
5 files changed, 82 insertions(+), 4 deletions(-)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index f96933a..2931d2a 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -98,8 +98,7 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
skb->dev = to->dev;
skb_forward_csum(skb);
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
- br_forward_finish);
+ br_nf_do_forward(skb, indev);
}
/* called with rcu_read_lock */
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index e2aa7be..0edd1d6 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -299,8 +299,7 @@ forward:
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
- br_handle_frame_finish);
+ br_nf_do_prerouting(skb, p);
break;
default:
drop:
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index c190d22..b29c14f 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -50,6 +50,7 @@
#ifdef CONFIG_SYSCTL
static struct ctl_table_header *brnf_sysctl_header;
+static int brnf_call_netfilter __read_mostly = 1;
static int brnf_call_iptables __read_mostly = 1;
static int brnf_call_ip6tables __read_mostly = 1;
static int brnf_call_arptables __read_mostly = 1;
@@ -57,6 +58,7 @@ static int brnf_filter_vlan_tagged __read_mostly = 0;
static int brnf_filter_pppoe_tagged __read_mostly = 0;
static int brnf_pass_vlan_indev __read_mostly = 0;
#else
+#define brnf_call_netfilter 1
#define brnf_call_iptables 1
#define brnf_call_ip6tables 1
#define brnf_call_arptables 1
@@ -523,6 +525,25 @@ bad:
}
+int br_nf_do_prerouting(struct sk_buff *skb, struct net_bridge_port *p)
+{
+ struct net_bridge *br;
+
+ br = p->br;
+
+ if (!brnf_call_netfilter && !br->call_nf) {
+ __u32 len = nf_bridge_encap_header_len(skb);
+
+ if (likely(pskb_may_pull(skb, len)))
+ return br_handle_frame_finish(skb);
+ kfree(skb);
+ return -EPERM;
+ }
+
+ return NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
+ br_handle_frame_finish);
+}
+
/* Replicate the checks that IPv6 does on packet reception and pass the packet
* to ip6tables, which doesn't support NAT, so things are fairly simple. */
static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
@@ -667,6 +688,26 @@ static int br_nf_forward_finish(struct sk_buff *skb)
return 0;
}
+int br_nf_do_forward(struct sk_buff *skb, struct net_device *indev)
+{
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+
+ p = br_port_get_rcu(indev);
+ if (!p) {
+ kfree(skb);
+ return NF_DROP;
+ }
+ br = p->br;
+ if (!brnf_call_netfilter && !br->call_nf) {
+ skb_push(skb, ETH_HLEN);
+ br_drop_fake_rtable(skb);
+ return dev_queue_xmit(skb);
+ } else {
+ return NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev,
+ skb->dev, br_forward_finish);
+ }
+}
/* This is the 'purely bridged' case. For IP, we pass the packet to
* netfilter with indev and outdev set to the bridge device,
@@ -929,6 +970,13 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
static struct ctl_table brnf_table[] = {
{
+ .procname = "bridge-call-netfilter",
+ .data = &brnf_call_netfilter,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
.procname = "bridge-nf-call-arptables",
.data = &brnf_call_arptables,
.maxlen = sizeof(int),
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index aea3d13..6b0aad9 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -215,6 +215,7 @@ struct net_bridge
struct hlist_head hash[BR_HASH_SIZE];
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
struct rtable fake_rtable;
+ bool call_nf;
bool nf_call_iptables;
bool nf_call_ip6tables;
bool nf_call_arptables;
@@ -763,10 +764,18 @@ static inline int br_vlan_enabled(struct net_bridge *br)
int br_nf_core_init(void);
void br_nf_core_fini(void);
void br_netfilter_rtable_init(struct net_bridge *);
+int br_nf_do_prerouting(struct sk_buff *skb, struct net_bridge_port *p);
+int br_nf_do_forward(struct sk_buff *skb, struct net_device *indev);
#else
static inline int br_nf_core_init(void) { return 0; }
static inline void br_nf_core_fini(void) {}
#define br_netfilter_rtable_init(x)
+#define br_nf_do_prerouting(skb, p) \
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, \
+ br_handle_frame_finish)
+#define br_nf_do_forward(skb, indev) \
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev, \
+ br_forward_finish)
#endif
/* br_stp.c */
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 4c97fc5..707b994 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -630,6 +630,28 @@ static ssize_t multicast_startup_query_interval_store(
static DEVICE_ATTR_RW(multicast_startup_query_interval);
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+static ssize_t call_nf_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->call_nf);
+}
+
+static int set_call_nf(struct net_bridge *br, unsigned long val)
+{
+ br->call_nf = val ? true : false;
+ return 0;
+}
+
+static ssize_t call_nf_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_call_nf);
+}
+static DEVICE_ATTR(call_nf, S_IRUGO | S_IWUSR,
+ call_nf_show, call_nf_store);
+
static ssize_t nf_call_iptables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
@@ -780,6 +802,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_multicast_startup_query_interval.attr,
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ &dev_attr_call_nf.attr,
&dev_attr_nf_call_iptables.attr,
&dev_attr_nf_call_ip6tables.attr,
&dev_attr_nf_call_arptables.attr,
--
1.7.9.5
From: Imre Palik <[email protected]>
Date: Tue, 10 Feb 2015 10:32:24 +0100
> From: "Palik, Imre" <[email protected]>
>
> The netfilter code is made with flexibility instead of performance in mind.
> So when all we want is to pass packets between different interfaces, the
> performance penalty of hitting netfilter code can be considerable, even when
> all the firewalling is disabled for the bridge.
>
> This change makes it possible to disable netfilter both on a per bridge basis,
> or for the whole bridging subsystem. In the case interesting to us, this can
> lead to more than 10% speedup compared to the case when only bridge-iptables
> are disabled.
>
> Cc: Anthony Liguori <[email protected]>
> Signed-off-by: Imre Palik <[email protected]>
Sorry, no.
If I apply this, someone is going to try to submit a patch for every
damn protocol layer to add a stupid hack like this.
Makw NF_HOOK() faster instead.
On 02/11/15 23:29, David Miller wrote:
> From: Imre Palik <[email protected]>
> Date: Tue, 10 Feb 2015 10:32:24 +0100
>
>> From: "Palik, Imre" <[email protected]>
>>
>> The netfilter code is made with flexibility instead of performance in mind.
>> So when all we want is to pass packets between different interfaces, the
>> performance penalty of hitting netfilter code can be considerable, even when
>> all the firewalling is disabled for the bridge.
>>
>> This change makes it possible to disable netfilter both on a per bridge basis,
>> or for the whole bridging subsystem. In the case interesting to us, this can
>> lead to more than 10% speedup compared to the case when only bridge-iptables
>> are disabled.
>>
>> Cc: Anthony Liguori <[email protected]>
>> Signed-off-by: Imre Palik <[email protected]>
>
> Sorry, no.
>
> If I apply this, someone is going to try to submit a patch for every
> damn protocol layer to add a stupid hack like this.
Actually this is one of those patches. There is already a "stupid hack like this" for iptables and arptables. (Implemented before git history, and giving me 10% speedup. Many thanks, whoever did it.)
I also searched various LKML archives, and it seems the existing "stupid hacks" for iptables and arptables haven't resulted in any related patch submission in the last ten years. (Or my google-fu is weak.)
Moreover, I cannot imagine any other reasonable on/off switch for bridge-netfilter than these three. Of course, my imagination might be lacking there.
> Makw NF_HOOK() faster instead.
As far as I see, flexibility implies trashing the cache/TLB. So it is impossible to have flexibility and performance at the same time.
(Except possibly with self modifying code, but that has its own set of problems.)
Imre Palik <[email protected]> wrote:
> On 02/11/15 23:29, David Miller wrote:
> > If I apply this, someone is going to try to submit a patch for every
> > damn protocol layer to add a stupid hack like this.
>
> Actually this is one of those patches. There is already a "stupid hack like this" for iptables and arptables. (Implemented before git history, and giving me 10% speedup. Many thanks, whoever did it.)
>
> I also searched various LKML archives, and it seems the existing "stupid hacks" for iptables and arptables haven't resulted in any related patch submission in the last ten years. (Or my google-fu is weak.)
>
> Moreover, I cannot imagine any other reasonable on/off switch for bridge-netfilter than these three. Of course, my imagination might be lacking there.
Why do you load the bridge netfilter module if you don't want it?
Loading it registers the internal hooks for the call-ip(6)tables and
sabotage hooks with NF_BRIDGE protocol so most of the NF_HOOK(NF_BRIDGE, ...
calls become active.
On 02/13/15 17:37, Florian Westphal wrote:
> Imre Palik <[email protected]> wrote:
>> On 02/11/15 23:29, David Miller wrote:
>>> If I apply this, someone is going to try to submit a patch for every
>>> damn protocol layer to add a stupid hack like this.
>>
>> Actually this is one of those patches. There is already a "stupid hack like this" for iptables and arptables. (Implemented before git history, and giving me 10% speedup. Many thanks, whoever did it.)
>>
>> I also searched various LKML archives, and it seems the existing "stupid hacks" for iptables and arptables haven't resulted in any related patch submission in the last ten years. (Or my google-fu is weak.)
>>
>> Moreover, I cannot imagine any other reasonable on/off switch for bridge-netfilter than these three. Of course, my imagination might be lacking there.
>
> Why do you load the bridge netfilter module if you don't want it?
> Loading it registers the internal hooks for the call-ip(6)tables and
> sabotage hooks with NF_BRIDGE protocol so most of the NF_HOOK(NF_BRIDGE, ...
> calls become active.
>
The trouble is that there are some bridges (with low traffic) where I need netfilter, and some other bridges (carrying lots of traffic), where I don't. Being able to set things up on a per bridge basis is a powerful thing.
I only implemented the global switch because the iptables and arptables support also have one. If this is what bugs people here, I can remove it, and resubmit.
Imre Palik <[email protected]> wrote:
> The trouble is that there are some bridges (with low traffic) where I need netfilter, and some other bridges (carrying lots of traffic), where I don't. Being able to set things up on a per bridge basis is a powerful thing.
>
> I only implemented the global switch because the iptables and arptables support also have one. If this is what bugs people here, I can remove it, and resubmit.
I see. But I agree with David, accepting such patch would pave way
for all kinds of ugly hacks.
It seems that technically the best solution would be to allow attaching
filter rules to devices, but alas, netfilter doesn't support that.
Alternatively, you patch *might* be ok iff you can get rid of the extra
userspace-visible configuration knobs, we already have way too many of
these.
You'll also have to figure out how to avoid any run-time dependency on
br_netfilter module from the bridge core.
If you can do this, you might be able to get similar effect as your patch
by replacing
NF_HOOK with NF_HOOK_COND(..., !(br->flags & NO_NETFILTER))
or something like this.
I don't know how invasive this would be, though.
On 02/13/15 20:03, Florian Westphal wrote:
> Imre Palik <[email protected]> wrote:
>> The trouble is that there are some bridges (with low traffic) where I need netfilter, and some other bridges (carrying lots of traffic), where I don't. Being able to set things up on a per bridge basis is a powerful thing.
>>
>> I only implemented the global switch because the iptables and arptables support also have one. If this is what bugs people here, I can remove it, and resubmit.
>
> I see. But I agree with David, accepting such patch would pave way
> for all kinds of ugly hacks.
>
> It seems that technically the best solution would be to allow attaching
> filter rules to devices, but alas, netfilter doesn't support that.
>
> Alternatively, you patch *might* be ok iff you can get rid of the extra
> userspace-visible configuration knobs, we already have way too many of
> these.
The sysctl can be removed. But I need some means to switch it off for a given bridge, so I kept the sysfs interface.
If there is a more preferred way to do it, then please let me know.
> You'll also have to figure out how to avoid any run-time dependency on
> br_netfilter module from the bridge core.
>
> If you can do this, you might be able to get similar effect as your patch
> by replacing
>
> NF_HOOK with NF_HOOK_COND(..., !(br->flags & NO_NETFILTER))
>
> or something like this.
This works nicely for the NFPROTO_BRIDGE, NF_BR_PRE_ROUTING case. Thanks for the idea.
But for the NFPROTO_BRIDGE, NF_BR_FORWARD case the resulting code would be more ugly,
because of the chaining of the entries.
> I don't know how invasive this would be, though.
I will post the cleaned up version in a sec.
It looks way better. I hope it will be enough ...