Add the SWITCHDEV_ATTR_ID_PORT_PROMISCUITY switchdev notification type,
used to indicate whenever the dev promiscuity counter is changed.
The notification doesn't use any switchdev_attr attribute because in the
notifier callbacks is it possible to get the dev and read directly
the promiscuity value.
Signed-off-by: Horatiu Vultur <[email protected]>
---
include/net/switchdev.h | 1 +
net/core/dev.c | 9 +++++++++
2 files changed, 10 insertions(+)
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index aee86a1..14b1617 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -40,6 +40,7 @@ enum switchdev_attr_id {
SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
+ SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
};
struct switchdev_attr {
diff --git a/net/core/dev.c b/net/core/dev.c
index 49589ed..40c74f2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -142,6 +142,7 @@
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
+#include <net/switchdev.h>
#include "net-sysfs.h"
@@ -7377,6 +7378,11 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
+ struct switchdev_attr attr = {
+ .orig_dev = dev,
+ .id = SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
+ .flags = SWITCHDEV_F_DEFER,
+ };
kuid_t uid;
kgid_t gid;
@@ -7419,6 +7425,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
}
if (notify)
__dev_notify_flags(dev, old_flags, IFF_PROMISC);
+
+ switchdev_port_attr_set(dev, &attr);
+
return 0;
}
--
2.7.4
Thu, Aug 29, 2019 at 11:22:28AM CEST, [email protected] wrote:
>Add the SWITCHDEV_ATTR_ID_PORT_PROMISCUITY switchdev notification type,
>used to indicate whenever the dev promiscuity counter is changed.
>
>The notification doesn't use any switchdev_attr attribute because in the
>notifier callbacks is it possible to get the dev and read directly
>the promiscuity value.
>
>Signed-off-by: Horatiu Vultur <[email protected]>
>---
> include/net/switchdev.h | 1 +
> net/core/dev.c | 9 +++++++++
> 2 files changed, 10 insertions(+)
>
>diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>index aee86a1..14b1617 100644
>--- a/include/net/switchdev.h
>+++ b/include/net/switchdev.h
>@@ -40,6 +40,7 @@ enum switchdev_attr_id {
> SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
> SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
> SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
>+ SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
> };
>
> struct switchdev_attr {
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 49589ed..40c74f2 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -142,6 +142,7 @@
> #include <linux/net_namespace.h>
> #include <linux/indirect_call_wrapper.h>
> #include <net/devlink.h>
>+#include <net/switchdev.h>
>
> #include "net-sysfs.h"
>
>@@ -7377,6 +7378,11 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
> static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
> {
> unsigned int old_flags = dev->flags;
>+ struct switchdev_attr attr = {
>+ .orig_dev = dev,
>+ .id = SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
>+ .flags = SWITCHDEV_F_DEFER,
NACK
This is invalid usecase for switchdev infra. Switchdev is there for
bridge offload purposes only.
For promiscuity changes, the infrastructure is already present in the
code. See __dev_notify_flags(). it calls:
call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info)
and you can actually see the changed flag in ".flags_changed".
You just have to register netdev notifier block in your driver. Grep
for: register_netdevice_notifier
>+ };
> kuid_t uid;
> kgid_t gid;
>
>@@ -7419,6 +7425,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
> }
> if (notify)
> __dev_notify_flags(dev, old_flags, IFF_PROMISC);
>+
>+ switchdev_port_attr_set(dev, &attr);
>+
> return 0;
> }
>
>--
>2.7.4
>
The 08/29/2019 11:51, Jiri Pirko wrote:
> External E-Mail
>
>
> Thu, Aug 29, 2019 at 11:22:28AM CEST, [email protected] wrote:
> >Add the SWITCHDEV_ATTR_ID_PORT_PROMISCUITY switchdev notification type,
> >used to indicate whenever the dev promiscuity counter is changed.
> >
> >The notification doesn't use any switchdev_attr attribute because in the
> >notifier callbacks is it possible to get the dev and read directly
> >the promiscuity value.
> >
> >Signed-off-by: Horatiu Vultur <[email protected]>
> >---
> > include/net/switchdev.h | 1 +
> > net/core/dev.c | 9 +++++++++
> > 2 files changed, 10 insertions(+)
> >
> >diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> >index aee86a1..14b1617 100644
> >--- a/include/net/switchdev.h
> >+++ b/include/net/switchdev.h
> >@@ -40,6 +40,7 @@ enum switchdev_attr_id {
> > SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
> > SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
> > SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
> >+ SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
> > };
> >
> > struct switchdev_attr {
> >diff --git a/net/core/dev.c b/net/core/dev.c
> >index 49589ed..40c74f2 100644
> >--- a/net/core/dev.c
> >+++ b/net/core/dev.c
> >@@ -142,6 +142,7 @@
> > #include <linux/net_namespace.h>
> > #include <linux/indirect_call_wrapper.h>
> > #include <net/devlink.h>
> >+#include <net/switchdev.h>
> >
> > #include "net-sysfs.h"
> >
> >@@ -7377,6 +7378,11 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
> > static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
> > {
> > unsigned int old_flags = dev->flags;
> >+ struct switchdev_attr attr = {
> >+ .orig_dev = dev,
> >+ .id = SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
> >+ .flags = SWITCHDEV_F_DEFER,
>
Hi Jiri,
> NACK
>
> This is invalid usecase for switchdev infra. Switchdev is there for
> bridge offload purposes only.
>
> For promiscuity changes, the infrastructure is already present in the
> code. See __dev_notify_flags(). it calls:
> call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info)
> and you can actually see the changed flag in ".flags_changed".
Yes, you are right. But in case the port is part of a bridge and then
enable promisc mode by a user space application(tpcdump) then the drivers
will not be notified. The reason is that the dev->flags will still be the
same(because IFF_PROMISC was already set) and only promiscuity will be
changed.
One fix could be to call __dev_notify_flags() no matter when the
promisc is enable or disabled.
>
> You just have to register netdev notifier block in your driver. Grep
> for: register_netdevice_notifier
>
>
> >+ };
> > kuid_t uid;
> > kgid_t gid;
> >
> >@@ -7419,6 +7425,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
> > }
> > if (notify)
> > __dev_notify_flags(dev, old_flags, IFF_PROMISC);
> >+
> >+ switchdev_port_attr_set(dev, &attr);
> >+
> > return 0;
> > }
> >
> >--
> >2.7.4
> >
>
--
/Horatiu
Thu, Aug 29, 2019 at 12:56:54PM CEST, [email protected] wrote:
>The 08/29/2019 11:51, Jiri Pirko wrote:
>> External E-Mail
>>
>>
>> Thu, Aug 29, 2019 at 11:22:28AM CEST, [email protected] wrote:
>> >Add the SWITCHDEV_ATTR_ID_PORT_PROMISCUITY switchdev notification type,
>> >used to indicate whenever the dev promiscuity counter is changed.
>> >
>> >The notification doesn't use any switchdev_attr attribute because in the
>> >notifier callbacks is it possible to get the dev and read directly
>> >the promiscuity value.
>> >
>> >Signed-off-by: Horatiu Vultur <[email protected]>
>> >---
>> > include/net/switchdev.h | 1 +
>> > net/core/dev.c | 9 +++++++++
>> > 2 files changed, 10 insertions(+)
>> >
>> >diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>> >index aee86a1..14b1617 100644
>> >--- a/include/net/switchdev.h
>> >+++ b/include/net/switchdev.h
>> >@@ -40,6 +40,7 @@ enum switchdev_attr_id {
>> > SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
>> > SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
>> > SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
>> >+ SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
>> > };
>> >
>> > struct switchdev_attr {
>> >diff --git a/net/core/dev.c b/net/core/dev.c
>> >index 49589ed..40c74f2 100644
>> >--- a/net/core/dev.c
>> >+++ b/net/core/dev.c
>> >@@ -142,6 +142,7 @@
>> > #include <linux/net_namespace.h>
>> > #include <linux/indirect_call_wrapper.h>
>> > #include <net/devlink.h>
>> >+#include <net/switchdev.h>
>> >
>> > #include "net-sysfs.h"
>> >
>> >@@ -7377,6 +7378,11 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
>> > static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
>> > {
>> > unsigned int old_flags = dev->flags;
>> >+ struct switchdev_attr attr = {
>> >+ .orig_dev = dev,
>> >+ .id = SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
>> >+ .flags = SWITCHDEV_F_DEFER,
>>
>
>Hi Jiri,
>
>> NACK
>>
>> This is invalid usecase for switchdev infra. Switchdev is there for
>> bridge offload purposes only.
>>
>> For promiscuity changes, the infrastructure is already present in the
>> code. See __dev_notify_flags(). it calls:
>> call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info)
>> and you can actually see the changed flag in ".flags_changed".
>Yes, you are right. But in case the port is part of a bridge and then
>enable promisc mode by a user space application(tpcdump) then the drivers
If the promisc is on, it is on. Why do you need to know who enabled it?
Or do you want to use this to ask driver to ask hw to trap packets to
kernel? If yes, I don't think it is correct. If you want to "steal" some
packets from the hw forwarding datapath, use TC action "trap".
>will not be notified. The reason is that the dev->flags will still be the
>same(because IFF_PROMISC was already set) and only promiscuity will be
>changed.
>
>One fix could be to call __dev_notify_flags() no matter when the
>promisc is enable or disabled.
>
>>
>> You just have to register netdev notifier block in your driver. Grep
>> for: register_netdevice_notifier
>>
>>
>> >+ };
>> > kuid_t uid;
>> > kgid_t gid;
>> >
>> >@@ -7419,6 +7425,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
>> > }
>> > if (notify)
>> > __dev_notify_flags(dev, old_flags, IFF_PROMISC);
>> >+
>> >+ switchdev_port_attr_set(dev, &attr);
>> >+
>> > return 0;
>> > }
>> >
>> >--
>> >2.7.4
>> >
>>
>
>--
>/Horatiu
The 08/29/2019 14:18, Jiri Pirko wrote:
> External E-Mail
>
>
> Thu, Aug 29, 2019 at 12:56:54PM CEST, [email protected] wrote:
> >The 08/29/2019 11:51, Jiri Pirko wrote:
> >> External E-Mail
> >>
> >>
> >> Thu, Aug 29, 2019 at 11:22:28AM CEST, [email protected] wrote:
> >> >Add the SWITCHDEV_ATTR_ID_PORT_PROMISCUITY switchdev notification type,
> >> >used to indicate whenever the dev promiscuity counter is changed.
> >> >
> >> >The notification doesn't use any switchdev_attr attribute because in the
> >> >notifier callbacks is it possible to get the dev and read directly
> >> >the promiscuity value.
> >> >
> >> >Signed-off-by: Horatiu Vultur <[email protected]>
> >> >---
> >> > include/net/switchdev.h | 1 +
> >> > net/core/dev.c | 9 +++++++++
> >> > 2 files changed, 10 insertions(+)
> >> >
> >> >diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> >> >index aee86a1..14b1617 100644
> >> >--- a/include/net/switchdev.h
> >> >+++ b/include/net/switchdev.h
> >> >@@ -40,6 +40,7 @@ enum switchdev_attr_id {
> >> > SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
> >> > SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
> >> > SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
> >> >+ SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
> >> > };
> >> >
> >> > struct switchdev_attr {
> >> >diff --git a/net/core/dev.c b/net/core/dev.c
> >> >index 49589ed..40c74f2 100644
> >> >--- a/net/core/dev.c
> >> >+++ b/net/core/dev.c
> >> >@@ -142,6 +142,7 @@
> >> > #include <linux/net_namespace.h>
> >> > #include <linux/indirect_call_wrapper.h>
> >> > #include <net/devlink.h>
> >> >+#include <net/switchdev.h>
> >> >
> >> > #include "net-sysfs.h"
> >> >
> >> >@@ -7377,6 +7378,11 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
> >> > static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
> >> > {
> >> > unsigned int old_flags = dev->flags;
> >> >+ struct switchdev_attr attr = {
> >> >+ .orig_dev = dev,
> >> >+ .id = SWITCHDEV_ATTR_ID_PORT_PROMISCUITY,
> >> >+ .flags = SWITCHDEV_F_DEFER,
> >>
> >
> >Hi Jiri,
> >
> >> NACK
> >>
> >> This is invalid usecase for switchdev infra. Switchdev is there for
> >> bridge offload purposes only.
> >>
> >> For promiscuity changes, the infrastructure is already present in the
> >> code. See __dev_notify_flags(). it calls:
> >> call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info)
> >> and you can actually see the changed flag in ".flags_changed".
> >Yes, you are right. But in case the port is part of a bridge and then
> >enable promisc mode by a user space application(tpcdump) then the drivers
>
> If the promisc is on, it is on. Why do you need to know who enabled it?
When a port is added to a bridge, then the port gets in promisc mode.
But in our case the HW has bridge capabilities so it is not required to
set the port in promisc mode.
But if someone else requires the port to be in promisc mode (tcpdump or
any other application) then I would like to set the port in promisc
mode.
In previous emails Andrew came with the suggestion to look at
dev->promiscuity and check if the port is a bridge port. Using this
information I could see when to add the port in promisc mode. He also
suggested to add a new switchdev call(maybe I missunderstood him, or I
have done it at the wrong place) in case there are no callbacks in the
driver to get this information.
>
> Or do you want to use this to ask driver to ask hw to trap packets to
> kernel? If yes, I don't think it is correct. If you want to "steal" some
> packets from the hw forwarding datapath, use TC action "trap".
No, I just wanted to know when to update the HW to set the port in
promisc mode.
>
>
> >will not be notified. The reason is that the dev->flags will still be the
> >same(because IFF_PROMISC was already set) and only promiscuity will be
> >changed.
> >
> >One fix could be to call __dev_notify_flags() no matter when the
> >promisc is enable or disabled.
> >
> >>
> >> You just have to register netdev notifier block in your driver. Grep
> >> for: register_netdevice_notifier
> >>
> >>
> >> >+ };
> >> > kuid_t uid;
> >> > kgid_t gid;
> >> >
> >> >@@ -7419,6 +7425,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
> >> > }
> >> > if (notify)
> >> > __dev_notify_flags(dev, old_flags, IFF_PROMISC);
> >> >+
> >> >+ switchdev_port_attr_set(dev, &attr);
> >> >+
> >> > return 0;
> >> > }
> >> >
> >> >--
> >> >2.7.4
> >> >
> >>
> >
> >--
> >/Horatiu
>
--
/Horatiu
On Thu, 29 Aug 2019 14:44:14 +0200
Horatiu Vultur <[email protected]> wrote:
> When a port is added to a bridge, then the port gets in promisc mode.
> But in our case the HW has bridge capabilities so it is not required
> to set the port in promisc mode.
> But if someone else requires the port to be in promisc mode (tcpdump
> or any other application) then I would like to set the port in promisc
> mode.
> In previous emails Andrew came with the suggestion to look at
> dev->promiscuity and check if the port is a bridge port. Using this
> information I could see when to add the port in promisc mode. He also
> suggested to add a new switchdev call(maybe I missunderstood him, or I
> have done it at the wrong place) in case there are no callbacks in the
> driver to get this information.
I would use the 1st suggestion.
for/in your driver:
if (dev->promiscuity > 0) {
if (dev->promiscuity == 1 && netif_is_bridge_port(dev)) {
/* no need to set promisc mode because promiscuity
* was requested by bridge
*/
...
} else {
/* need to set promisc mode as someone else requested
* promiscuity
*/
}
}
Thanks,
Ivan
The 08/29/2019 14:55, Ivan Vecera wrote:
> External E-Mail
>
>
> On Thu, 29 Aug 2019 14:44:14 +0200
> Horatiu Vultur <[email protected]> wrote:
>
> > When a port is added to a bridge, then the port gets in promisc mode.
> > But in our case the HW has bridge capabilities so it is not required
> > to set the port in promisc mode.
> > But if someone else requires the port to be in promisc mode (tcpdump
> > or any other application) then I would like to set the port in promisc
> > mode.
> > In previous emails Andrew came with the suggestion to look at
> > dev->promiscuity and check if the port is a bridge port. Using this
> > information I could see when to add the port in promisc mode. He also
> > suggested to add a new switchdev call(maybe I missunderstood him, or I
> > have done it at the wrong place) in case there are no callbacks in the
> > driver to get this information.
>
Hi Ivan,
> I would use the 1st suggestion.
>
> for/in your driver:
> if (dev->promiscuity > 0) {
> if (dev->promiscuity == 1 && netif_is_bridge_port(dev)) {
> /* no need to set promisc mode because promiscuity
> * was requested by bridge
> */
> ...
> } else {
> /* need to set promisc mode as someone else requested
> * promiscuity
> */
> }
> }
Yes that was my first try. It worked for many cases but there was one
which it didn't work.
If first add the port to the bridge, then you get callbacks and
notifications in the driver where you can use the code that you
suggested. But if next you enable promisc mode on the port (ip link set
dev eth0 promisc on), then the driver will not get any callbacks or
notifications. Therefore it could not see the new value of
dev->promiscuity to update the HW.
The code that updates the promisc mode it would not notify in case
the promiscuity is changed and because the port has already the flag
IFF_PROMISC it would not notify any listeners because the flags are not
changed.
Here [1] is the code that I refer to.
[1] https://elixir.bootlin.com/linux/latest/source/net/core/dev.c#L7592
>
> Thanks,
> Ivan
>
--
/Horatiu
On Thu, Aug 29, 2019 at 02:55:18PM +0200, Ivan Vecera wrote:
> On Thu, 29 Aug 2019 14:44:14 +0200
> Horatiu Vultur <[email protected]> wrote:
>
> > When a port is added to a bridge, then the port gets in promisc mode.
> > But in our case the HW has bridge capabilities so it is not required
> > to set the port in promisc mode.
> > But if someone else requires the port to be in promisc mode (tcpdump
> > or any other application) then I would like to set the port in promisc
> > mode.
> > In previous emails Andrew came with the suggestion to look at
> > dev->promiscuity and check if the port is a bridge port. Using this
> > information I could see when to add the port in promisc mode. He also
> > suggested to add a new switchdev call(maybe I missunderstood him, or I
> > have done it at the wrong place) in case there are no callbacks in the
> > driver to get this information.
>
> I would use the 1st suggestion.
>
> for/in your driver:
> if (dev->promiscuity > 0) {
> if (dev->promiscuity == 1 && netif_is_bridge_port(dev)) {
> /* no need to set promisc mode because promiscuity
> * was requested by bridge
> */
> ...
> } else {
> /* need to set promisc mode as someone else requested
> * promiscuity
> */
> }
> }
Hi Ivan
The problem with this is, the driver only gets called when promisc
goes from 0 to !0. So, the port is added to the bridge. Promisc goes
0->1, and the driver gets called. We can evaluate as you said above,
and leave the port filtering frames, not forwarding everything. When
tcpdump is started, the core does promisc 1->2, but does not call into
the driver. Also, currently sending a notification is not
unconditional.
What this patch adds is an unconditional call into the switchdev
driver so it can evaluate the condition above, with every change to
the promisc counter.
Andrew
> NACK
>
> This is invalid usecase for switchdev infra. Switchdev is there for
> bridge offload purposes only.
Hi Jiri
I would argue this is for bridge offload. In another email, you say
promisc is promisc. Does that mean the Mellonox hardware forwards
every frame ingressing a port to the CPU by default as soon as it is
enslaved to a bridge and promisc mode turned on? Or course not. At the
moment, every switchdev driver wrongly implement promisc mode.
This patchset is about correctly implementing promisc mode, so that
applications can use it as expected. And that means configuring the
hardware bridge to also forward a copy of frames to the CPU.
I see trap as a different use case. tcpdump/pcap is not going to use
traps.
Andrew
On Thu, 29 Aug 2019 15:15:43 +0200
Andrew Lunn <[email protected]> wrote:
> The problem with this is, the driver only gets called when promisc
> goes from 0 to !0. So, the port is added to the bridge. Promisc goes
> 0->1, and the driver gets called. We can evaluate as you said above,
> and leave the port filtering frames, not forwarding everything. When
> tcpdump is started, the core does promisc 1->2, but does not call into
> the driver. Also, currently sending a notification is not
> unconditional.
Hi Andrew,
got it. What about to change the existing notify call so NETDEV_CHANGE
notification will be also sent when (old_promiscuity !=
new_promiscuity)?
Ivan
Thu, Aug 29, 2019 at 03:26:11PM CEST, [email protected] wrote:
>> NACK
>>
>> This is invalid usecase for switchdev infra. Switchdev is there for
>> bridge offload purposes only.
>
>Hi Jiri
>
>I would argue this is for bridge offload. In another email, you say
>promisc is promisc. Does that mean the Mellonox hardware forwards
>every frame ingressing a port to the CPU by default as soon as it is
>enslaved to a bridge and promisc mode turned on? Or course not. At the
>moment, every switchdev driver wrongly implement promisc mode.
>
>This patchset is about correctly implementing promisc mode, so that
>applications can use it as expected. And that means configuring the
>hardware bridge to also forward a copy of frames to the CPU.
Wait, I believe there has been some misundestanding. Promisc mode is NOT
about getting packets to the cpu. It's about setting hw filters in a way
that no rx packet is dropped. For normal nics it means that all packets
get to the cpu, but that is just because it is the only direction they
can make.
If you want to get packets from the hw forwarding dataplane to cpu, you
should not use promisc mode for that. That would be incorrect.
If you want to get packets from the hw forwarding dataplane to cpu, you
should use tc trap action. It is there exactly for this purpose.
Promisc is for setting rx filters.
>
>I see trap as a different use case. tcpdump/pcap is not going to use
>traps.
>
> Andrew
> Wait, I believe there has been some misundestanding. Promisc mode is NOT
> about getting packets to the cpu. It's about setting hw filters in a way
> that no rx packet is dropped.
>
> If you want to get packets from the hw forwarding dataplane to cpu, you
> should not use promisc mode for that. That would be incorrect.
Hi Jiri
I'm not sure a wireshark/tcpdump/pcap user would agree with you. They
want to see packets on an interface, so they use these tools. The fact
that the interface is a switch interface should not matter. The
switchdev model is that we try to hide away the interface happens to
be on a switch, you can just use it as normal. So why should promisc
mode not work as normal?
> If you want to get packets from the hw forwarding dataplane to cpu, you
> should use tc trap action. It is there exactly for this purpose.
Do you really think a wireshark/tcpdump/pcap user should need to use
tc trap for the special case the interface is a switch port? Doesn't that
break the switchdev model?
tc trap is more about fine grained selection of packets. Also, it
seems like trapped packets are not forwarded, which is not what you
would expect from wireshark/tcpdump/pcap.
Andrew
On Thu, Aug 29, 2019 at 04:37:32PM +0200, Andrew Lunn wrote:
> > Wait, I believe there has been some misundestanding. Promisc mode is NOT
> > about getting packets to the cpu. It's about setting hw filters in a way
> > that no rx packet is dropped.
> >
> > If you want to get packets from the hw forwarding dataplane to cpu, you
> > should not use promisc mode for that. That would be incorrect.
>
> Hi Jiri
>
> I'm not sure a wireshark/tcpdump/pcap user would agree with you. They
> want to see packets on an interface, so they use these tools. The fact
> that the interface is a switch interface should not matter. The
> switchdev model is that we try to hide away the interface happens to
> be on a switch, you can just use it as normal. So why should promisc
> mode not work as normal?
Hi Andrew,
What happens when you run tcpdump on a routed interface without putting
it in promiscuous mode ('-p')? If it is a pure software switch, then you
see all unicast packets addressed to your interface's MAC address. What
happens when the same is done on a hardware switch? With the proposed
solution you will not get the same result.
On a software switch, when you run tcpdump without '-p', do you incur
major packet loss? No. Will this happen when you punt several Tbps to
your CPU on the hardware switch? Yes.
Extending the definition of promiscuous mode to mean punt all traffic to
the CPU is wrong, IMO. You will not be able to capture all the packets
anyway. If you have both elephant and mice flows, then it is very likely
you will not be able to see any packets from the mice flows. The way
this kind of monitoring is usually done is by either sampling packets
(see tc-sample) or mirroring it to capable server. Both options are
available in Linux today.
> > If you want to get packets from the hw forwarding dataplane to cpu, you
> > should use tc trap action. It is there exactly for this purpose.
>
> Do you really think a wireshark/tcpdump/pcap user should need to use
> tc trap for the special case the interface is a switch port? Doesn't that
> break the switchdev model?
I do not object to the overall goal, but I believe to implementation is
wrong. Instead, it would be much better to extend tshark/tcpdump and
with another flag that will instruct libpcap to install a rule that will
trap all traffic to the CPU. You can do that on either ingress or egress
using matchall and trap action.
If you want to do it without specifying a special flag (I think it's
very dangerous due to the potential packet loss), you can add a flag to
the interface that will indicate to libpcap that installing a tc filter
with trap action is required.
> tc trap is more about fine grained selection of packets.
Depends on the filter you associate with the action. If it's matchall,
then it's not fine grained at all :)
> Also, it seems like trapped packets are not forwarded, which is not
> what you would expect from wireshark/tcpdump/pcap.
How do you mean? Not forwarded by the HW? Right. But the trapped packets
are forwarded by the kernel. We can also add another action that means
both trap and forward. In mlxsw terminology it's called mirror.
> Hi Andrew,
>
> What happens when you run tcpdump on a routed interface without putting
> it in promiscuous mode ('-p')? If it is a pure software switch, then you
> see all unicast packets addressed to your interface's MAC address. What
> happens when the same is done on a hardware switch? With the proposed
> solution you will not get the same result.
>
> On a software switch, when you run tcpdump without '-p', do you incur
> major packet loss? No. Will this happen when you punt several Tbps to
> your CPU on the hardware switch? Yes.
Hi Ido
Please think about the general case, not your hardware. A DSA switch
generally has 1G ports. And the connection to the host is generally
1G, maybe 2.5G. So if i put one interface into promisc mode, i will
probably receive the majority of the traffic on that port, so long as
there is not too much traffic from other ports towards the CPU.
I also don't expect any major packet loss in the switch. It is still
hardware switching, but also sending a copy to the CPU. That copy will
have the offload_fwd_mark bit set, so the bridge will discard the
frame. The switch egress queue towards the CPU might overflow, but
that means tcpdump does not get to see all the frames, and some
traffic which is actually heading to the CPU is lost. But that can
happen anyway.
We should also think about the different classes of users. Somebody
using a TOR switch with a NOS is very different to a user of a SOHO
switch in their WiFi access point. The first probably knows tc very
well, the second has probably never heard of it, and just wants
tcpdump to work like on their desktop.
Andrew
On Thu, Aug 29, 2019 at 08:29:57PM +0200, Andrew Lunn wrote:
> > Hi Andrew,
> >
> > What happens when you run tcpdump on a routed interface without putting
> > it in promiscuous mode ('-p')? If it is a pure software switch, then you
> > see all unicast packets addressed to your interface's MAC address. What
> > happens when the same is done on a hardware switch? With the proposed
> > solution you will not get the same result.
> >
> > On a software switch, when you run tcpdump without '-p', do you incur
> > major packet loss? No. Will this happen when you punt several Tbps to
> > your CPU on the hardware switch? Yes.
>
> Hi Ido
>
> Please think about the general case, not your hardware. A DSA switch
> generally has 1G ports. And the connection to the host is generally
> 1G, maybe 2.5G. So if i put one interface into promisc mode, i will
> probably receive the majority of the traffic on that port, so long as
> there is not too much traffic from other ports towards the CPU.
>
> I also don't expect any major packet loss in the switch. It is still
> hardware switching, but also sending a copy to the CPU. That copy will
> have the offload_fwd_mark bit set, so the bridge will discard the
> frame. The switch egress queue towards the CPU might overflow, but
> that means tcpdump does not get to see all the frames, and some
> traffic which is actually heading to the CPU is lost. But that can
> happen anyway.
The potential packet loss was only one example why using promiscuous
mode as an indication to punt all traffic to the CPU is wrong. I also
mentioned that you will not capture any traffic (besides
control/exception) when '-p' is specified.
> We should also think about the different classes of users. Somebody
> using a TOR switch with a NOS is very different to a user of a SOHO
> switch in their WiFi access point. The first probably knows tc very
> well, the second has probably never heard of it, and just wants
> tcpdump to work like on their desktop.
I fully agree that we should make it easy for users to capture offloaded
traffic, which is why I suggested patching libpcap. Add a flag to
capable netdevs that tells libpcap that in order to capture all the
traffic from this interface it needs to add a tc filter with a trap
action. That way zero familiarity with tc is required from users.
I really believe that instead of interpreting IFF_PROMISC in exotic ways
and pushing all this logic into the kernel, we should instead teach user
space utilities to capture offloaded traffic.
From: Ido Schimmel <[email protected]>
Date: Thu, 29 Aug 2019 20:57:59 +0300
> On a software switch, when you run tcpdump without '-p', do you incur
> major packet loss? No. Will this happen when you punt several Tbps to
> your CPU on the hardware switch? Yes.
>
> Extending the definition of promiscuous mode to mean punt all traffic to
> the CPU is wrong, IMO. You will not be able to capture all the packets
This is so illogical, it is mind boggling.
How different is this to using tcpdump/wireshark on a 100GB or 1TB
network interface?
There is no difference.
Please stop portraying switches as special in this regard, they are
not.
From: Andrew Lunn <[email protected]>
Date: Thu, 29 Aug 2019 20:29:57 +0200
> We should also think about the different classes of users. Somebody
> using a TOR switch with a NOS is very different to a user of a SOHO
> switch in their WiFi access point. The first probably knows tc very
> well, the second has probably never heard of it, and just wants
> tcpdump to work like on their desktop.
+1
From: Ido Schimmel <[email protected]>
Date: Thu, 29 Aug 2019 22:36:13 +0300
> I fully agree that we should make it easy for users to capture offloaded
> traffic, which is why I suggested patching libpcap. Add a flag to
> capable netdevs that tells libpcap that in order to capture all the
> traffic from this interface it needs to add a tc filter with a trap
> action. That way zero familiarity with tc is required from users.
Why not just make setting promisc mode on the device do this rather than
require all of this tc indirection nonsense?
That's the whole point of this conversation I thought?
Fri, Aug 30, 2019 at 12:12:01AM CEST, [email protected] wrote:
>From: Ido Schimmel <[email protected]>
>Date: Thu, 29 Aug 2019 22:36:13 +0300
>
>> I fully agree that we should make it easy for users to capture offloaded
>> traffic, which is why I suggested patching libpcap. Add a flag to
>> capable netdevs that tells libpcap that in order to capture all the
>> traffic from this interface it needs to add a tc filter with a trap
>> action. That way zero familiarity with tc is required from users.
>
>Why not just make setting promisc mode on the device do this rather than
>require all of this tc indirection nonsense?
Because the "promisc mode" would gain another meaning. Now how the
driver should guess which meaning the user ment when he setted it?
filter or trap?
That is very confusing. If the flag is the way to do this, let's
introduce another flag, like IFF_TRAPPING indicating that user wants
exactly this.
>
>That's the whole point of this conversation I thought?
From: Jiri Pirko <[email protected]>
Date: Fri, 30 Aug 2019 07:39:40 +0200
> Because the "promisc mode" would gain another meaning. Now how the
> driver should guess which meaning the user ment when he setted it?
> filter or trap?
>
> That is very confusing. If the flag is the way to do this, let's
> introduce another flag, like IFF_TRAPPING indicating that user wants
> exactly this.
I don't understand how the meaning of promiscuous mode for a
networking device has suddenly become ambiguous, when did this start
happening?
Thu, Aug 29, 2019 at 04:37:32PM CEST, [email protected] wrote:
>> Wait, I believe there has been some misundestanding. Promisc mode is NOT
>> about getting packets to the cpu. It's about setting hw filters in a way
>> that no rx packet is dropped.
>>
>> If you want to get packets from the hw forwarding dataplane to cpu, you
>> should not use promisc mode for that. That would be incorrect.
>
>Hi Jiri
>
>I'm not sure a wireshark/tcpdump/pcap user would agree with you. They
>want to see packets on an interface, so they use these tools. The fact
>that the interface is a switch interface should not matter. The
>switchdev model is that we try to hide away the interface happens to
>be on a switch, you can just use it as normal. So why should promisc
>mode not work as normal?
It does, disables the rx filter. Why do you think it means the same
thing as "trap all to cpu"? Hw datapath was never considered by
wireshark.
In fact, I have usecase where I need to see only slow-path traffic by
wireshark, not all packets going through hw. So apparently, there is a
need of another wireshark option and perhaps another flag
IFF_HW_TRAPPING?.
tcpdump -i eth0
tcpdump -i eth0 --no-promiscuous-mode
tcpdump -i eth0 --hw-trapping-mode
>
>> If you want to get packets from the hw forwarding dataplane to cpu, you
>> should use tc trap action. It is there exactly for this purpose.
>
>Do you really think a wireshark/tcpdump/pcap user should need to use
>tc trap for the special case the interface is a switch port? Doesn't that
>break the switchdev model?
>
>tc trap is more about fine grained selection of packets. Also, it
>seems like trapped packets are not forwarded, which is not what you
>would expect from wireshark/tcpdump/pcap.
>
> Andrew
From: Jiri Pirko <[email protected]>
Date: Fri, 30 Aug 2019 08:13:27 +0200
> In fact, I have usecase where I need to see only slow-path traffic by
> wireshark, not all packets going through hw.
This could be an attribute in the descriptor entries for the packets
provided to userspace, and BPF filters could act on these as well.
Switch network devices aren't special, promiscuous mode means all
packets on the wire that you are attached.
This talk about special handling of "trapping" is a strawman.
Fri, Aug 30, 2019 at 08:02:33AM CEST, [email protected] wrote:
>From: Jiri Pirko <[email protected]>
>Date: Fri, 30 Aug 2019 07:39:40 +0200
>
>> Because the "promisc mode" would gain another meaning. Now how the
>> driver should guess which meaning the user ment when he setted it?
>> filter or trap?
>>
>> That is very confusing. If the flag is the way to do this, let's
>> introduce another flag, like IFF_TRAPPING indicating that user wants
>> exactly this.
>
>I don't understand how the meaning of promiscuous mode for a
>networking device has suddenly become ambiguous, when did this start
>happening?
The promiscuity is a way to setup the rx filter. So promics == rx filter
off. For normal nics, where there is no hw fwd datapath,
this coincidentally means all received packets go to cpu.
But if there is hw fwd datapath, rx filter is still off, all rxed packets
are processed. But that does not mean they should be trapped to cpu.
Simple example:
I need to see slowpath packets, for example arps/stp/bgp/... that
are going to cpu, I do:
tcpdump -i swp1
I don't want to get all the traffic running over hw running this cmd.
This is a valid usecase.
To cope with hw fwd datapath devices, I believe that tcpdump has to have
notion of that. Something like:
tcpdump -i swp1 --hw-trapping-mode
The logic can be inverse:
tcpdump -i swp1
tcpdump -i swp1 --no-hw-trapping-mode
However, that would provide inconsistent behaviour between existing and
patched tcpdump/kernel.
All I'm trying to say, there are 2 flags
needed (if we don't use tc trap).
On Fri, 30 Aug 2019 08:36:24 +0200
Jiri Pirko <[email protected]> wrote:
> Fri, Aug 30, 2019 at 08:02:33AM CEST, [email protected] wrote:
> >From: Jiri Pirko <[email protected]>
> >Date: Fri, 30 Aug 2019 07:39:40 +0200
> >
> >> Because the "promisc mode" would gain another meaning. Now how the
> >> driver should guess which meaning the user ment when he setted it?
> >> filter or trap?
> >>
> >> That is very confusing. If the flag is the way to do this, let's
> >> introduce another flag, like IFF_TRAPPING indicating that user
> >> wants exactly this.
> >
> >I don't understand how the meaning of promiscuous mode for a
> >networking device has suddenly become ambiguous, when did this start
> >happening?
>
> The promiscuity is a way to setup the rx filter. So promics == rx
> filter off. For normal nics, where there is no hw fwd datapath,
> this coincidentally means all received packets go to cpu.
> But if there is hw fwd datapath, rx filter is still off, all rxed
> packets are processed. But that does not mean they should be trapped
> to cpu.
+1
Promisc is Rx filtering option and should not imply that offloaded
traffic is trapped to CPU.
> Simple example:
> I need to see slowpath packets, for example arps/stp/bgp/... that
> are going to cpu, I do:
> tcpdump -i swp1
>
> I don't want to get all the traffic running over hw running this cmd.
> This is a valid usecase.
>
> To cope with hw fwd datapath devices, I believe that tcpdump has to
> have notion of that. Something like:
>
> tcpdump -i swp1 --hw-trapping-mode
>
> The logic can be inverse:
> tcpdump -i swp1
> tcpdump -i swp1 --no-hw-trapping-mode
>
> However, that would provide inconsistent behaviour between existing
> and patched tcpdump/kernel.
>
> All I'm trying to say, there are 2 flags
> needed (if we don't use tc trap).
From: Jiri Pirko <[email protected]>
Date: Fri, 30 Aug 2019 08:36:24 +0200
> The promiscuity is a way to setup the rx filter. So promics == rx filter
> off. For normal nics, where there is no hw fwd datapath,
> this coincidentally means all received packets go to cpu.
You cannot convince me that the HW datapath isn't a "rx filter" too, sorry.
From: Ivan Vecera <[email protected]>
Date: Fri, 30 Aug 2019 08:54:45 +0200
> Promisc is Rx filtering option and should not imply that offloaded
> traffic is trapped to CPU.
Hardware is offloaded based upon an rx filter.
Stop this nonsense.
Fri, Aug 30, 2019 at 09:12:23AM CEST, [email protected] wrote:
>From: Jiri Pirko <[email protected]>
>Date: Fri, 30 Aug 2019 08:36:24 +0200
>
>> The promiscuity is a way to setup the rx filter. So promics == rx filter
>> off. For normal nics, where there is no hw fwd datapath,
>> this coincidentally means all received packets go to cpu.
>
>You cannot convince me that the HW datapath isn't a "rx filter" too, sorry.
If you look at it that way, then we have 2: rx_filter and hw_rx_filter.
The point is, those 2 are not one item, that is the point I'm trying to
make :/
On Fri, 30 Aug 2019 08:13:27 +0200
Jiri Pirko <[email protected]> wrote:
> Thu, Aug 29, 2019 at 04:37:32PM CEST, [email protected] wrote:
> >> Wait, I believe there has been some misundestanding. Promisc mode
> >> is NOT about getting packets to the cpu. It's about setting hw
> >> filters in a way that no rx packet is dropped.
> >>
> >> If you want to get packets from the hw forwarding dataplane to
> >> cpu, you should not use promisc mode for that. That would be
> >> incorrect.
> >
> >Hi Jiri
> >
> >I'm not sure a wireshark/tcpdump/pcap user would agree with you. They
> >want to see packets on an interface, so they use these tools. The
> >fact that the interface is a switch interface should not matter. The
> >switchdev model is that we try to hide away the interface happens to
> >be on a switch, you can just use it as normal. So why should promisc
> >mode not work as normal?
>
> It does, disables the rx filter. Why do you think it means the same
> thing as "trap all to cpu"? Hw datapath was never considered by
> wireshark.
>
> In fact, I have usecase where I need to see only slow-path traffic by
> wireshark, not all packets going through hw. So apparently, there is a
> need of another wireshark option and perhaps another flag
> IFF_HW_TRAPPING?.
Agree with Jiri but understand both perspectives. We can treat
IFF_PROMISC as:
1) "I want to _SEE_ all Rx traffic on specified interface"
that means for switchdev driver that it has to trap all traffic to CPU
implicitly. And in this case we need another flag that will say "I
don't want to see offloaded traffic".
2) "I want to ensure that nothing is dropped on specified interface" so
IFF_PROMISC is treated as filtering option only. To see offloaded
traffic you need to setup TC rule with trap action or another flag like
IFF_TRAPPING.
IMO IFF_PROMISC should be considered to be a filtering option and
should not imply trapping of offloaded traffic.
Thanks,
Ivan
From: Jiri Pirko <[email protected]>
Date: Fri, 30 Aug 2019 09:21:33 +0200
> Fri, Aug 30, 2019 at 09:12:23AM CEST, [email protected] wrote:
>>From: Jiri Pirko <[email protected]>
>>Date: Fri, 30 Aug 2019 08:36:24 +0200
>>
>>> The promiscuity is a way to setup the rx filter. So promics == rx filter
>>> off. For normal nics, where there is no hw fwd datapath,
>>> this coincidentally means all received packets go to cpu.
>>
>>You cannot convince me that the HW datapath isn't a "rx filter" too, sorry.
>
> If you look at it that way, then we have 2: rx_filter and hw_rx_filter.
> The point is, those 2 are not one item, that is the point I'm trying to
> make :/
And you can turn both of them off when I ask for promiscuous mode, that's
a detail of the device not a semantic issue.
Fri, Aug 30, 2019 at 09:32:25AM CEST, [email protected] wrote:
>From: Jiri Pirko <[email protected]>
>Date: Fri, 30 Aug 2019 09:21:33 +0200
>
>> Fri, Aug 30, 2019 at 09:12:23AM CEST, [email protected] wrote:
>>>From: Jiri Pirko <[email protected]>
>>>Date: Fri, 30 Aug 2019 08:36:24 +0200
>>>
>>>> The promiscuity is a way to setup the rx filter. So promics == rx filter
>>>> off. For normal nics, where there is no hw fwd datapath,
>>>> this coincidentally means all received packets go to cpu.
>>>
>>>You cannot convince me that the HW datapath isn't a "rx filter" too, sorry.
>>
>> If you look at it that way, then we have 2: rx_filter and hw_rx_filter.
>> The point is, those 2 are not one item, that is the point I'm trying to
>> make :/
>
>And you can turn both of them off when I ask for promiscuous mode, that's
>a detail of the device not a semantic issue.
Well, bridge asks for promiscuous mode during enslave -> hw_rx_filter off
When you, want to see all traffic in tcpdump -> rx_filter off
So basically there are 2 flavours of promiscuous mode we have to somehow
distinguish between, so the driver knows what to do.
Nothe that the hw_rx_filter off is not something special to bridge.
There is a usecase for this when no bridge is there, only TC filters for
example.
On Thu, Aug 29, 2019 at 03:12:01PM -0700, David Miller wrote:
> From: Ido Schimmel <[email protected]>
> Date: Thu, 29 Aug 2019 22:36:13 +0300
>
> > I fully agree that we should make it easy for users to capture offloaded
> > traffic, which is why I suggested patching libpcap. Add a flag to
> > capable netdevs that tells libpcap that in order to capture all the
> > traffic from this interface it needs to add a tc filter with a trap
> > action. That way zero familiarity with tc is required from users.
>
> Why not just make setting promisc mode on the device do this rather than
> require all of this tc indirection nonsense?
>
> That's the whole point of this conversation I thought?
As I understand it, the goal of this series is to be able to capture
offloaded traffic.
Currently, the only indication that drivers get when someone is running
tcpdump/tshark/whatever over an interface is that it's is put in promisc
mode. So this patches use it as an indication to trap all the traffic to
the CPU and special case the bridge in order to prevent the driver from
trapping packets when its ports are bridged. The same will have to be
done for OVS and in any other (current or future) cases where promisc
mode is needed to disable Rx filtering.
If there was another indication that these applications are running over
an interface, would we be bothering with this new interpretation of
promisc mode and special casing? I don't think so.
We can instead teach libpcap that in order to capture offloaded traffic
it should use this "tc indirection nonsense". Or turn on a new knob.
This avoids the need to change each driver with this new special case
logic.
Also, what happens when I'm running these application without putting
the interface in promisc mode? On an offloaded interface I would not be
able to even capture packets addressed to my interface's MAC address.
What happens when the interface is already in promisc mode (because of
the bridge) and I'm again running tcpdump with '-p'? I will not be able
to capture offloaded traffic despite the fact that the interface is
already configured in promisc mode.
> Also, what happens when I'm running these application without putting
> the interface in promisc mode? On an offloaded interface I would not be
> able to even capture packets addressed to my interface's MAC address.
Sorry for rejoining the discussion late. I've been travelling and i'm
now 3/4 of the way to Lisbon.
That statement i don't get. If the frame has the MAC address of the
interface, it has to be delivered to the CPU. And so pcap will see it
when running on the interface. I can pretty much guarantee every DSA
driver does that.
But to address the bigger picture. My understanding is that we want to
model offloading as a mechanism to accelerate what Linux can already
do. The user should not have to care about these accelerators. The
interface should work like a normal Linux interface. I can put an IP
address on it and ping a peer. I can run a dhcp client and get an IP
address from a dhcp server. I can add the interface to a bridge, and
packets will get bridged. I as a user should not need to care if this
is done in software, or accelerated by offloading it. I can add a
route, and if the accelerate knows about L3, it can accelerate that as
well. If not, the kernel will route it.
So if i run wireshark on an interface, i expect the interface will be
put into promisc mode and i see all packets ingressing the interface.
What the accelerator needs to do to achieve this, i as a user don't
care.
I can follow the argument that i won't necessarily see every
packet. But that is always true. For many embedded systems, the CPU is
too slow to receive at line rate, even when we are talking about 1G
links. Packets do get dropped. And i hope tcpdump users understand
that.
For me, having tcpdump use tc trap is just wrong. It breaks the model
that the user should not care about the accelerator. If anything, i
think the driver needs to translate cBPF which pcap passes to the
kernel to whatever internal format the accelerator can process. That
is just another example of using hardware acceleration.
Andrew
On Sat, Aug 31, 2019 at 09:35:56PM +0200, Andrew Lunn wrote:
> > Also, what happens when I'm running these application without putting
> > the interface in promisc mode? On an offloaded interface I would not be
> > able to even capture packets addressed to my interface's MAC address.
>
> Sorry for rejoining the discussion late. I've been travelling and i'm
> now 3/4 of the way to Lisbon.
Hi Andrew,
Have fun!
> That statement i don't get.
What about the other statements?
> If the frame has the MAC address of the interface, it has to be
> delivered to the CPU.
So every packet that needs to be routed should be delivered to the CPU?
Definitely not.
> And so pcap will see it when running on the interface. I can pretty
> much guarantee every DSA driver does that.
I assume because you currently only consider L2 forwarding.
> But to address the bigger picture. My understanding is that we want to
> model offloading as a mechanism to accelerate what Linux can already
> do. The user should not have to care about these accelerators. The
> interface should work like a normal Linux interface. I can put an IP
> address on it and ping a peer. I can run a dhcp client and get an IP
> address from a dhcp server. I can add the interface to a bridge, and
> packets will get bridged. I as a user should not need to care if this
> is done in software, or accelerated by offloading it. I can add a
> route, and if the accelerate knows about L3, it can accelerate that as
> well. If not, the kernel will route it.
Yep, and this is how it's all working today.
> So if i run wireshark on an interface, i expect the interface will be
> put into promisc mode and i see all packets ingressing the interface.
> What the accelerator needs to do to achieve this, i as a user don't
> care.
>
> I can follow the argument that i won't necessarily see every
> packet. But that is always true. For many embedded systems, the CPU is
> too slow to receive at line rate, even when we are talking about 1G
> links. Packets do get dropped. And i hope tcpdump users understand
> that.
>
> For me, having tcpdump use tc trap is just wrong. It breaks the model
> that the user should not care about the accelerator. If anything, i
> think the driver needs to translate cBPF which pcap passes to the
> kernel to whatever internal format the accelerator can process. That
> is just another example of using hardware acceleration.
Look, this again boils down to what promisc mode means with regards to
hardware offload. You want it to mean punt all traffic to the CPU? Fine.
Does not seem like anyone will be switching sides anyway, so lets move
forward. But the current approach is not good. Each driver needs to have
this special case logic and the semantics of promisc mode change not
only with regards to the value of the promisc counter, but also with
regards to the interface's uppers. This is highly fragile and confusing.
The approach taken in v2 makes much more sense. Add a new flag to
accelerators and have the networking stack check it before putting the
interface in promisc mode. Then the only thing drivers need to do is to
instruct the accelerator to trap all traffic to the CPU.
Sat, Aug 31, 2019 at 09:35:56PM CEST, [email protected] wrote:
>> Also, what happens when I'm running these application without putting
>> the interface in promisc mode? On an offloaded interface I would not be
>> able to even capture packets addressed to my interface's MAC address.
>
>Sorry for rejoining the discussion late. I've been travelling and i'm
>now 3/4 of the way to Lisbon.
>
>That statement i don't get. If the frame has the MAC address of the
>interface, it has to be delivered to the CPU. And so pcap will see it
1) you cannot go to slowpath for all such packets (route)
2) you might be interested to examine packets with other dest mac too.
>when running on the interface. I can pretty much guarantee every DSA
>driver does that.
>
>But to address the bigger picture. My understanding is that we want to
>model offloading as a mechanism to accelerate what Linux can already
>do. The user should not have to care about these accelerators. The
>interface should work like a normal Linux interface. I can put an IP
>address on it and ping a peer. I can run a dhcp client and get an IP
>address from a dhcp server. I can add the interface to a bridge, and
>packets will get bridged. I as a user should not need to care if this
>is done in software, or accelerated by offloading it. I can add a
>route, and if the accelerate knows about L3, it can accelerate that as
>well. If not, the kernel will route it.
>
>So if i run wireshark on an interface, i expect the interface will be
>put into promisc mode and i see all packets ingressing the interface.
Again, you are merging 2 things together:
1) rx filter - this is needed for bridge, ovs, others (tc)
this is promisc setting
2) cpu trap - here one may be interested in:
a) only packets ASIC traps to CPU by default (ARPs, STP, BGP, etc)
b) all packets ingressing the port (note that those are only those
passed by rx filter)
Clearly 1) and 2) need separate knobs. In 2), there are valid usecases
for both a) and b). Only the user is the one who can tell which is he
interested in. This can't happen automagically.
Can we just have a knob for 2)?
>What the accelerator needs to do to achieve this, i as a user don't
>care.
>
>I can follow the argument that i won't necessarily see every
>packet. But that is always true. For many embedded systems, the CPU is
>too slow to receive at line rate, even when we are talking about 1G
>links. Packets do get dropped. And i hope tcpdump users understand
>that.
>
>For me, having tcpdump use tc trap is just wrong. It breaks the model
>that the user should not care about the accelerator. If anything, i
>think the driver needs to translate cBPF which pcap passes to the
>kernel to whatever internal format the accelerator can process. That
>is just another example of using hardware acceleration.
>
> Andrew
On Sat, Aug 31, 2019 at 11:47:05PM +0300, Ido Schimmel wrote:
> On Sat, Aug 31, 2019 at 09:35:56PM +0200, Andrew Lunn wrote:
> > > Also, what happens when I'm running these application without putting
> > > the interface in promisc mode? On an offloaded interface I would not be
> > > able to even capture packets addressed to my interface's MAC address.
> >
> > Sorry for rejoining the discussion late. I've been travelling and i'm
> > now 3/4 of the way to Lisbon.
>
> Hi Andrew,
>
> Have fun!
>
> > That statement i don't get.
>
> What about the other statements?
>
> > If the frame has the MAC address of the interface, it has to be
> > delivered to the CPU.
>
> So every packet that needs to be routed should be delivered to the CPU?
> Definitely not.
>
> > And so pcap will see it when running on the interface. I can pretty
> > much guarantee every DSA driver does that.
>
> I assume because you currently only consider L2 forwarding.
Yes, that is what i missed. The vast majority of switches which Linux
supports are L2. All the switches i deal with are L2. So i did not
think about L3. My bad.
> > But to address the bigger picture. My understanding is that we want to
> > model offloading as a mechanism to accelerate what Linux can already
> > do. The user should not have to care about these accelerators. The
> > interface should work like a normal Linux interface. I can put an IP
> > address on it and ping a peer. I can run a dhcp client and get an IP
> > address from a dhcp server. I can add the interface to a bridge, and
> > packets will get bridged. I as a user should not need to care if this
> > is done in software, or accelerated by offloading it. I can add a
> > route, and if the accelerate knows about L3, it can accelerate that as
> > well. If not, the kernel will route it.
>
> Yep, and this is how it's all working today.
So for a L3 switch, frames which match the MAC address, and one of the
many global scope IP addresses on any interface, get delivered to the
CPU, when the accelerator is L3 capable. If the IP address does not
match, it gets routed in hardware, if there is an appropriate router,
otherwise it get passed to the CPU, so the CPU can route it out an
interface which is not part of the switch.
> Look, this again boils down to what promisc mode means with regards to
> hardware offload. You want it to mean punt all traffic to the CPU? Fine.
> Does not seem like anyone will be switching sides anyway, so lets move
> forward. But the current approach is not good. Each driver needs to have
> this special case logic and the semantics of promisc mode change not
> only with regards to the value of the promisc counter, but also with
> regards to the interface's uppers. This is highly fragile and confusing.
Yes, i agree. We want one function, in the core, which handles all the
different uppers. Maybe 2, if we need to consider L2 and L3 switches
differently.
Andrew
Hi Jiri,
Sorry for joining the discussion this late, but I have been without mail access
for the last few days.
The 08/30/2019 08:36, Jiri Pirko wrote:
> Fri, Aug 30, 2019 at 08:02:33AM CEST, [email protected] wrote:
> >From: Jiri Pirko <[email protected]>
> >Date: Fri, 30 Aug 2019 07:39:40 +0200
> >
> >> Because the "promisc mode" would gain another meaning. Now how the
> >> driver should guess which meaning the user ment when he setted it?
> >> filter or trap?
> >>
> >> That is very confusing. If the flag is the way to do this, let's
> >> introduce another flag, like IFF_TRAPPING indicating that user wants
> >> exactly this.
> >
> >I don't understand how the meaning of promiscuous mode for a
> >networking device has suddenly become ambiguous, when did this start
> >happening?
>
> The promiscuity is a way to setup the rx filter. So promics == rx filter
> off. For normal nics, where there is no hw fwd datapath,
> this coincidentally means all received packets go to cpu.
> But if there is hw fwd datapath, rx filter is still off, all rxed packets
> are processed. But that does not mean they should be trapped to cpu.
>
> Simple example:
> I need to see slowpath packets, for example arps/stp/bgp/... that
> are going to cpu, I do:
> tcpdump -i swp1
How is this different from "tcpdump -p -i swp1"
> I don't want to get all the traffic running over hw running this cmd.
> This is a valid usecase.
>
> To cope with hw fwd datapath devices, I believe that tcpdump has to have
> notion of that. Something like:
>
> tcpdump -i swp1 --hw-trapping-mode
>
> The logic can be inverse:
> tcpdump -i swp1
> tcpdump -i swp1 --no-hw-trapping-mode
>
> However, that would provide inconsistent behaviour between existing and
> patched tcpdump/kernel.
>
> All I'm trying to say, there are 2 flags
> needed (if we don't use tc trap).
I have been reading through this thread several times and I still do not get it.
As far as I understand you are arguing that we need 3 modes:
- tcpdump -i swp1
- tcpdump -p -i swp1
- tcpdump -i swp1 --hw-trapping-mode
Would you mind provide an example of the traffic you want to see in the 3 cases
(or the traffic which you do not want to see).
/Allan
Mon, Sep 02, 2019 at 07:42:31PM CEST, [email protected] wrote:
>Hi Jiri,
>
>Sorry for joining the discussion this late, but I have been without mail access
>for the last few days.
>
>
>The 08/30/2019 08:36, Jiri Pirko wrote:
>> Fri, Aug 30, 2019 at 08:02:33AM CEST, [email protected] wrote:
>> >From: Jiri Pirko <[email protected]>
>> >Date: Fri, 30 Aug 2019 07:39:40 +0200
>> >
>> >> Because the "promisc mode" would gain another meaning. Now how the
>> >> driver should guess which meaning the user ment when he setted it?
>> >> filter or trap?
>> >>
>> >> That is very confusing. If the flag is the way to do this, let's
>> >> introduce another flag, like IFF_TRAPPING indicating that user wants
>> >> exactly this.
>> >
>> >I don't understand how the meaning of promiscuous mode for a
>> >networking device has suddenly become ambiguous, when did this start
>> >happening?
>>
>> The promiscuity is a way to setup the rx filter. So promics == rx filter
>> off. For normal nics, where there is no hw fwd datapath,
>> this coincidentally means all received packets go to cpu.
>> But if there is hw fwd datapath, rx filter is still off, all rxed packets
>> are processed. But that does not mean they should be trapped to cpu.
>>
>> Simple example:
>> I need to see slowpath packets, for example arps/stp/bgp/... that
>> are going to cpu, I do:
>> tcpdump -i swp1
>
>How is this different from "tcpdump -p -i swp1"
>
>> I don't want to get all the traffic running over hw running this cmd.
>> This is a valid usecase.
>>
>> To cope with hw fwd datapath devices, I believe that tcpdump has to have
>> notion of that. Something like:
>>
>> tcpdump -i swp1 --hw-trapping-mode
>>
>> The logic can be inverse:
>> tcpdump -i swp1
>> tcpdump -i swp1 --no-hw-trapping-mode
>>
>> However, that would provide inconsistent behaviour between existing and
>> patched tcpdump/kernel.
>>
>> All I'm trying to say, there are 2 flags
>> needed (if we don't use tc trap).
>
>I have been reading through this thread several times and I still do not get it.
>
>As far as I understand you are arguing that we need 3 modes:
>
>- tcpdump -i swp1
Depends on default. Promisc is on.
>- tcpdump -p -i swp1
All traffic that is trapped to the cpu by default, not promisc means
only mac of the interface (if bridge for example haven't set promisc
already) and special macs. So host traffic (ip of host), bgp, arp, nsnd,
etc.
>- tcpdump -i swp1 --hw-trapping-mode
Promisc is on, all traffic received on the port and pushed to cpu. User
has to be careful because in case of mlxsw this can lead to couple
hundred gigabit traffic going over limited pci bandwidth (gigabits).
>
>Would you mind provide an example of the traffic you want to see in the 3 cases
>(or the traffic which you do not want to see).
>
>/Allan
>
The 09/01/2019 20:48, Andrew Lunn wrote:
> > Look, this again boils down to what promisc mode means with regards to
> > hardware offload. You want it to mean punt all traffic to the CPU? Fine.
> > Does not seem like anyone will be switching sides anyway, so lets move
> > forward. But the current approach is not good. Each driver needs to have
> > this special case logic and the semantics of promisc mode change not
> > only with regards to the value of the promisc counter, but also with
> > regards to the interface's uppers. This is highly fragile and confusing.
> Yes, i agree. We want one function, in the core, which handles all the
> different uppers. Maybe 2, if we need to consider L2 and L3 switches
> differently.
Are you suggesting that we continue the path in v3, but add a utility function
to determinate if the interface needs to go into promisc mode (taking the
bridge stats, the promisc counter, and the upper devices into account).
Or are you suggest that we like Ido go back to v2?
/Allan
The 09/02/2019 19:51, Jiri Pirko wrote:
> External E-Mail
>
>
> Mon, Sep 02, 2019 at 07:42:31PM CEST, [email protected] wrote:
> >Hi Jiri,
> >
> >Sorry for joining the discussion this late, but I have been without mail access
> >for the last few days.
> >
> >
> >The 08/30/2019 08:36, Jiri Pirko wrote:
> >> Fri, Aug 30, 2019 at 08:02:33AM CEST, [email protected] wrote:
> >> >From: Jiri Pirko <[email protected]>
> >> >Date: Fri, 30 Aug 2019 07:39:40 +0200
> >> >
> >> >> Because the "promisc mode" would gain another meaning. Now how the
> >> >> driver should guess which meaning the user ment when he setted it?
> >> >> filter or trap?
> >> >>
> >> >> That is very confusing. If the flag is the way to do this, let's
> >> >> introduce another flag, like IFF_TRAPPING indicating that user wants
> >> >> exactly this.
> >> >
> >> >I don't understand how the meaning of promiscuous mode for a
> >> >networking device has suddenly become ambiguous, when did this start
> >> >happening?
> >>
> >> The promiscuity is a way to setup the rx filter. So promics == rx filter
> >> off. For normal nics, where there is no hw fwd datapath,
> >> this coincidentally means all received packets go to cpu.
> >> But if there is hw fwd datapath, rx filter is still off, all rxed packets
> >> are processed. But that does not mean they should be trapped to cpu.
> >>
> >> Simple example:
> >> I need to see slowpath packets, for example arps/stp/bgp/... that
> >> are going to cpu, I do:
> >> tcpdump -i swp1
> >
> >How is this different from "tcpdump -p -i swp1"
> >
> >> I don't want to get all the traffic running over hw running this cmd.
> >> This is a valid usecase.
> >>
> >> To cope with hw fwd datapath devices, I believe that tcpdump has to have
> >> notion of that. Something like:
> >>
> >> tcpdump -i swp1 --hw-trapping-mode
> >>
> >> The logic can be inverse:
> >> tcpdump -i swp1
> >> tcpdump -i swp1 --no-hw-trapping-mode
> >>
> >> However, that would provide inconsistent behaviour between existing and
> >> patched tcpdump/kernel.
> >>
> >> All I'm trying to say, there are 2 flags
> >> needed (if we don't use tc trap).
> >
> >I have been reading through this thread several times and I still do not get it.
> >
> >As far as I understand you are arguing that we need 3 modes:
> >
> >- tcpdump -i swp1
>
> Depends on default. Promisc is on.
>
>
> >- tcpdump -p -i swp1
>
> All traffic that is trapped to the cpu by default, not promisc means
> only mac of the interface (if bridge for example haven't set promisc
> already) and special macs. So host traffic (ip of host), bgp, arp, nsnd,
> etc.
In the case where the interface is enslaved to a bridge, it is put into promisc
mode, which means that "tcpdump -i swp1" and "tcpdump -p -i swp1" give the same
result, right?
Is this desirable?
> >- tcpdump -i swp1 --hw-trapping-mode
>
> Promisc is on, all traffic received on the port and pushed to cpu. User
> has to be careful because in case of mlxsw this can lead to couple
> hundred gigabit traffic going over limited pci bandwidth (gigabits).
>
>
> >
> >Would you mind provide an example of the traffic you want to see in the 3 cases
> >(or the traffic which you do not want to see).
> >
> >/Allan
> >
>
--
/Allan
Mon, Sep 02, 2019 at 08:05:20PM CEST, [email protected] wrote:
>The 09/02/2019 19:51, Jiri Pirko wrote:
>> External E-Mail
>>
>>
>> Mon, Sep 02, 2019 at 07:42:31PM CEST, [email protected] wrote:
>> >Hi Jiri,
>> >
>> >Sorry for joining the discussion this late, but I have been without mail access
>> >for the last few days.
>> >
>> >
>> >The 08/30/2019 08:36, Jiri Pirko wrote:
>> >> Fri, Aug 30, 2019 at 08:02:33AM CEST, [email protected] wrote:
>> >> >From: Jiri Pirko <[email protected]>
>> >> >Date: Fri, 30 Aug 2019 07:39:40 +0200
>> >> >
>> >> >> Because the "promisc mode" would gain another meaning. Now how the
>> >> >> driver should guess which meaning the user ment when he setted it?
>> >> >> filter or trap?
>> >> >>
>> >> >> That is very confusing. If the flag is the way to do this, let's
>> >> >> introduce another flag, like IFF_TRAPPING indicating that user wants
>> >> >> exactly this.
>> >> >
>> >> >I don't understand how the meaning of promiscuous mode for a
>> >> >networking device has suddenly become ambiguous, when did this start
>> >> >happening?
>> >>
>> >> The promiscuity is a way to setup the rx filter. So promics == rx filter
>> >> off. For normal nics, where there is no hw fwd datapath,
>> >> this coincidentally means all received packets go to cpu.
>> >> But if there is hw fwd datapath, rx filter is still off, all rxed packets
>> >> are processed. But that does not mean they should be trapped to cpu.
>> >>
>> >> Simple example:
>> >> I need to see slowpath packets, for example arps/stp/bgp/... that
>> >> are going to cpu, I do:
>> >> tcpdump -i swp1
>> >
>> >How is this different from "tcpdump -p -i swp1"
>> >
>> >> I don't want to get all the traffic running over hw running this cmd.
>> >> This is a valid usecase.
>> >>
>> >> To cope with hw fwd datapath devices, I believe that tcpdump has to have
>> >> notion of that. Something like:
>> >>
>> >> tcpdump -i swp1 --hw-trapping-mode
>> >>
>> >> The logic can be inverse:
>> >> tcpdump -i swp1
>> >> tcpdump -i swp1 --no-hw-trapping-mode
>> >>
>> >> However, that would provide inconsistent behaviour between existing and
>> >> patched tcpdump/kernel.
>> >>
>> >> All I'm trying to say, there are 2 flags
>> >> needed (if we don't use tc trap).
>> >
>> >I have been reading through this thread several times and I still do not get it.
>> >
>> >As far as I understand you are arguing that we need 3 modes:
>> >
>> >- tcpdump -i swp1
>>
>> Depends on default. Promisc is on.
>>
>>
>> >- tcpdump -p -i swp1
>>
>> All traffic that is trapped to the cpu by default, not promisc means
>> only mac of the interface (if bridge for example haven't set promisc
>> already) and special macs. So host traffic (ip of host), bgp, arp, nsnd,
>> etc.
>
>In the case where the interface is enslaved to a bridge, it is put into promisc
>mode, which means that "tcpdump -i swp1" and "tcpdump -p -i swp1" give the same
>result, right?
>
>Is this desirable?
Yes, that is correct and expected. It it might not be bridged, depends
on a usecase.
>
>> >- tcpdump -i swp1 --hw-trapping-mode
>>
>> Promisc is on, all traffic received on the port and pushed to cpu. User
>> has to be careful because in case of mlxsw this can lead to couple
>> hundred gigabit traffic going over limited pci bandwidth (gigabits).
>>
>>
>> >
>> >Would you mind provide an example of the traffic you want to see in the 3 cases
>> >(or the traffic which you do not want to see).
>> >
>> >/Allan
>> >
>>
>
>--
>/Allan
On Mon, Sep 02, 2019 at 07:42:31PM +0200, Allan W. Nielsen wrote:
> I have been reading through this thread several times and I still do not get it.
Allan,
I kept thinking about this and I want to make sure that I correctly
understand the end result.
With these patches applied I assume I will see the following traffic
when running tcpdump on one of the netdevs exposed by the ocelot driver:
- Ingress: All
- Egress: Only locally generated traffic and traffic forwarded by the
kernel from interfaces not belonging to the ocelot driver
The above means I will not see any offloaded traffic transmitted by the
port. Is that correct? I see that the driver is setting
'offload_fwd_mark' for any traffic trapped from bridged ports, which
means the bridge will drop it before it traverses the packet taps on
egress.
Large parts of the discussion revolve around the fact that switch ports
are not any different than other ports. Dave wrote "Please stop
portraying switches as special in this regard" and Andrew wrote "[The
user] just wants tcpdump to work like on their desktop."
But if anything, this discussion proves that switch ports are special in
this regard and that tcpdump will not work like on the desktop.
Beside the fact that I don't agree (but gave up) with the new
interpretation of promisc mode, I wonder if we're not asking for trouble
with this patchset. Users will see all offloaded traffic on ingress, but
none of it on egress. This is in contrast to the sever/desktop, where
Linux is much more dominant in comparison to switches (let alone hw
accelerated ones) and where all the traffic is visible through tcpdump.
I can already see myself having to explain this over and over again to
confused users.
Now, I understand that showing egress traffic is inherently difficult.
It means one of two things:
1. We allow packets to be forwarded by both the software and the
hardware
2. We trap all ingressing traffic from all the ports
Both options can have devastating effects on the network and therefore
should not be triggered by a supposedly innocent invocation of tcpdump.
I again wonder if it would not be wiser to solve this by introducing two
new flags to tcpdump for ingress/egress (similar to -Q in/out) capturing
of offloaded traffic. The capturing of egress offloaded traffic can be
documented with the appropriate warnings.
Anyway, I don't want to hold you up, I merely want to make sure that the
above (assuming it's correct) is considered before the patches are
applied.
Thanks
Hi Ido,
The 09/03/2019 09:13, Ido Schimmel wrote:
> On Mon, Sep 02, 2019 at 07:42:31PM +0200, Allan W. Nielsen wrote:
> > I have been reading through this thread several times and I still do not get it.
> I kept thinking about this and I want to make sure that I correctly
> understand the end result.
So do I, and regardless of this being merged or not, I'm really happy that we
have the discussion as there are clearly a need for synchronizing the meaning of
promiscuity.
One of the reasons why we started working on this patch was the comments we got
when requesting comments on the "[PATCH] net: bridge: Allow bridge to join
multicast groups" where we want to be able to setup mac-entries for L2 multicast
MAC addresses. Here it is important for us to be able to control which ports a
static configured multicast address needs to be forwarded to (including the
CPU). When working on this patch it was pointed out that the interfaces are in
promisc mode anyway (if member of a bridge, and if flooding and learning are
enabled), and it therefore this optimization should not matter anyway.
I found that this was a fair comment, and it has been on the list of things we
wanted to "fix" for a long time.
The optimization does matter to us, as we have until now worked around the issue
by not implementing promisc mode.
When debugging, it is very useful for us to be able to see all the traffic RXed
on the interface, and to us it did make a lot of sense to "just" make promisc
mode work like we was used to.
But, this is for debugging, it is not the most important patch we have on the
backlog, and I would therefore prefer finding a solution which we are all happy
with.
> With these patches applied I assume I will see the following traffic
> when running tcpdump on one of the netdevs exposed by the ocelot driver:
>
> - Ingress: All
> - Egress: Only locally generated traffic and traffic forwarded by the
> kernel from interfaces not belonging to the ocelot driver
>
> The above means I will not see any offloaded traffic transmitted by the
> port. Is that correct?
Correct - but maybe we should change this.
In Ocelot and in LANxxxx (the part we are working on now), we can come pretty
close. We can get the offloaded TX traffic to the CPU, but it will not be
re-written (it will look like the ingress frame, which is not always the same as
the egress frame, vlan tags an others may be re-written).
In some of our chips we can actually do this (not Ocelot, and not the LANxxxx
part we are working on now) after the frame as been re-written.
> I see that the driver is setting 'offload_fwd_mark' for any traffic trapped
> from bridged ports, which means the bridge will drop it before it traverses
> the packet taps on egress.
Correct.
> Large parts of the discussion revolve around the fact that switch ports
> are not any different than other ports. Dave wrote "Please stop
> portraying switches as special in this regard" and Andrew wrote "[The
> user] just wants tcpdump to work like on their desktop."
And we are trying to get as close to this as practical possible, knowing that it
may not be exactly the same.
> But if anything, this discussion proves that switch ports are special in
> this regard and that tcpdump will not work like on the desktop.
I think it can come really close. Some drivers may be able to fix the TX issue
you point out, others may not.
> Beside the fact that I don't agree (but gave up) with the new
> interpretation of promisc mode, I wonder if we're not asking for trouble
> with this patchset. Users will see all offloaded traffic on ingress, but
> none of it on egress. This is in contrast to the sever/desktop, where
> Linux is much more dominant in comparison to switches (let alone hw
> accelerated ones) and where all the traffic is visible through tcpdump.
> I can already see myself having to explain this over and over again to
> confused users.
>
> Now, I understand that showing egress traffic is inherently difficult.
> It means one of two things:
>
> 1. We allow packets to be forwarded by both the software and the
> hardware
> 2. We trap all ingressing traffic from all the ports
If the HW cannot copy the egress traffic to the CPU (which our HW cannot), then
you need to do both. All ingress traffic needs to go to the CPU, you need to
make all the forwarding decisions in the CPU, to figure out what traffic happens
to go to the port you want to monitor.
I really doubt this will work in real life. Too much traffic, and HW may make
different forwarding decision that the SW (tc rules in HW but not in SW), which
means that it will not be good for debugging anyway.
> Both options can have devastating effects on the network and therefore
> should not be triggered by a supposedly innocent invocation of tcpdump.
Agree.
> I again wonder if it would not be wiser to solve this by introducing two
> new flags to tcpdump for ingress/egress (similar to -Q in/out) capturing
> of offloaded traffic. The capturing of egress offloaded traffic can be
> documented with the appropriate warnings.
Not sure I agree, but I will try to spend some more time considering it.
In the mean while, what TC action was it that Jiri suggestion we should use? The
trap action is no good, and it prevents the forwarding in silicon, and I'm not
aware of a "COPY-TO-CPU" action.
> Anyway, I don't want to hold you up, I merely want to make sure that the
> above (assuming it's correct) is considered before the patches are
> applied.
Sounds good, and thanks for all the time spend on reviewing and asking the
critical questions.
/Allan
On Tue, Sep 03, 2019 at 10:14:12AM +0200, Allan W. Nielsen wrote:
> The 09/03/2019 09:13, Ido Schimmel wrote:
> > On Mon, Sep 02, 2019 at 07:42:31PM +0200, Allan W. Nielsen wrote:
> > With these patches applied I assume I will see the following traffic
> > when running tcpdump on one of the netdevs exposed by the ocelot driver:
> >
> > - Ingress: All
> > - Egress: Only locally generated traffic and traffic forwarded by the
> > kernel from interfaces not belonging to the ocelot driver
> >
> > The above means I will not see any offloaded traffic transmitted by the
> > port. Is that correct?
> Correct - but maybe we should change this.
>
> In Ocelot and in LANxxxx (the part we are working on now), we can come pretty
> close. We can get the offloaded TX traffic to the CPU, but it will not be
> re-written (it will look like the ingress frame, which is not always the same as
> the egress frame, vlan tags an others may be re-written).
Yes, this is the same with mlxsw. You can trap the egress frames, but
they will reach the CPU unmodified via the ingress port.
> In some of our chips we can actually do this (not Ocelot, and not the LANxxxx
> part we are working on now) after the frame as been re-written.
Cool.
> > I see that the driver is setting 'offload_fwd_mark' for any traffic trapped
> > from bridged ports, which means the bridge will drop it before it traverses
> > the packet taps on egress.
> Correct.
>
> > Large parts of the discussion revolve around the fact that switch ports
> > are not any different than other ports. Dave wrote "Please stop
> > portraying switches as special in this regard" and Andrew wrote "[The
> > user] just wants tcpdump to work like on their desktop."
> And we are trying to get as close to this as practical possible, knowing that it
> may not be exactly the same.
>
> > But if anything, this discussion proves that switch ports are special in
> > this regard and that tcpdump will not work like on the desktop.
> I think it can come really close. Some drivers may be able to fix the TX issue
> you point out, others may not.
>
> > Beside the fact that I don't agree (but gave up) with the new
> > interpretation of promisc mode, I wonder if we're not asking for trouble
> > with this patchset. Users will see all offloaded traffic on ingress, but
> > none of it on egress. This is in contrast to the sever/desktop, where
> > Linux is much more dominant in comparison to switches (let alone hw
> > accelerated ones) and where all the traffic is visible through tcpdump.
> > I can already see myself having to explain this over and over again to
> > confused users.
> >
> > Now, I understand that showing egress traffic is inherently difficult.
> > It means one of two things:
> >
> > 1. We allow packets to be forwarded by both the software and the
> > hardware
> > 2. We trap all ingressing traffic from all the ports
> If the HW cannot copy the egress traffic to the CPU (which our HW cannot), then
> you need to do both. All ingress traffic needs to go to the CPU, you need to
> make all the forwarding decisions in the CPU, to figure out what traffic happens
> to go to the port you want to monitor.
>
> I really doubt this will work in real life. Too much traffic, and HW may make
> different forwarding decision that the SW (tc rules in HW but not in SW), which
> means that it will not be good for debugging anyway.
I agree.
>
> > Both options can have devastating effects on the network and therefore
> > should not be triggered by a supposedly innocent invocation of tcpdump.
> Agree.
>
> > I again wonder if it would not be wiser to solve this by introducing two
> > new flags to tcpdump for ingress/egress (similar to -Q in/out) capturing
> > of offloaded traffic. The capturing of egress offloaded traffic can be
> > documented with the appropriate warnings.
> Not sure I agree, but I will try to spend some more time considering it.
>
> In the mean while, what TC action was it that Jiri suggestion we should use? The
> trap action is no good, and it prevents the forwarding in silicon, and I'm not
> aware of a "COPY-TO-CPU" action.
I agree. We would either need a new or just extend the existing one with
a new attribute.
> > Anyway, I don't want to hold you up, I merely want to make sure that the
> > above (assuming it's correct) is considered before the patches are
> > applied.
> Sounds good, and thanks for all the time spend on reviewing and asking the
> critical questions.
Thanks for bringing up these issues. I will be happy to review future
patches.