The first patch of this series improves the RX side. As it seems to be
an expensive operation to read the RX timestamp for every frame, then
read it only if it is required. This will give an improvement of ~70mbit
on the RX side.
The second patch stops using the packing library. This improves mostly
the TX side as this library is used to set diffent bits in the IFH. If
this library is replaced with a more simple/shorter implementation,
this gives an improvement of ~100mbit on TX side.
All the measurements were done using iperf3.
Horatiu Vultur (2):
net: lan966x: Don't read RX timestamp if not needed
net: lan966x: Stop using packing library
.../net/ethernet/microchip/lan966x/Kconfig | 1 -
.../ethernet/microchip/lan966x/lan966x_fdma.c | 2 +-
.../ethernet/microchip/lan966x/lan966x_main.c | 77 +++++++++++++------
.../ethernet/microchip/lan966x/lan966x_main.h | 5 +-
.../ethernet/microchip/lan966x/lan966x_ptp.c | 20 ++---
5 files changed, 66 insertions(+), 39 deletions(-)
--
2.38.0
Whenever a frame was received to the CPU, the HW is timestamping the
frame. In the IFH(Inter Frame Header) it is found the nanosecond part
of the timestamps the SW is required to read from HW the second part.
But reading the second part it seems to be a expensive operations, so
so change this such to read the second part only when rx filter is
enabled.
Doing this change gives the RX a performance boost of ~70mbit.
before:
[ 5] 0.00-10.01 sec 546 MBytes 457 Mbits/sec 0 sender
now:
[ 5] 0.00-10.01 sec 652 MBytes 530 Mbits/sec 0 sender
Signed-off-by: Horatiu Vultur <[email protected]>
---
.../ethernet/microchip/lan966x/lan966x_fdma.c | 2 +-
.../ethernet/microchip/lan966x/lan966x_main.c | 2 +-
.../ethernet/microchip/lan966x/lan966x_main.h | 5 +++--
.../ethernet/microchip/lan966x/lan966x_ptp.c | 20 +++++++++----------
4 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
index 55b484b105627..2ed76bb61a731 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
@@ -517,7 +517,7 @@ static struct sk_buff *lan966x_fdma_rx_get_frame(struct lan966x_rx *rx,
if (likely(!(skb->dev->features & NETIF_F_RXFCS)))
skb_trim(skb, skb->len - ETH_FCS_LEN);
- lan966x_ptp_rxtstamp(lan966x, skb, timestamp);
+ lan966x_ptp_rxtstamp(lan966x, skb, src_port, timestamp);
skb->protocol = eth_type_trans(skb, skb->dev);
if (lan966x->bridge_mask & BIT(src_port)) {
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
index 685e8cd7658c9..4584a78c6ecbd 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
@@ -668,7 +668,7 @@ static irqreturn_t lan966x_xtr_irq_handler(int irq, void *args)
*buf = val;
}
- lan966x_ptp_rxtstamp(lan966x, skb, timestamp);
+ lan966x_ptp_rxtstamp(lan966x, skb, src_port, timestamp);
skb->protocol = eth_type_trans(skb, dev);
if (lan966x->bridge_mask & BIT(src_port)) {
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
index cbdae0ab8bb6d..851afb0166b19 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
@@ -407,7 +407,8 @@ struct lan966x_port {
struct phy *serdes;
struct fwnode_handle *fwnode;
- u8 ptp_cmd;
+ u8 ptp_tx_cmd;
+ bool ptp_rx_cmd;
u16 ts_id;
struct sk_buff_head tx_skbs;
@@ -527,7 +528,7 @@ void lan966x_ptp_deinit(struct lan966x *lan966x);
int lan966x_ptp_hwtstamp_set(struct lan966x_port *port, struct ifreq *ifr);
int lan966x_ptp_hwtstamp_get(struct lan966x_port *port, struct ifreq *ifr);
void lan966x_ptp_rxtstamp(struct lan966x *lan966x, struct sk_buff *skb,
- u64 timestamp);
+ u64 src_port, u64 timestamp);
int lan966x_ptp_txtstamp_request(struct lan966x_port *port,
struct sk_buff *skb);
void lan966x_ptp_txtstamp_release(struct lan966x_port *port,
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
index 931e37b9a0ada..266a21a2d1246 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
@@ -272,13 +272,13 @@ int lan966x_ptp_hwtstamp_set(struct lan966x_port *port, struct ifreq *ifr)
switch (cfg.tx_type) {
case HWTSTAMP_TX_ON:
- port->ptp_cmd = IFH_REW_OP_TWO_STEP_PTP;
+ port->ptp_tx_cmd = IFH_REW_OP_TWO_STEP_PTP;
break;
case HWTSTAMP_TX_ONESTEP_SYNC:
- port->ptp_cmd = IFH_REW_OP_ONE_STEP_PTP;
+ port->ptp_tx_cmd = IFH_REW_OP_ONE_STEP_PTP;
break;
case HWTSTAMP_TX_OFF:
- port->ptp_cmd = IFH_REW_OP_NOOP;
+ port->ptp_tx_cmd = IFH_REW_OP_NOOP;
break;
default:
return -ERANGE;
@@ -286,6 +286,7 @@ int lan966x_ptp_hwtstamp_set(struct lan966x_port *port, struct ifreq *ifr)
switch (cfg.rx_filter) {
case HWTSTAMP_FILTER_NONE:
+ port->ptp_rx_cmd = false;
break;
case HWTSTAMP_FILTER_ALL:
case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
@@ -301,6 +302,7 @@ int lan966x_ptp_hwtstamp_set(struct lan966x_port *port, struct ifreq *ifr)
case HWTSTAMP_FILTER_PTP_V2_SYNC:
case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
case HWTSTAMP_FILTER_NTP_ALL:
+ port->ptp_rx_cmd = true;
cfg.rx_filter = HWTSTAMP_FILTER_ALL;
break;
default:
@@ -332,7 +334,7 @@ static int lan966x_ptp_classify(struct lan966x_port *port, struct sk_buff *skb)
u8 msgtype;
int type;
- if (port->ptp_cmd == IFH_REW_OP_NOOP)
+ if (port->ptp_tx_cmd == IFH_REW_OP_NOOP)
return IFH_REW_OP_NOOP;
type = ptp_classify_raw(skb);
@@ -343,7 +345,7 @@ static int lan966x_ptp_classify(struct lan966x_port *port, struct sk_buff *skb)
if (!header)
return IFH_REW_OP_NOOP;
- if (port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP)
+ if (port->ptp_tx_cmd == IFH_REW_OP_TWO_STEP_PTP)
return IFH_REW_OP_TWO_STEP_PTP;
/* If it is sync and run 1 step then set the correct operation,
@@ -1009,9 +1011,6 @@ static int lan966x_ptp_phc_init(struct lan966x *lan966x,
phc->index = index;
phc->lan966x = lan966x;
- /* PTP Rx stamping is always enabled. */
- phc->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
-
return 0;
}
@@ -1088,14 +1087,15 @@ void lan966x_ptp_deinit(struct lan966x *lan966x)
}
void lan966x_ptp_rxtstamp(struct lan966x *lan966x, struct sk_buff *skb,
- u64 timestamp)
+ u64 src_port, u64 timestamp)
{
struct skb_shared_hwtstamps *shhwtstamps;
struct lan966x_phc *phc;
struct timespec64 ts;
u64 full_ts_in_ns;
- if (!lan966x->ptp)
+ if (!lan966x->ptp ||
+ !lan966x->ports[src_port]->ptp_rx_cmd)
return;
phc = &lan966x->phc[LAN966X_PHC_PORT];
--
2.38.0
When a frame is injected from CPU, it is required to create an IFH(Inter
frame header) which sits in front of the frame that is transmitted.
This IFH, contains different fields like destination port, to bypass the
analyzer, priotity, etc. Lan966x it is using packing library to set and
get the fields of this IFH. But this seems to be an expensive
operations.
If this is changed with a simpler implementation, the RX will be
improved with ~5Mbit while on the TX is a much bigger improvement as it
is required to set more fields. Below are the numbers for TX.
Before:
[ 5] 0.00-10.02 sec 439 MBytes 367 Mbits/sec 0 sender
After:
[ 5] 0.00-10.00 sec 563 MBytes 472 Mbits/sec 0 sender
Signed-off-by: Horatiu Vultur <[email protected]>
---
.../net/ethernet/microchip/lan966x/Kconfig | 1 -
.../ethernet/microchip/lan966x/lan966x_main.c | 75 +++++++++++++------
2 files changed, 51 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ethernet/microchip/lan966x/Kconfig b/drivers/net/ethernet/microchip/lan966x/Kconfig
index 8bcd60f17d6d3..571e6d4da1e9d 100644
--- a/drivers/net/ethernet/microchip/lan966x/Kconfig
+++ b/drivers/net/ethernet/microchip/lan966x/Kconfig
@@ -6,7 +6,6 @@ config LAN966X_SWITCH
depends on NET_SWITCHDEV
depends on BRIDGE || BRIDGE=n
select PHYLINK
- select PACKING
select PAGE_POOL
select VCAP
help
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
index 4584a78c6ecbd..9134716b62a55 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
@@ -7,7 +7,6 @@
#include <linux/ip.h>
#include <linux/of_platform.h>
#include <linux/of_net.h>
-#include <linux/packing.h>
#include <linux/phy/phy.h>
#include <linux/reset.h>
#include <net/addrconf.h>
@@ -305,46 +304,58 @@ static int lan966x_port_ifh_xmit(struct sk_buff *skb,
return NETDEV_TX_BUSY;
}
+static void lan966x_ifh_set(u8 *ifh, size_t val, size_t pos, size_t length)
+{
+ u32 v = 0;
+
+ for (int i = 0; i < length ; i++) {
+ int j = pos + i;
+ int k = j % 8;
+
+ if (i == 0 || k == 0)
+ v = ifh[IFH_LEN_BYTES - (j / 8) - 1];
+
+ if (val & (1 << i))
+ v |= (1 << k);
+
+ if (i == (length - 1) || k == 7)
+ ifh[IFH_LEN_BYTES - (j / 8) - 1] = v;
+ }
+}
+
void lan966x_ifh_set_bypass(void *ifh, u64 bypass)
{
- packing(ifh, &bypass, IFH_POS_BYPASS + IFH_WID_BYPASS - 1,
- IFH_POS_BYPASS, IFH_LEN * 4, PACK, 0);
+ lan966x_ifh_set(ifh, bypass, IFH_POS_BYPASS, IFH_WID_BYPASS);
}
-void lan966x_ifh_set_port(void *ifh, u64 bypass)
+void lan966x_ifh_set_port(void *ifh, u64 port)
{
- packing(ifh, &bypass, IFH_POS_DSTS + IFH_WID_DSTS - 1,
- IFH_POS_DSTS, IFH_LEN * 4, PACK, 0);
+ lan966x_ifh_set(ifh, port, IFH_POS_DSTS, IFH_WID_DSTS);
}
-static void lan966x_ifh_set_qos_class(void *ifh, u64 bypass)
+static void lan966x_ifh_set_qos_class(void *ifh, u64 qos)
{
- packing(ifh, &bypass, IFH_POS_QOS_CLASS + IFH_WID_QOS_CLASS - 1,
- IFH_POS_QOS_CLASS, IFH_LEN * 4, PACK, 0);
+ lan966x_ifh_set(ifh, qos, IFH_POS_QOS_CLASS, IFH_WID_QOS_CLASS);
}
-static void lan966x_ifh_set_ipv(void *ifh, u64 bypass)
+static void lan966x_ifh_set_ipv(void *ifh, u64 ipv)
{
- packing(ifh, &bypass, IFH_POS_IPV + IFH_WID_IPV - 1,
- IFH_POS_IPV, IFH_LEN * 4, PACK, 0);
+ lan966x_ifh_set(ifh, ipv, IFH_POS_IPV, IFH_WID_IPV);
}
static void lan966x_ifh_set_vid(void *ifh, u64 vid)
{
- packing(ifh, &vid, IFH_POS_TCI + IFH_WID_TCI - 1,
- IFH_POS_TCI, IFH_LEN * 4, PACK, 0);
+ lan966x_ifh_set(ifh, vid, IFH_POS_TCI, IFH_WID_TCI);
}
static void lan966x_ifh_set_rew_op(void *ifh, u64 rew_op)
{
- packing(ifh, &rew_op, IFH_POS_REW_CMD + IFH_WID_REW_CMD - 1,
- IFH_POS_REW_CMD, IFH_LEN * 4, PACK, 0);
+ lan966x_ifh_set(ifh, rew_op, IFH_POS_REW_CMD, IFH_WID_REW_CMD);
}
static void lan966x_ifh_set_timestamp(void *ifh, u64 timestamp)
{
- packing(ifh, ×tamp, IFH_POS_TIMESTAMP + IFH_WID_TIMESTAMP - 1,
- IFH_POS_TIMESTAMP, IFH_LEN * 4, PACK, 0);
+ lan966x_ifh_set(ifh, timestamp, IFH_POS_TIMESTAMP, IFH_WID_TIMESTAMP);
}
static netdev_tx_t lan966x_port_xmit(struct sk_buff *skb,
@@ -582,22 +593,38 @@ static int lan966x_rx_frame_word(struct lan966x *lan966x, u8 grp, u32 *rval)
}
}
+static u64 lan966x_ifh_get(u8 *ifh, size_t pos, size_t length)
+{
+ u64 val = 0;
+ u8 v;
+
+ for (int i = 0; i < length ; i++) {
+ int j = pos + i;
+ int k = j % 8;
+
+ if (i == 0 || k == 0)
+ v = ifh[IFH_LEN_BYTES - (j / 8) - 1];
+
+ if (v & (1 << k))
+ val |= (1 << i);
+ }
+
+ return val;
+}
+
void lan966x_ifh_get_src_port(void *ifh, u64 *src_port)
{
- packing(ifh, src_port, IFH_POS_SRCPORT + IFH_WID_SRCPORT - 1,
- IFH_POS_SRCPORT, IFH_LEN * 4, UNPACK, 0);
+ *src_port = lan966x_ifh_get(ifh, IFH_POS_SRCPORT, IFH_WID_SRCPORT);
}
static void lan966x_ifh_get_len(void *ifh, u64 *len)
{
- packing(ifh, len, IFH_POS_LEN + IFH_WID_LEN - 1,
- IFH_POS_LEN, IFH_LEN * 4, UNPACK, 0);
+ *len = lan966x_ifh_get(ifh, IFH_POS_LEN, IFH_WID_LEN);
}
void lan966x_ifh_get_timestamp(void *ifh, u64 *timestamp)
{
- packing(ifh, timestamp, IFH_POS_TIMESTAMP + IFH_WID_TIMESTAMP - 1,
- IFH_POS_TIMESTAMP, IFH_LEN * 4, UNPACK, 0);
+ *timestamp = lan966x_ifh_get(ifh, IFH_POS_TIMESTAMP, IFH_WID_TIMESTAMP);
}
static irqreturn_t lan966x_xtr_irq_handler(int irq, void *args)
--
2.38.0
From: Horatiu Vultur
> Sent: 12 March 2023 20:24
>
> When a frame is injected from CPU, it is required to create an IFH(Inter
> frame header) which sits in front of the frame that is transmitted.
> This IFH, contains different fields like destination port, to bypass the
> analyzer, priotity, etc. Lan966x it is using packing library to set and
> get the fields of this IFH. But this seems to be an expensive
> operations.
> If this is changed with a simpler implementation, the RX will be
> improved with ~5Mbit while on the TX is a much bigger improvement as it
> is required to set more fields. Below are the numbers for TX.
...
> +static void lan966x_ifh_set(u8 *ifh, size_t val, size_t pos, size_t length)
> +{
> + u32 v = 0;
> +
> + for (int i = 0; i < length ; i++) {
> + int j = pos + i;
> + int k = j % 8;
> +
> + if (i == 0 || k == 0)
> + v = ifh[IFH_LEN_BYTES - (j / 8) - 1];
> +
> + if (val & (1 << i))
> + v |= (1 << k);
> +
> + if (i == (length - 1) || k == 7)
> + ifh[IFH_LEN_BYTES - (j / 8) - 1] = v;
> + }
> +}
> +
It has to be possible to do much better that that.
Given that 'pos' and 'length' are always constants it looks like
each call should reduce to (something like):
ifh[k] |= val << n;
ifk[k + 1] |= val >> (8 - n);
...
It might be that the compiler manages to do this, but I doubt it.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Mon, 13 Mar 2023 17:04:11 +0000 David Laight wrote:
> It has to be possible to do much better that that.
> Given that 'pos' and 'length' are always constants it looks like
> each call should reduce to (something like):
> ifh[k] |= val << n;
> ifk[k + 1] |= val >> (8 - n);
> ...
> It might be that the compiler manages to do this, but I doubt it.
Agreed, going bit-by-bit seems overly cautious.
The 03/13/2023 17:04, David Laight wrote:
>
> From: Horatiu Vultur
> > Sent: 12 March 2023 20:24
> >
> > When a frame is injected from CPU, it is required to create an IFH(Inter
> > frame header) which sits in front of the frame that is transmitted.
> > This IFH, contains different fields like destination port, to bypass the
> > analyzer, priotity, etc. Lan966x it is using packing library to set and
> > get the fields of this IFH. But this seems to be an expensive
> > operations.
> > If this is changed with a simpler implementation, the RX will be
> > improved with ~5Mbit while on the TX is a much bigger improvement as it
> > is required to set more fields. Below are the numbers for TX.
> ...
> > +static void lan966x_ifh_set(u8 *ifh, size_t val, size_t pos, size_t length)
> > +{
> > + u32 v = 0;
> > +
> > + for (int i = 0; i < length ; i++) {
> > + int j = pos + i;
> > + int k = j % 8;
> > +
> > + if (i == 0 || k == 0)
> > + v = ifh[IFH_LEN_BYTES - (j / 8) - 1];
> > +
> > + if (val & (1 << i))
> > + v |= (1 << k);
> > +
> > + if (i == (length - 1) || k == 7)
> > + ifh[IFH_LEN_BYTES - (j / 8) - 1] = v;
> > + }
> > +}
> > +
>
> It has to be possible to do much better that that.
> Given that 'pos' and 'length' are always constants it looks like
> each call should reduce to (something like):
> ifh[k] |= val << n;
> ifk[k + 1] |= val >> (8 - n);
> ...
> It might be that the compiler manages to do this, but I doubt it.
Thanks for the review. I will update this in the next version.
Do you think it is worth updating the code in lan966x_ifh_get to use
byte access and not to read each bit individually?
As there is no much improvement on the RX side that is using lan966x_ifh_get.
>
> David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
>
--
/Horatiu