2023-02-09 09:27:53

by Wei Fang

[permalink] [raw]
Subject: [PATCH net-next] net: fec: add CBS offload support

From: Wei Fang <[email protected]>

The FEC hardware supports the Credit-based shaper (CBS) which control
the bandwidth distribution between normal traffic and time-sensitive
traffic with respect to the total link bandwidth available.
But notice that the bandwidth allocation of hardware is restricted to
certain values. Below is the equation which is used to calculate the
BW (bandwidth) fraction for per class:
BW fraction = 1 / (1 + 512 / idle_slope)

For values of idle_slope less than 128, idle_slope = 2 ^ n, when n =
0,1,2,...,6. For values equal to or greater than 128, idle_slope =
128 * m, where m = 1,2,3,...,12.
Example 1. idle_slope = 64, therefore BW fraction = 0.111.
Example 2. idle_slope = 128, therefore BW fraction = 0.200.

Here is an example command to set 200Mbps bandwidth on 1000Mbps port
for TC 2 and 111Mbps for TC 3.
tc qdisc add dev eth0 parent root handle 100 mqprio num_tc 3 map \
0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 queues 1@0 1@1 1@2 hw 0
tc qdisc replace dev eth0 parent 100:2 cbs idleslope 200000 \
sendslope -800000 hicredit 153 locredit -1389 offload 1
tc qdisc replace dev eth0 parent 100:3 cbs idleslope 111000 \
sendslope -889000 hicredit 90 locredit -892 offload 1

Signed-off-by: Wei Fang <[email protected]>
---
drivers/net/ethernet/freescale/fec.h | 4 +
drivers/net/ethernet/freescale/fec_main.c | 106 ++++++++++++++++++++++
2 files changed, 110 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index 5ba1e0d71c68..ad5f968aa086 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -340,6 +340,10 @@ struct bufdesc_ex {
#define RCMR_CMP(X) (((X) == 1) ? RCMR_CMP_1 : RCMR_CMP_2)
#define FEC_TX_BD_FTYPE(X) (((X) & 0xf) << 20)

+#define FEC_QOS_TX_SHEME_MASK GENMASK(2, 0)
+#define CREDIT_BASED_SCHEME 0
+#define ROUND_ROBIN_SCHEME 1
+
/* The number of Tx and Rx buffers. These are allocated from the page
* pool. The code may assume these are power of two, so it it best
* to keep them that size.
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index c73e25f8995e..3bb3a071fa0c 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -66,6 +66,7 @@
#include <linux/mfd/syscon.h>
#include <linux/regmap.h>
#include <soc/imx/cpuidle.h>
+#include <net/pkt_sched.h>
#include <linux/filter.h>
#include <linux/bpf.h>

@@ -3232,6 +3233,110 @@ static int fec_enet_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
return phy_mii_ioctl(phydev, rq, cmd);
}

+static u32 fec_enet_get_idle_slope(u8 bw)
+{
+ u32 idle_slope, quotient, msb;
+
+ /* Convert bw to hardware idle slope */
+ idle_slope = (512 * bw) / (100 - bw);
+
+ if (idle_slope >= 128) {
+ /* For values equal to or greater than 128, idle_slope = 128 * m,
+ * where m = 1, 2, 3, ...12. Here we use the rounding method.
+ */
+ quotient = idle_slope / 128;
+ if (idle_slope >= quotient * 128 + 64)
+ idle_slope = 128 * (quotient + 1);
+ else
+ idle_slope = 128 * quotient;
+
+ goto end;
+ }
+
+ /* For values less than 128, idle_slope = 2 ^ n, where
+ * n = 0, 1, 2, ...6. Here we use the rounding method.
+ * So the minimum of idle_slope is 1.
+ */
+ msb = fls(idle_slope);
+
+ if (msb == 0 || msb == 1) {
+ idle_slope = 1;
+ goto end;
+ }
+
+ msb -= 1;
+ if (idle_slope >= (1 << msb) + (1 << (msb - 1)))
+ idle_slope = 1 << (msb + 1);
+ else
+ idle_slope = 1 << msb;
+
+end:
+ return idle_slope;
+}
+
+static int fec_enet_setup_tc_cbs(struct net_device *ndev, void *type_data)
+{
+ struct fec_enet_private *fep = netdev_priv(ndev);
+ struct tc_cbs_qopt_offload *cbs = type_data;
+ int queue = cbs->queue;
+ u32 speed = fep->speed;
+ u32 val, idle_slope;
+ u8 bw;
+
+ if (!(fep->quirks & FEC_QUIRK_HAS_AVB))
+ return -EOPNOTSUPP;
+
+ /* Queue 1 for Class A, Queue 2 for Class B, so the ENET must has
+ * three queues.
+ */
+ if (fep->num_tx_queues != FEC_ENET_MAX_TX_QS)
+ return -EOPNOTSUPP;
+
+ /* Queue 0 is not AVB capable */
+ if (queue <= 0 || queue >= fep->num_tx_queues)
+ return -EINVAL;
+
+ val = readl(fep->hwp + FEC_QOS_SCHEME);
+ val &= ~FEC_QOS_TX_SHEME_MASK;
+ if (!cbs->enable) {
+ val |= ROUND_ROBIN_SCHEME;
+ writel(val, fep->hwp + FEC_QOS_SCHEME);
+
+ return 0;
+ }
+
+ val |= CREDIT_BASED_SCHEME;
+ writel(val, fep->hwp + FEC_QOS_SCHEME);
+
+ /* cbs->idleslope is in kilobits per second. speed is the port rate
+ * in megabits per second. So bandwidth ratio bw = (idleslope /
+ * (speed * 1000)) * 100, the unit is percentage.
+ */
+ bw = cbs->idleslope / (speed * 10UL);
+ /* bw% can not >= 100% */
+ if (bw >= 100)
+ return -EINVAL;
+ idle_slope = fec_enet_get_idle_slope(bw);
+
+ val = readl(fep->hwp + FEC_DMA_CFG(queue));
+ val &= ~IDLE_SLOPE_MASK;
+ val |= idle_slope & IDLE_SLOPE_MASK;
+ writel(val, fep->hwp + FEC_DMA_CFG(queue));
+
+ return 0;
+}
+
+static int fec_enet_setup_tc(struct net_device *ndev, enum tc_setup_type type,
+ void *type_data)
+{
+ switch (type) {
+ case TC_SETUP_QDISC_CBS:
+ return fec_enet_setup_tc_cbs(ndev, type_data);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
static void fec_enet_free_buffers(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
@@ -3882,6 +3987,7 @@ static const struct net_device_ops fec_netdev_ops = {
.ndo_tx_timeout = fec_timeout,
.ndo_set_mac_address = fec_set_mac_address,
.ndo_eth_ioctl = fec_enet_ioctl,
+ .ndo_setup_tc = fec_enet_setup_tc,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = fec_poll_controller,
#endif
--
2.25.1



2023-02-09 16:14:34

by Simon Horman

[permalink] [raw]
Subject: Re: [PATCH net-next] net: fec: add CBS offload support

On Thu, Feb 09, 2023 at 05:24:22PM +0800, [email protected] wrote:
> From: Wei Fang <[email protected]>
>
> The FEC hardware supports the Credit-based shaper (CBS) which control
> the bandwidth distribution between normal traffic and time-sensitive
> traffic with respect to the total link bandwidth available.
> But notice that the bandwidth allocation of hardware is restricted to
> certain values. Below is the equation which is used to calculate the
> BW (bandwidth) fraction for per class:
> BW fraction = 1 / (1 + 512 / idle_slope)
>
> For values of idle_slope less than 128, idle_slope = 2 ^ n, when n =
> 0,1,2,...,6. For values equal to or greater than 128, idle_slope =
> 128 * m, where m = 1,2,3,...,12.
> Example 1. idle_slope = 64, therefore BW fraction = 0.111.
> Example 2. idle_slope = 128, therefore BW fraction = 0.200.
>
> Here is an example command to set 200Mbps bandwidth on 1000Mbps port
> for TC 2 and 111Mbps for TC 3.
> tc qdisc add dev eth0 parent root handle 100 mqprio num_tc 3 map \
> 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 queues 1@0 1@1 1@2 hw 0
> tc qdisc replace dev eth0 parent 100:2 cbs idleslope 200000 \
> sendslope -800000 hicredit 153 locredit -1389 offload 1
> tc qdisc replace dev eth0 parent 100:3 cbs idleslope 111000 \
> sendslope -889000 hicredit 90 locredit -892 offload 1
>
> Signed-off-by: Wei Fang <[email protected]>
> ---
> drivers/net/ethernet/freescale/fec.h | 4 +
> drivers/net/ethernet/freescale/fec_main.c | 106 ++++++++++++++++++++++
> 2 files changed, 110 insertions(+)
>
> diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
> index 5ba1e0d71c68..ad5f968aa086 100644
> --- a/drivers/net/ethernet/freescale/fec.h
> +++ b/drivers/net/ethernet/freescale/fec.h
> @@ -340,6 +340,10 @@ struct bufdesc_ex {
> #define RCMR_CMP(X) (((X) == 1) ? RCMR_CMP_1 : RCMR_CMP_2)
> #define FEC_TX_BD_FTYPE(X) (((X) & 0xf) << 20)
>
> +#define FEC_QOS_TX_SHEME_MASK GENMASK(2, 0)
> +#define CREDIT_BASED_SCHEME 0
> +#define ROUND_ROBIN_SCHEME 1
> +
> /* The number of Tx and Rx buffers. These are allocated from the page
> * pool. The code may assume these are power of two, so it it best
> * to keep them that size.
> diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
> index c73e25f8995e..3bb3a071fa0c 100644
> --- a/drivers/net/ethernet/freescale/fec_main.c
> +++ b/drivers/net/ethernet/freescale/fec_main.c
> @@ -66,6 +66,7 @@
> #include <linux/mfd/syscon.h>
> #include <linux/regmap.h>
> #include <soc/imx/cpuidle.h>
> +#include <net/pkt_sched.h>
> #include <linux/filter.h>
> #include <linux/bpf.h>
>
> @@ -3232,6 +3233,110 @@ static int fec_enet_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
> return phy_mii_ioctl(phydev, rq, cmd);
> }
>
> +static u32 fec_enet_get_idle_slope(u8 bw)
> +{
> + u32 idle_slope, quotient, msb;
> +
> + /* Convert bw to hardware idle slope */
> + idle_slope = (512 * bw) / (100 - bw);
> +
> + if (idle_slope >= 128) {
> + /* For values equal to or greater than 128, idle_slope = 128 * m,
> + * where m = 1, 2, 3, ...12. Here we use the rounding method.
> + */

Perhaps the following would be clearer?

For values greater than or equal to 128,
idle_slope is rounded to the nearest multiple of 128.

> + quotient = idle_slope / 128;
> + if (idle_slope >= quotient * 128 + 64)
> + idle_slope = 128 * (quotient + 1);
> + else
> + idle_slope = 128 * quotient;

Maybe there is a helper that does this, but if
not, perhaps:

idle_slope = DIV_ROUND_CLOSEST(idle_slope, 128U) * 128U;


> +
> + goto end;

Maybe return here

> + }

Or an else here is nicer?

> +
> + /* For values less than 128, idle_slope = 2 ^ n, where

Perhaps the following would be clearer?

For values less than 128, idle_slope is rounded
to the nearest power of 2.

> + * n = 0, 1, 2, ...6. Here we use the rounding method.

n is 7 for input idle_slope around 128 (2^7)

> + * So the minimum of idle_slope is 1.
> + */
> + msb = fls(idle_slope);
> +
> + if (msb == 0 || msb == 1) {
> + idle_slope = 1;
> + goto end;
> + }

nit: maybe this is nicer

if (msb <= 1)
return 1;

> +
> + msb -= 1;
> + if (idle_slope >= (1 << msb) + (1 << (msb - 1)))
> + idle_slope = 1 << (msb + 1);
> + else
> + idle_slope = 1 << msb;

In the same vein as the suggestion for the >= 128 case, perhaps:

u32 d;

d = BIT(fls(idle_slope));
idle_slope = DIV_ROUND_CLOSEST(idle_slope, d) * d;

> +
> +end:
> + return idle_slope;
> +}
> +
> +static int fec_enet_setup_tc_cbs(struct net_device *ndev, void *type_data)
> +{
> + struct fec_enet_private *fep = netdev_priv(ndev);
> + struct tc_cbs_qopt_offload *cbs = type_data;
> + int queue = cbs->queue;

nit: extra space after '='

> + u32 speed = fep->speed;
> + u32 val, idle_slope;
> + u8 bw;
> +
> + if (!(fep->quirks & FEC_QUIRK_HAS_AVB))
> + return -EOPNOTSUPP;
> +
> + /* Queue 1 for Class A, Queue 2 for Class B, so the ENET must has

nit: s/has/have/

> + * three queues.
> + */
> + if (fep->num_tx_queues != FEC_ENET_MAX_TX_QS)
> + return -EOPNOTSUPP;
> +
> + /* Queue 0 is not AVB capable */
> + if (queue <= 0 || queue >= fep->num_tx_queues)
> + return -EINVAL;
> +
> + val = readl(fep->hwp + FEC_QOS_SCHEME);
> + val &= ~FEC_QOS_TX_SHEME_MASK;
> + if (!cbs->enable) {
> + val |= ROUND_ROBIN_SCHEME;
> + writel(val, fep->hwp + FEC_QOS_SCHEME);
> +
> + return 0;
> + }
> +
> + val |= CREDIT_BASED_SCHEME;
> + writel(val, fep->hwp + FEC_QOS_SCHEME);
> +
> + /* cbs->idleslope is in kilobits per second. speed is the port rate
> + * in megabits per second. So bandwidth ratio bw = (idleslope /
> + * (speed * 1000)) * 100, the unit is percentage.
> + */

suggestion:

/* cbs->idleslope is in kilobits per second.
* Speed is the port rate in megabits per second.
* So bandwidth the ratio, bw, is (idleslope / (speed * 1000)) * 100.
* The unit of bw is a percentage.
*/

> + bw = cbs->idleslope / (speed * 10UL);
> + /* bw% can not >= 100% */
> + if (bw >= 100)
> + return -EINVAL;

nit: maybe the above calculation and check fits better inside
fec_enet_get_idle_slope()

> + idle_slope = fec_enet_get_idle_slope(bw);
> +
> + val = readl(fep->hwp + FEC_DMA_CFG(queue));
> + val &= ~IDLE_SLOPE_MASK;
> + val |= idle_slope & IDLE_SLOPE_MASK;
> + writel(val, fep->hwp + FEC_DMA_CFG(queue));
> +
> + return 0;
> +}
> +
> +static int fec_enet_setup_tc(struct net_device *ndev, enum tc_setup_type type,
> + void *type_data)
> +{
> + switch (type) {
> + case TC_SETUP_QDISC_CBS:
> + return fec_enet_setup_tc_cbs(ndev, type_data);
> + default:
> + return -EOPNOTSUPP;
> + }
> +}
> +
> static void fec_enet_free_buffers(struct net_device *ndev)
> {
> struct fec_enet_private *fep = netdev_priv(ndev);
> @@ -3882,6 +3987,7 @@ static const struct net_device_ops fec_netdev_ops = {
> .ndo_tx_timeout = fec_timeout,
> .ndo_set_mac_address = fec_set_mac_address,
> .ndo_eth_ioctl = fec_enet_ioctl,
> + .ndo_setup_tc = fec_enet_setup_tc,
> #ifdef CONFIG_NET_POLL_CONTROLLER
> .ndo_poll_controller = fec_poll_controller,
> #endif
> --
> 2.25.1
>

2023-02-09 20:38:42

by Andrew Lunn

[permalink] [raw]
Subject: Re: [PATCH net-next] net: fec: add CBS offload support

> + /* cbs->idleslope is in kilobits per second. speed is the port rate
> + * in megabits per second. So bandwidth ratio bw = (idleslope /
> + * (speed * 1000)) * 100, the unit is percentage.
> + */
> + bw = cbs->idleslope / (speed * 10UL);

This appears to be a / 0 when the link is not up yet? Also, if the
link goes does, fep->speed keeps the old value, so if it comes up
again at a different speed, your calculations are all wrong. So i
think you need fec_enet_adjust_link() involved in this.


> + /* bw% can not >= 100% */
> + if (bw >= 100)
> + return -EINVAL;

Well > 100% could happen when the link goes from 1G to 10Half, or even
100Full. You should probably document the policy of what you do
then. Do you dedicate all the available bandwidth to the high priority
queue, or do you go back to best effort? Is it possible to indicate in
the tc show command that the configuration is no longer possible?

Presumably other drivers have already addressed all these issues, so
you just need to find out what they do.

Andrew

2023-02-10 07:24:11

by Wei Fang

[permalink] [raw]
Subject: RE: [PATCH net-next] net: fec: add CBS offload support

> -----Original Message-----
> From: Simon Horman <[email protected]>
> Sent: 2023??2??10?? 0:14
> To: Wei Fang <[email protected]>
> Cc: Shenwei Wang <[email protected]>; Clark Wang
> <[email protected]>; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected]; dl-linux-imx
> <[email protected]>; [email protected]
> Subject: Re: [PATCH net-next] net: fec: add CBS offload support
>
> On Thu, Feb 09, 2023 at 05:24:22PM +0800, [email protected] wrote:
> > From: Wei Fang <[email protected]>
> >
> > +
> > + if (idle_slope >= 128) {
> > + /* For values equal to or greater than 128, idle_slope = 128 * m,
> > + * where m = 1, 2, 3, ...12. Here we use the rounding method.
> > + */
>
> Perhaps the following would be clearer?
>
> For values greater than or equal to 128,
> idle_slope is rounded to the nearest multiple of 128.
>
> > + quotient = idle_slope / 128;
> > + if (idle_slope >= quotient * 128 + 64)
> > + idle_slope = 128 * (quotient + 1);
> > + else
> > + idle_slope = 128 * quotient;
>
> Maybe there is a helper that does this, but if
> not, perhaps:
>
> idle_slope = DIV_ROUND_CLOSEST(idle_slope, 128U) * 128U;
>
>
> > +
> > + goto end;
>
> Maybe return here
>
> > + }
>
> Or an else here is nicer?
>
> > +
> > + /* For values less than 128, idle_slope = 2 ^ n, where
>
> Perhaps the following would be clearer?
>
> For values less than 128, idle_slope is rounded
> to the nearest power of 2.
>
> > + * n = 0, 1, 2, ...6. Here we use the rounding method.
>
> n is 7 for input idle_slope around 128 (2^7)
>
> > + * So the minimum of idle_slope is 1.
> > + */
> > + msb = fls(idle_slope);
> > +
> > + if (msb == 0 || msb == 1) {
> > + idle_slope = 1;
> > + goto end;
> > + }
>
> nit: maybe this is nicer
>
> if (msb <= 1)
> return 1;
>
> > +
> > + msb -= 1;
> > + if (idle_slope >= (1 << msb) + (1 << (msb - 1)))
> > + idle_slope = 1 << (msb + 1);
> > + else
> > + idle_slope = 1 << msb;
>
> In the same vein as the suggestion for the >= 128 case, perhaps:
>
> u32 d;
>
> d = BIT(fls(idle_slope));
> idle_slope = DIV_ROUND_CLOSEST(idle_slope, d) * d;
>
> > +
> > +end:
> > + return idle_slope;
> > +}
> > +
> > +static int fec_enet_setup_tc_cbs(struct net_device *ndev, void
> > +*type_data) {
> > + struct fec_enet_private *fep = netdev_priv(ndev);
> > + struct tc_cbs_qopt_offload *cbs = type_data;
> > + int queue = cbs->queue;
>
> nit: extra space after '='
>
> > + u32 speed = fep->speed;
> > + u32 val, idle_slope;
> > + u8 bw;
> > +
> > + if (!(fep->quirks & FEC_QUIRK_HAS_AVB))
> > + return -EOPNOTSUPP;
> > +
> > + /* Queue 1 for Class A, Queue 2 for Class B, so the ENET must has
>
> nit: s/has/have/
>
> > + * three queues.
> > + */
> > + if (fep->num_tx_queues != FEC_ENET_MAX_TX_QS)
> > + return -EOPNOTSUPP;
> > +
> > + /* Queue 0 is not AVB capable */
> > + if (queue <= 0 || queue >= fep->num_tx_queues)
> > + return -EINVAL;
> > +
> > + val = readl(fep->hwp + FEC_QOS_SCHEME);
> > + val &= ~FEC_QOS_TX_SHEME_MASK;
> > + if (!cbs->enable) {
> > + val |= ROUND_ROBIN_SCHEME;
> > + writel(val, fep->hwp + FEC_QOS_SCHEME);
> > +
> > + return 0;
> > + }
> > +
> > + val |= CREDIT_BASED_SCHEME;
> > + writel(val, fep->hwp + FEC_QOS_SCHEME);
> > +
> > + /* cbs->idleslope is in kilobits per second. speed is the port rate
> > + * in megabits per second. So bandwidth ratio bw = (idleslope /
> > + * (speed * 1000)) * 100, the unit is percentage.
> > + */
>
> suggestion:
>
> /* cbs->idleslope is in kilobits per second.
> * Speed is the port rate in megabits per second.
> * So bandwidth the ratio, bw, is (idleslope / (speed * 1000)) * 100.
> * The unit of bw is a percentage.
> */
>
> > + bw = cbs->idleslope / (speed * 10UL);
> > + /* bw% can not >= 100% */
> > + if (bw >= 100)
> > + return -EINVAL;
>
> nit: maybe the above calculation and check fits better inside
> fec_enet_get_idle_slope()
>

Your suggestions are very helpful, I'll amend the patch in v2.
Thanks a lot.

2023-02-10 08:04:27

by Wei Fang

[permalink] [raw]
Subject: RE: [PATCH net-next] net: fec: add CBS offload support

> -----Original Message-----
> From: Andrew Lunn <[email protected]>
> Sent: 2023??2??10?? 4:38
> To: Wei Fang <[email protected]>
> Cc: Shenwei Wang <[email protected]>; Clark Wang
> <[email protected]>; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected]; dl-linux-imx
> <[email protected]>; [email protected]
> Subject: Re: [PATCH net-next] net: fec: add CBS offload support
>
> > + /* cbs->idleslope is in kilobits per second. speed is the port rate
> > + * in megabits per second. So bandwidth ratio bw = (idleslope /
> > + * (speed * 1000)) * 100, the unit is percentage.
> > + */
> > + bw = cbs->idleslope / (speed * 10UL);
>
> This appears to be a / 0 when the link is not up yet? Also, if the link goes
> does, fep->speed keeps the old value, so if it comes up again at a different
> speed, your calculations are all wrong. So i think you need
> fec_enet_adjust_link() involved in this.
>
Yes, speed = 0 is indeed a problem, we should check the value first.
For speed change, I'll think about how to handle this situation.

>
> > + /* bw% can not >= 100% */
> > + if (bw >= 100)
> > + return -EINVAL;
>
> Well > 100% could happen when the link goes from 1G to 10Half, or even
> 100Full. You should probably document the policy of what you do then. Do
> you dedicate all the available bandwidth to the high priority queue, or do you
> go back to best effort?
Actually, the FEC has always used the credit-based shaper by default. So I think
it's better to fall back the default setting and return error if the bw > 100%.

>Is it possible to indicate in the tc show command that
> the configuration is no longer possible?
>
Sorry, I have no knowledge about the tc show command.

> Presumably other drivers have already addressed all these issues, so you just
> need to find out what they do.
>
> Andrew