2024-04-24 09:19:10

by MD Danish Anwar

[permalink] [raw]
Subject: [PATCH net-next] net: ti: icssg_prueth: Add SW TX / RX Coalescing based on hrtimers

Add SW IRQ coalescing based on hrtimers for RX and TX data path for ICSSG
driver, which can be enabled by ethtool commands:

- RX coalescing
ethtool -C eth1 rx-usecs 50

- TX coalescing can be enabled per TX queue

- by default enables coalesing for TX0
ethtool -C eth1 tx-usecs 50
- configure TX0
ethtool -Q eth0 queue_mask 1 --coalesce tx-usecs 100
- configure TX1
ethtool -Q eth0 queue_mask 2 --coalesce tx-usecs 100
- configure TX0 and TX1
ethtool -Q eth0 queue_mask 3 --coalesce tx-usecs 100 --coalesce
tx-usecs 100

Minimum value for both rx-usecs and tx-usecs is 20us.

Comapared to gro_flush_timeout and napi_defer_hard_irqs this patch
allows to enable IRQ coalescing for RX path separately.

Signed-off-by: MD Danish Anwar <[email protected]>
---
drivers/net/ethernet/ti/icssg/icssg_common.c | 34 ++++++--
drivers/net/ethernet/ti/icssg/icssg_ethtool.c | 87 +++++++++++++++++++
drivers/net/ethernet/ti/icssg/icssg_prueth.c | 17 +++-
drivers/net/ethernet/ti/icssg/icssg_prueth.h | 11 ++-
4 files changed, 142 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c
index 1d62c05b5f7c..1561503becc7 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_common.c
+++ b/drivers/net/ethernet/ti/icssg/icssg_common.c
@@ -122,7 +122,7 @@ void prueth_xmit_free(struct prueth_tx_chn *tx_chn,
}

int emac_tx_complete_packets(struct prueth_emac *emac, int chn,
- int budget)
+ int budget, bool *tdown)
{
struct net_device *ndev = emac->ndev;
struct cppi5_host_desc_t *desc_tx;
@@ -145,6 +145,7 @@ int emac_tx_complete_packets(struct prueth_emac *emac, int chn,
if (cppi5_desc_is_tdcm(desc_dma)) {
if (atomic_dec_and_test(&emac->tdown_cnt))
complete(&emac->tdown_complete);
+ *tdown = true;
break;
}

@@ -190,19 +191,35 @@ int emac_tx_complete_packets(struct prueth_emac *emac, int chn,
return num_tx;
}

+static enum hrtimer_restart emac_tx_timer_callback(struct hrtimer *timer)
+{
+ struct prueth_tx_chn *tx_chns =
+ container_of(timer, struct prueth_tx_chn, tx_hrtimer);
+
+ enable_irq(tx_chns->irq);
+ return HRTIMER_NORESTART;
+}
+
static int emac_napi_tx_poll(struct napi_struct *napi_tx, int budget)
{
struct prueth_tx_chn *tx_chn = prueth_napi_to_tx_chn(napi_tx);
struct prueth_emac *emac = tx_chn->emac;
+ bool tdown = false;
int num_tx_packets;

- num_tx_packets = emac_tx_complete_packets(emac, tx_chn->id, budget);
+ num_tx_packets = emac_tx_complete_packets(emac, tx_chn->id, budget, &tdown);

if (num_tx_packets >= budget)
return budget;

- if (napi_complete_done(napi_tx, num_tx_packets))
- enable_irq(tx_chn->irq);
+ if (napi_complete_done(napi_tx, num_tx_packets)) {
+ if (unlikely(tx_chn->tx_pace_timeout_ns && !tdown))
+ hrtimer_start(&tx_chn->tx_hrtimer,
+ ns_to_ktime(tx_chn->tx_pace_timeout_ns),
+ HRTIMER_MODE_REL_PINNED);
+ else
+ enable_irq(tx_chn->irq);
+ }

return num_tx_packets;
}
@@ -226,6 +243,8 @@ int prueth_ndev_add_tx_napi(struct prueth_emac *emac)
struct prueth_tx_chn *tx_chn = &emac->tx_chns[i];

netif_napi_add_tx(emac->ndev, &tx_chn->napi_tx, emac_napi_tx_poll);
+ hrtimer_init(&tx_chn->tx_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ tx_chn->tx_hrtimer.function = &emac_tx_timer_callback;
ret = request_irq(tx_chn->irq, prueth_tx_irq,
IRQF_TRIGGER_HIGH, tx_chn->name,
tx_chn);
@@ -870,7 +889,12 @@ int emac_napi_rx_poll(struct napi_struct *napi_rx, int budget)
}

if (num_rx < budget && napi_complete_done(napi_rx, num_rx))
- enable_irq(emac->rx_chns.irq[rx_flow]);
+ if (unlikely(emac->rx_pace_timeout_ns))
+ hrtimer_start(&emac->rx_hrtimer,
+ ns_to_ktime(emac->rx_pace_timeout_ns),
+ HRTIMER_MODE_REL_PINNED);
+ else
+ enable_irq(emac->rx_chns.irq[rx_flow]);

return num_rx;
}
diff --git a/drivers/net/ethernet/ti/icssg/icssg_ethtool.c b/drivers/net/ethernet/ti/icssg/icssg_ethtool.c
index ca20325d4d3e..d7409d82c428 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_ethtool.c
+++ b/drivers/net/ethernet/ti/icssg/icssg_ethtool.c
@@ -201,6 +201,88 @@ static void emac_get_rmon_stats(struct net_device *ndev,
rmon_stats->hist_tx[4] = emac_get_stat_by_name(emac, "tx_bucket5_frames");
}

+static int emac_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct prueth_emac *emac = netdev_priv(ndev);
+ struct prueth_tx_chn *tx_chn;
+
+ tx_chn = &emac->tx_chns[0];
+
+ coal->rx_coalesce_usecs = emac->rx_pace_timeout_ns / 1000;
+ coal->tx_coalesce_usecs = tx_chn->tx_pace_timeout_ns / 1000;
+
+ return 0;
+}
+
+static int emac_get_per_queue_coalesce(struct net_device *ndev, u32 queue,
+ struct ethtool_coalesce *coal)
+{
+ struct prueth_emac *emac = netdev_priv(ndev);
+ struct prueth_tx_chn *tx_chn;
+
+ if (queue >= PRUETH_MAX_TX_QUEUES)
+ return -EINVAL;
+
+ tx_chn = &emac->tx_chns[queue];
+
+ coal->tx_coalesce_usecs = tx_chn->tx_pace_timeout_ns / 1000;
+
+ return 0;
+}
+
+static int emac_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *coal,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct prueth_emac *emac = netdev_priv(ndev);
+ struct prueth *prueth = emac->prueth;
+ struct prueth_tx_chn *tx_chn;
+
+ tx_chn = &emac->tx_chns[0];
+
+ if (coal->rx_coalesce_usecs && coal->rx_coalesce_usecs < ICSSG_MIN_COALESCE_USECS) {
+ dev_info(prueth->dev, "defaulting to min value of %dus for rx-usecs\n",
+ ICSSG_MIN_COALESCE_USECS);
+ coal->rx_coalesce_usecs = ICSSG_MIN_COALESCE_USECS;
+ }
+
+ if (coal->tx_coalesce_usecs && coal->tx_coalesce_usecs < ICSSG_MIN_COALESCE_USECS) {
+ dev_info(prueth->dev, "defaulting to min value of %dus for tx-usecs\n",
+ ICSSG_MIN_COALESCE_USECS);
+ coal->tx_coalesce_usecs = ICSSG_MIN_COALESCE_USECS;
+ }
+
+ emac->rx_pace_timeout_ns = coal->rx_coalesce_usecs * 1000;
+ tx_chn->tx_pace_timeout_ns = coal->tx_coalesce_usecs * 1000;
+
+ return 0;
+}
+
+static int emac_set_per_queue_coalesce(struct net_device *ndev, u32 queue,
+ struct ethtool_coalesce *coal)
+{
+ struct prueth_emac *emac = netdev_priv(ndev);
+ struct prueth *prueth = emac->prueth;
+ struct prueth_tx_chn *tx_chn;
+
+ if (queue >= PRUETH_MAX_TX_QUEUES)
+ return -EINVAL;
+
+ tx_chn = &emac->tx_chns[queue];
+
+ if (coal->tx_coalesce_usecs && coal->tx_coalesce_usecs < ICSSG_MIN_COALESCE_USECS) {
+ dev_info(prueth->dev, "defaulting to min value of %dus for tx-usecs for tx-%u\n",
+ ICSSG_MIN_COALESCE_USECS, queue);
+ coal->tx_coalesce_usecs = ICSSG_MIN_COALESCE_USECS;
+ }
+
+ tx_chn->tx_pace_timeout_ns = coal->tx_coalesce_usecs * 1000;
+
+ return 0;
+}
+
const struct ethtool_ops icssg_ethtool_ops = {
.get_drvinfo = emac_get_drvinfo,
.get_msglevel = emac_get_msglevel,
@@ -209,6 +291,11 @@ const struct ethtool_ops icssg_ethtool_ops = {
.get_ethtool_stats = emac_get_ethtool_stats,
.get_strings = emac_get_strings,
.get_ts_info = emac_get_ts_info,
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS,
+ .get_coalesce = emac_get_coalesce,
+ .set_coalesce = emac_set_coalesce,
+ .get_per_queue_coalesce = emac_get_per_queue_coalesce,
+ .set_per_queue_coalesce = emac_set_per_queue_coalesce,
.get_channels = emac_get_channels,
.set_channels = emac_set_channels,
.get_link_ksettings = emac_get_link_ksettings,
diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
index 186b0365c2e5..d620e88493c4 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
@@ -243,6 +243,16 @@ static void emac_adjust_link(struct net_device *ndev)
}
}

+static enum hrtimer_restart emac_rx_timer_callback(struct hrtimer *timer)
+{
+ struct prueth_emac *emac =
+ container_of(timer, struct prueth_emac, rx_hrtimer);
+ int rx_flow = PRUETH_RX_FLOW_DATA;
+
+ enable_irq(emac->rx_chns.irq[rx_flow]);
+ return HRTIMER_NORESTART;
+}
+
static int emac_phy_connect(struct prueth_emac *emac)
{
struct prueth *prueth = emac->prueth;
@@ -582,8 +592,10 @@ static int emac_ndo_stop(struct net_device *ndev)
netdev_err(ndev, "tx teardown timeout\n");

prueth_reset_tx_chan(emac, emac->tx_ch_num, true);
- for (i = 0; i < emac->tx_ch_num; i++)
+ for (i = 0; i < emac->tx_ch_num; i++) {
napi_disable(&emac->tx_chns[i].napi_tx);
+ hrtimer_cancel(&emac->tx_chns[i].tx_hrtimer);
+ }

max_rx_flows = PRUETH_MAX_RX_FLOWS;
k3_udma_glue_tdown_rx_chn(emac->rx_chns.rx_chn, true);
@@ -591,6 +603,7 @@ static int emac_ndo_stop(struct net_device *ndev)
prueth_reset_rx_chan(&emac->rx_chns, max_rx_flows, true);

napi_disable(&emac->napi_rx);
+ hrtimer_cancel(&emac->rx_hrtimer);

cancel_work_sync(&emac->rx_mode_work);

@@ -801,6 +814,8 @@ static int prueth_netdev_init(struct prueth *prueth,
ndev->features = ndev->hw_features;

netif_napi_add(ndev, &emac->napi_rx, emac_napi_rx_poll);
+ hrtimer_init(&emac->rx_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ emac->rx_hrtimer.function = &emac_rx_timer_callback;
prueth->emac[mac] = emac;

return 0;
diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h
index 82e38ef5635b..a78c5eb75fb8 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h
+++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h
@@ -108,6 +108,8 @@ struct prueth_tx_chn {
u32 descs_num;
unsigned int irq;
char name[32];
+ struct hrtimer tx_hrtimer;
+ unsigned long tx_pace_timeout_ns;
};

struct prueth_rx_chn {
@@ -127,6 +129,9 @@ struct prueth_rx_chn {

#define PRUETH_MAX_TX_TS_REQUESTS 50 /* Max simultaneous TX_TS requests */

+/* Minimum coalesce time in usecs for both Tx and Rx */
+#define ICSSG_MIN_COALESCE_USECS 20
+
/* data for each emac port */
struct prueth_emac {
bool is_sr1;
@@ -183,6 +188,10 @@ struct prueth_emac {

struct delayed_work stats_work;
u64 stats[ICSSG_NUM_STATS];
+
+ /* RX IRQ Coalescing Related */
+ struct hrtimer rx_hrtimer;
+ unsigned long rx_pace_timeout_ns;
};

/**
@@ -320,7 +329,7 @@ void prueth_ndev_del_tx_napi(struct prueth_emac *emac, int num);
void prueth_xmit_free(struct prueth_tx_chn *tx_chn,
struct cppi5_host_desc_t *desc);
int emac_tx_complete_packets(struct prueth_emac *emac, int chn,
- int budget);
+ int budget, bool *tdown);
int prueth_ndev_add_tx_napi(struct prueth_emac *emac);
int prueth_init_tx_chns(struct prueth_emac *emac);
int prueth_init_rx_chns(struct prueth_emac *emac,

base-commit: 1c04b46cbdddc7882eeb671521035ea884245b9f
--
2.34.1



2024-04-24 18:31:52

by Andrew Lunn

[permalink] [raw]
Subject: Re: [PATCH net-next] net: ti: icssg_prueth: Add SW TX / RX Coalescing based on hrtimers

On Wed, Apr 24, 2024 at 02:48:23PM +0530, MD Danish Anwar wrote:
> Add SW IRQ coalescing based on hrtimers for RX and TX data path for ICSSG
> driver, which can be enabled by ethtool commands:
>
> - RX coalescing
> ethtool -C eth1 rx-usecs 50
>
> - TX coalescing can be enabled per TX queue
>
> - by default enables coalesing for TX0
> ethtool -C eth1 tx-usecs 50
> - configure TX0
> ethtool -Q eth0 queue_mask 1 --coalesce tx-usecs 100
> - configure TX1
> ethtool -Q eth0 queue_mask 2 --coalesce tx-usecs 100
> - configure TX0 and TX1
> ethtool -Q eth0 queue_mask 3 --coalesce tx-usecs 100 --coalesce
> tx-usecs 100
>
> Minimum value for both rx-usecs and tx-usecs is 20us.

Do you have some benchmark numbers?

Did you see this patch on the mailing list:

https://lore.kernel.org/all/[email protected]/T/#md50cb07bbdd6daf985f3796508cf4b246b085268

This is basically a one line change, which brings big performance
gains. Did you try something as simple as that, rather than all your
hrtimer code?

Andrew

2024-04-25 06:44:42

by MD Danish Anwar

[permalink] [raw]
Subject: Re: [PATCH net-next] net: ti: icssg_prueth: Add SW TX / RX Coalescing based on hrtimers

Hi Andrew,

On 24/04/24 6:01 pm, Andrew Lunn wrote:
> On Wed, Apr 24, 2024 at 02:48:23PM +0530, MD Danish Anwar wrote:
>> Add SW IRQ coalescing based on hrtimers for RX and TX data path for ICSSG
>> driver, which can be enabled by ethtool commands:
>>
>> - RX coalescing
>> ethtool -C eth1 rx-usecs 50
>>
>> - TX coalescing can be enabled per TX queue
>>
>> - by default enables coalesing for TX0
>> ethtool -C eth1 tx-usecs 50
>> - configure TX0
>> ethtool -Q eth0 queue_mask 1 --coalesce tx-usecs 100
>> - configure TX1
>> ethtool -Q eth0 queue_mask 2 --coalesce tx-usecs 100
>> - configure TX0 and TX1
>> ethtool -Q eth0 queue_mask 3 --coalesce tx-usecs 100 --coalesce
>> tx-usecs 100
>>
>> Minimum value for both rx-usecs and tx-usecs is 20us.
>
> Do you have some benchmark numbers?
>
> Did you see this patch on the mailing list:
>
> https://lore.kernel.org/all/[email protected]/T/#md50cb07bbdd6daf985f3796508cf4b246b085268
>
> This is basically a one line change, which brings big performance
> gains. Did you try something as simple as that, rather than all your
> hrtimer code?
>

I did some benchmarking today with,
1. Default driver (without any IRQ coalescing enabled)
2. IRQ Coalescing (With this patch)
3. Default IRQ Coalescing (Suggested by you in the above patch)

I have pasted the full logs at [1].

Below are the final numbers,

==============================================================
Method | Tput_TX | CPU_TX | Tput_RX | CPU_RX |
==============================================================
Default Driver 943 Mbps 31% 517 Mbps 38% |
IRQ Coalescing (Patch) 943 Mbps 28% 518 Mbps 25% |
Default IRQ Coalescing 942 Mbps 32% 521 Mbps 25% |
==============================================================

I see that the performance number is more or less same for all three
methods only the CPU load seems to be varying. The IRQ coalescing patch
(using hrtimer) seems to improve the cpu load by 3-4% in TX and 13% in
RX. Whereas the default method that you have suggested doesn't give any
improvemnet in tx however cpu load improves in RX with the same amount
as method 2.

Please let me know if this patch is OK to you based on the benchmarking?

[1]
https://gist.githubusercontent.com/danish-ti/47855631be9f3635cee994693662a988/raw/94b4eb86b42fe243ab03186a88a314e0cb272fd0/gistfile1.txt

> Andrew

--
Thanks and Regards,
Danish

2024-04-25 12:58:52

by Andrew Lunn

[permalink] [raw]
Subject: Re: [PATCH net-next] net: ti: icssg_prueth: Add SW TX / RX Coalescing based on hrtimers

> I did some benchmarking today with,
> 1. Default driver (without any IRQ coalescing enabled)
> 2. IRQ Coalescing (With this patch)
> 3. Default IRQ Coalescing (Suggested by you in the above patch)
>
> I have pasted the full logs at [1].
>
> Below are the final numbers,
>
> ==============================================================
> Method | Tput_TX | CPU_TX | Tput_RX | CPU_RX |
> ==============================================================
> Default Driver 943 Mbps 31% 517 Mbps 38% |
> IRQ Coalescing (Patch) 943 Mbps 28% 518 Mbps 25% |
> Default IRQ Coalescing 942 Mbps 32% 521 Mbps 25% |
> ==============================================================
>
> I see that the performance number is more or less same for all three
> methods only the CPU load seems to be varying. The IRQ coalescing patch
> (using hrtimer) seems to improve the cpu load by 3-4% in TX and 13% in
> RX. Whereas the default method that you have suggested doesn't give any
> improvemnet in tx however cpu load improves in RX with the same amount
> as method 2.
>
> Please let me know if this patch is OK to you based on the benchmarking?

It is good to include benchmark results in patches which claim to
improve performance. Please add the default and the patch version
results to the commit message.

The numbers show your more complex version does bring benefits, so it
is O.K. to use it. I just wounder how many other drivers would benefit
from a one line change.

Andrew

2024-04-27 13:41:39

by Simon Horman

[permalink] [raw]
Subject: Re: [PATCH net-next] net: ti: icssg_prueth: Add SW TX / RX Coalescing based on hrtimers

On Wed, Apr 24, 2024 at 02:48:23PM +0530, MD Danish Anwar wrote:
> Add SW IRQ coalescing based on hrtimers for RX and TX data path for ICSSG
> driver, which can be enabled by ethtool commands:
>
> - RX coalescing
> ethtool -C eth1 rx-usecs 50
>
> - TX coalescing can be enabled per TX queue
>
> - by default enables coalesing for TX0
> ethtool -C eth1 tx-usecs 50
> - configure TX0
> ethtool -Q eth0 queue_mask 1 --coalesce tx-usecs 100
> - configure TX1
> ethtool -Q eth0 queue_mask 2 --coalesce tx-usecs 100
> - configure TX0 and TX1
> ethtool -Q eth0 queue_mask 3 --coalesce tx-usecs 100 --coalesce
> tx-usecs 100
>
> Minimum value for both rx-usecs and tx-usecs is 20us.
>
> Comapared to gro_flush_timeout and napi_defer_hard_irqs this patch

nit: Compared

> allows to enable IRQ coalescing for RX path separately.
>
> Signed-off-by: MD Danish Anwar <[email protected]>

..

> @@ -190,19 +191,35 @@ int emac_tx_complete_packets(struct prueth_emac *emac, int chn,
> return num_tx;
> }
>
> +static enum hrtimer_restart emac_tx_timer_callback(struct hrtimer *timer)
> +{
> + struct prueth_tx_chn *tx_chns =
> + container_of(timer, struct prueth_tx_chn, tx_hrtimer);
> +
> + enable_irq(tx_chns->irq);
> + return HRTIMER_NORESTART;
> +}
> +
> static int emac_napi_tx_poll(struct napi_struct *napi_tx, int budget)
> {
> struct prueth_tx_chn *tx_chn = prueth_napi_to_tx_chn(napi_tx);
> struct prueth_emac *emac = tx_chn->emac;
> + bool tdown = false;
> int num_tx_packets;
>
> - num_tx_packets = emac_tx_complete_packets(emac, tx_chn->id, budget);
> + num_tx_packets = emac_tx_complete_packets(emac, tx_chn->id, budget, &tdown);

Please consider limiting lines to 80 columns wide in Networking code.

>
> if (num_tx_packets >= budget)
> return budget;
>
> - if (napi_complete_done(napi_tx, num_tx_packets))
> - enable_irq(tx_chn->irq);
> + if (napi_complete_done(napi_tx, num_tx_packets)) {
> + if (unlikely(tx_chn->tx_pace_timeout_ns && !tdown))
> + hrtimer_start(&tx_chn->tx_hrtimer,
> + ns_to_ktime(tx_chn->tx_pace_timeout_ns),
> + HRTIMER_MODE_REL_PINNED);
> + else
> + enable_irq(tx_chn->irq);
> + }
>
> return num_tx_packets;
> }

..

> @@ -870,7 +889,12 @@ int emac_napi_rx_poll(struct napi_struct *napi_rx, int budget)
> }
>
> if (num_rx < budget && napi_complete_done(napi_rx, num_rx))
> - enable_irq(emac->rx_chns.irq[rx_flow]);
> + if (unlikely(emac->rx_pace_timeout_ns))
> + hrtimer_start(&emac->rx_hrtimer,
> + ns_to_ktime(emac->rx_pace_timeout_ns),
> + HRTIMER_MODE_REL_PINNED);
> + else
> + enable_irq(emac->rx_chns.irq[rx_flow]);

clang-18 and gcc-13 both complain about the if/else logic above.
I think it would be best to add {} to the outer if statement.

>
> return num_rx;
> }

..

2024-04-29 05:55:29

by MD Danish Anwar

[permalink] [raw]
Subject: Re: [PATCH net-next] net: ti: icssg_prueth: Add SW TX / RX Coalescing based on hrtimers



On 25/04/24 6:19 pm, Andrew Lunn wrote:
>> I did some benchmarking today with,
>> 1. Default driver (without any IRQ coalescing enabled)
>> 2. IRQ Coalescing (With this patch)
>> 3. Default IRQ Coalescing (Suggested by you in the above patch)
>>
>> I have pasted the full logs at [1].
>>
>> Below are the final numbers,
>>
>> ==============================================================
>> Method | Tput_TX | CPU_TX | Tput_RX | CPU_RX |
>> ==============================================================
>> Default Driver 943 Mbps 31% 517 Mbps 38% |
>> IRQ Coalescing (Patch) 943 Mbps 28% 518 Mbps 25% |
>> Default IRQ Coalescing 942 Mbps 32% 521 Mbps 25% |
>> ==============================================================
>>
>> I see that the performance number is more or less same for all three
>> methods only the CPU load seems to be varying. The IRQ coalescing patch
>> (using hrtimer) seems to improve the cpu load by 3-4% in TX and 13% in
>> RX. Whereas the default method that you have suggested doesn't give any
>> improvemnet in tx however cpu load improves in RX with the same amount
>> as method 2.
>>
>> Please let me know if this patch is OK to you based on the benchmarking?
>
> It is good to include benchmark results in patches which claim to
> improve performance. Please add the default and the patch version
> results to the commit message.

Sure, I will add the benchmarking numbers in commit message and send v2.

>
> The numbers show your more complex version does bring benefits, so it
> is O.K. to use it. I just wounder how many other drivers would benefit
> from a one line change.
>
> Andrew

--
Thanks and Regards,
Danish

2024-04-29 05:55:56

by MD Danish Anwar

[permalink] [raw]
Subject: Re: [PATCH net-next] net: ti: icssg_prueth: Add SW TX / RX Coalescing based on hrtimers



On 27/04/24 7:11 pm, Simon Horman wrote:
> On Wed, Apr 24, 2024 at 02:48:23PM +0530, MD Danish Anwar wrote:
>> Add SW IRQ coalescing based on hrtimers for RX and TX data path for ICSSG
>> driver, which can be enabled by ethtool commands:
>>
>> - RX coalescing
>> ethtool -C eth1 rx-usecs 50
>>
>> - TX coalescing can be enabled per TX queue
>>
>> - by default enables coalesing for TX0
>> ethtool -C eth1 tx-usecs 50
>> - configure TX0
>> ethtool -Q eth0 queue_mask 1 --coalesce tx-usecs 100
>> - configure TX1
>> ethtool -Q eth0 queue_mask 2 --coalesce tx-usecs 100
>> - configure TX0 and TX1
>> ethtool -Q eth0 queue_mask 3 --coalesce tx-usecs 100 --coalesce
>> tx-usecs 100
>>
>> Minimum value for both rx-usecs and tx-usecs is 20us.
>>
>> Comapared to gro_flush_timeout and napi_defer_hard_irqs this patch
>
> nit: Compared
>
>> allows to enable IRQ coalescing for RX path separately.
>>
>> Signed-off-by: MD Danish Anwar <[email protected]>
>
> ...
>
>> @@ -190,19 +191,35 @@ int emac_tx_complete_packets(struct prueth_emac *emac, int chn,
>> return num_tx;
>> }
>>
>> +static enum hrtimer_restart emac_tx_timer_callback(struct hrtimer *timer)
>> +{
>> + struct prueth_tx_chn *tx_chns =
>> + container_of(timer, struct prueth_tx_chn, tx_hrtimer);
>> +
>> + enable_irq(tx_chns->irq);
>> + return HRTIMER_NORESTART;
>> +}
>> +
>> static int emac_napi_tx_poll(struct napi_struct *napi_tx, int budget)
>> {
>> struct prueth_tx_chn *tx_chn = prueth_napi_to_tx_chn(napi_tx);
>> struct prueth_emac *emac = tx_chn->emac;
>> + bool tdown = false;
>> int num_tx_packets;
>>
>> - num_tx_packets = emac_tx_complete_packets(emac, tx_chn->id, budget);
>> + num_tx_packets = emac_tx_complete_packets(emac, tx_chn->id, budget, &tdown);
>
> Please consider limiting lines to 80 columns wide in Networking code.
>
>>
>> if (num_tx_packets >= budget)
>> return budget;
>>
>> - if (napi_complete_done(napi_tx, num_tx_packets))
>> - enable_irq(tx_chn->irq);
>> + if (napi_complete_done(napi_tx, num_tx_packets)) {
>> + if (unlikely(tx_chn->tx_pace_timeout_ns && !tdown))
>> + hrtimer_start(&tx_chn->tx_hrtimer,
>> + ns_to_ktime(tx_chn->tx_pace_timeout_ns),
>> + HRTIMER_MODE_REL_PINNED);
>> + else
>> + enable_irq(tx_chn->irq);
>> + }
>>
>> return num_tx_packets;
>> }
>
> ...
>
>> @@ -870,7 +889,12 @@ int emac_napi_rx_poll(struct napi_struct *napi_rx, int budget)
>> }
>>
>> if (num_rx < budget && napi_complete_done(napi_rx, num_rx))
>> - enable_irq(emac->rx_chns.irq[rx_flow]);
>> + if (unlikely(emac->rx_pace_timeout_ns))
>> + hrtimer_start(&emac->rx_hrtimer,
>> + ns_to_ktime(emac->rx_pace_timeout_ns),
>> + HRTIMER_MODE_REL_PINNED);
>> + else
>> + enable_irq(emac->rx_chns.irq[rx_flow]);
>
> clang-18 and gcc-13 both complain about the if/else logic above.
> I think it would be best to add {} to the outer if statement.
>
>>
>> return num_rx;
>> }
>
> ...

Sure Simon, I'll fix all this and send v2.

--
Thanks and Regards,
Danish