Pad small packets to ETH_ZLEN before transmit, clean up dma sync calls,
add xmit_more support, and then remove atomic variable usage in the
Tx data path.
Changes:
V2:
- Added patch for padding small packets to ETH_ZLEN, part of the
transmit code optimization patches that was left out of V1
- Updated changelog to provide more details for dma_sync remove patch
- Updated changelog to use imperative tone in add xmit_more patch
V1: https://lore.kernel.org/all/[email protected]/
Shinas Rasheed (4):
octeon_ep: add padding for small packets
octeon_ep: remove dma sync in transmit path
octeon_ep: implement xmit_more in transmit
octeon_ep: remove atomic variable usage in Tx data path
.../ethernet/marvell/octeon_ep/octep_config.h | 3 +-
.../ethernet/marvell/octeon_ep/octep_main.c | 38 ++++++++++---------
.../ethernet/marvell/octeon_ep/octep_main.h | 9 +++++
.../net/ethernet/marvell/octeon_ep/octep_tx.c | 5 +--
.../net/ethernet/marvell/octeon_ep/octep_tx.h | 3 --
5 files changed, 33 insertions(+), 25 deletions(-)
--
2.25.1
Add xmit_more handling in tx datapath for octeon_ep pf.
Signed-off-by: Shinas Rasheed <[email protected]>
---
V2:
- Updated changelog to have imperative tone.
V1: https://lore.kernel.org/all/[email protected]/
.../ethernet/marvell/octeon_ep/octep_config.h | 2 +-
.../ethernet/marvell/octeon_ep/octep_main.c | 19 +++++++++++++++----
2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
index 1622a6ebf036..ed8b1ace56b9 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
@@ -15,7 +15,7 @@
/* Tx Queue: maximum descriptors per ring */
#define OCTEP_IQ_MAX_DESCRIPTORS 1024
/* Minimum input (Tx) requests to be enqueued to ring doorbell */
-#define OCTEP_DB_MIN 1
+#define OCTEP_DB_MIN 8
/* Packet threshold for Tx queue interrupt */
#define OCTEP_IQ_INTR_THRESHOLD 0x0
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
index 1c02304677c9..1a88a6bf0598 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
@@ -818,6 +818,7 @@ static netdev_tx_t octep_start_xmit(struct sk_buff *skb,
struct octep_iq *iq;
skb_frag_t *frag;
u16 nr_frags, si;
+ int xmit_more;
u16 q_no, wi;
if (skb_put_padto(skb, ETH_ZLEN))
@@ -895,18 +896,28 @@ static netdev_tx_t octep_start_xmit(struct sk_buff *skb,
}
netdev_tx_sent_queue(iq->netdev_q, skb->len);
+
+ xmit_more = netdev_xmit_more();
+
skb_tx_timestamp(skb);
atomic_inc(&iq->instr_pending);
+ iq->fill_cnt++;
wi++;
if (wi == iq->max_count)
wi = 0;
iq->host_write_index = wi;
+ if (xmit_more &&
+ (atomic_read(&iq->instr_pending) <
+ (iq->max_count - OCTEP_WAKE_QUEUE_THRESHOLD)) &&
+ iq->fill_cnt < iq->fill_threshold)
+ return NETDEV_TX_OK;
+
/* Flush the hw descriptor before writing to doorbell */
wmb();
-
- /* Ring Doorbell to notify the NIC there is a new packet */
- writel(1, iq->doorbell_reg);
- iq->stats.instr_posted++;
+ /* Ring Doorbell to notify the NIC of new packets */
+ writel(iq->fill_cnt, iq->doorbell_reg);
+ iq->stats.instr_posted += iq->fill_cnt;
+ iq->fill_cnt = 0;
return NETDEV_TX_OK;
dma_map_sg_err:
--
2.25.1
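For reference, the deferral condition above can be read as a single
predicate: the doorbell write is postponed only while the stack hints that
more packets are coming, the ring still has ample space, and fewer than
fill_threshold packets have been batched since the last doorbell. A
standalone restatement of that logic (a sketch for illustration;
octep_defer_doorbell is a hypothetical helper, not part of the patch):

static bool octep_defer_doorbell(struct octep_iq *iq, int xmit_more)
{
	/* Defer only while all three hold:
	 *  - the stack has hinted more packets are coming (xmit_more),
	 *  - the ring is far from full, so a soon-to-be-stopped queue
	 *    is never left waiting on an unrung doorbell, and
	 *  - the current batch is still below the flush threshold.
	 */
	return xmit_more &&
	       atomic_read(&iq->instr_pending) <
			(iq->max_count - OCTEP_WAKE_QUEUE_THRESHOLD) &&
	       iq->fill_cnt < iq->fill_threshold;
}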
Replace the atomic variable "instr_pending", which tracks the number of
posted Tx instructions pending completion, with the host_write_index and
flush_index variables in the xmit and completion processing paths
respectively.
Signed-off-by: Shinas Rasheed <[email protected]>
---
V2:
- No changes
V1: https://lore.kernel.org/all/[email protected]/
drivers/net/ethernet/marvell/octeon_ep/octep_config.h | 1 +
drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 11 ++++-------
drivers/net/ethernet/marvell/octeon_ep/octep_main.h | 9 +++++++++
drivers/net/ethernet/marvell/octeon_ep/octep_tx.c | 5 +----
drivers/net/ethernet/marvell/octeon_ep/octep_tx.h | 3 ---
5 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
index ed8b1ace56b9..91cfa19c65b9 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
@@ -13,6 +13,7 @@
#define OCTEP_64BYTE_INSTR 64
/* Tx Queue: maximum descriptors per ring */
+/* This needs to be a power of 2 */
#define OCTEP_IQ_MAX_DESCRIPTORS 1024
/* Minimum input (Tx) requests to be enqueued to ring doorbell */
#define OCTEP_DB_MIN 8
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
index 1a88a6bf0598..53b7fbfe39ff 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
@@ -777,7 +777,7 @@ static int octep_stop(struct net_device *netdev)
*/
static inline int octep_iq_full_check(struct octep_iq *iq)
{
- if (likely((iq->max_count - atomic_read(&iq->instr_pending)) >=
+ if (likely((IQ_INSTR_SPACE(iq)) >
OCTEP_WAKE_QUEUE_THRESHOLD))
return 0;
@@ -787,7 +787,7 @@ static inline int octep_iq_full_check(struct octep_iq *iq)
/* check again and restart the queue, in case NAPI has just freed
* enough Tx ring entries.
*/
- if (unlikely((iq->max_count - atomic_read(&iq->instr_pending)) >=
+ if (unlikely(IQ_INSTR_SPACE(iq) >
OCTEP_WAKE_QUEUE_THRESHOLD)) {
netif_start_subqueue(iq->netdev, iq->q_no);
iq->stats.restart_cnt++;
@@ -900,14 +900,11 @@ static netdev_tx_t octep_start_xmit(struct sk_buff *skb,
xmit_more = netdev_xmit_more();
skb_tx_timestamp(skb);
- atomic_inc(&iq->instr_pending);
iq->fill_cnt++;
wi++;
- if (wi == iq->max_count)
- wi = 0;
- iq->host_write_index = wi;
+ iq->host_write_index = wi & iq->ring_size_mask;
if (xmit_more &&
- (atomic_read(&iq->instr_pending) <
+ (IQ_INSTR_PENDING(iq) <
(iq->max_count - OCTEP_WAKE_QUEUE_THRESHOLD)) &&
iq->fill_cnt < iq->fill_threshold)
return NETDEV_TX_OK;
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
index 6df902ebb7f3..c33e046b69a4 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
@@ -40,6 +40,15 @@
#define OCTEP_OQ_INTR_RESEND_BIT 59
#define OCTEP_MMIO_REGIONS 3
+
+#define IQ_INSTR_PENDING(iq) ({ typeof(iq) iq__ = (iq); \
+ ((iq__)->host_write_index - (iq__)->flush_index) & \
+ (iq__)->ring_size_mask; \
+ })
+#define IQ_INSTR_SPACE(iq) ({ typeof(iq) iq_ = (iq); \
+ (iq_)->max_count - IQ_INSTR_PENDING(iq_); \
+ })
+
/* PCI address space mapping information.
* Each of the 3 address spaces given by BAR0, BAR2 and BAR4 of
* Octeon gets mapped to different physical address spaces in
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_tx.c b/drivers/net/ethernet/marvell/octeon_ep/octep_tx.c
index d0adb82d65c3..06851b78aa28 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_tx.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_tx.c
@@ -21,7 +21,6 @@ static void octep_iq_reset_indices(struct octep_iq *iq)
iq->flush_index = 0;
iq->pkts_processed = 0;
iq->pkt_in_done = 0;
- atomic_set(&iq->instr_pending, 0);
}
/**
@@ -82,7 +81,6 @@ int octep_iq_process_completions(struct octep_iq *iq, u16 budget)
}
iq->pkts_processed += compl_pkts;
- atomic_sub(compl_pkts, &iq->instr_pending);
iq->stats.instr_completed += compl_pkts;
iq->stats.bytes_sent += compl_bytes;
iq->stats.sgentry_sent += compl_sg;
@@ -91,7 +89,7 @@ int octep_iq_process_completions(struct octep_iq *iq, u16 budget)
netdev_tx_completed_queue(iq->netdev_q, compl_pkts, compl_bytes);
if (unlikely(__netif_subqueue_stopped(iq->netdev, iq->q_no)) &&
- ((iq->max_count - atomic_read(&iq->instr_pending)) >
+ (IQ_INSTR_SPACE(iq) >
OCTEP_WAKE_QUEUE_THRESHOLD))
netif_wake_subqueue(iq->netdev, iq->q_no);
return !budget;
@@ -144,7 +142,6 @@ static void octep_iq_free_pending(struct octep_iq *iq)
dev_kfree_skb_any(skb);
}
- atomic_set(&iq->instr_pending, 0);
iq->flush_index = fi;
netdev_tx_reset_queue(netdev_get_tx_queue(iq->netdev, iq->q_no));
}
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_tx.h b/drivers/net/ethernet/marvell/octeon_ep/octep_tx.h
index 86c98b13fc44..1ba4ff65e54d 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_tx.h
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_tx.h
@@ -172,9 +172,6 @@ struct octep_iq {
/* Statistics for this input queue. */
struct octep_iq_stats stats;
- /* This field keeps track of the instructions pending in this queue. */
- atomic_t instr_pending;
-
/* Pointer to the Virtual Base addr of the input ring. */
struct octep_tx_desc_hw *desc_ring;
--
2.25.1
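The mask-based macros above rely on OCTEP_IQ_MAX_DESCRIPTORS being a power
of two: the unsigned difference of the two indices, ANDed with
ring_size_mask, yields the pending count even across index wraparound, so
no atomic counter is needed while host_write_index and flush_index each
have a single writer (the xmit path and the completion path respectively).
A small standalone illustration of the arithmetic (hypothetical values,
not driver code):

#include <stdio.h>

#define RING_SIZE 1024u              /* must be a power of two */
#define RING_MASK (RING_SIZE - 1)

/* Same arithmetic as the IQ_INSTR_PENDING() macro */
static unsigned int pending(unsigned int host_write_index,
			    unsigned int flush_index)
{
	return (host_write_index - flush_index) & RING_MASK;
}

int main(void)
{
	printf("%u\n", pending(6, 0));    /* no wrap: 6 pending */
	printf("%u\n", pending(2, 1020)); /* wrapped: (2 - 1020) & 1023 = 6 */
	return 0;
}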
Remove dma sync calls for the scatter-gather lists, since they are
coherent allocations and do not need explicit sync calls.
Signed-off-by: Shinas Rasheed <[email protected]>
---
V2:
- Provided more details in changelog
V1: https://lore.kernel.org/all/[email protected]/
drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
index 2c86b911a380..1c02304677c9 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
@@ -872,9 +872,6 @@ static netdev_tx_t octep_start_xmit(struct sk_buff *skb,
if (dma_mapping_error(iq->dev, dma))
goto dma_map_err;
- dma_sync_single_for_cpu(iq->dev, tx_buffer->sglist_dma,
- OCTEP_SGLIST_SIZE_PER_PKT,
- DMA_TO_DEVICE);
memset(sglist, 0, OCTEP_SGLIST_SIZE_PER_PKT);
sglist[0].len[3] = len;
sglist[0].dma_ptr[0] = dma;
@@ -894,10 +891,6 @@ static netdev_tx_t octep_start_xmit(struct sk_buff *skb,
frag++;
si++;
}
- dma_sync_single_for_device(iq->dev, tx_buffer->sglist_dma,
- OCTEP_SGLIST_SIZE_PER_PKT,
- DMA_TO_DEVICE);
-
hw_desc->dptr = tx_buffer->sglist_dma;
}
--
2.25.1
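For background on why the removed sync calls were unnecessary:
dma_sync_single_for_cpu()/dma_sync_single_for_device() hand buffer
ownership back and forth for streaming mappings only, whereas memory
obtained from dma_alloc_coherent() stays coherent between CPU and device
for its whole lifetime. A brief sketch of the two regimes (generic DMA-API
usage for illustration, not octeon_ep code):

/* Coherent allocation: CPU stores are visible to the device
 * without any dma_sync_* calls.
 */
sglist = dma_alloc_coherent(dev, size, &sglist_dma, GFP_KERNEL);
sglist[0].dma_ptr[0] = dma;	/* no sync needed */

/* Streaming mapping: ownership must be transferred explicitly. */
buf_dma = dma_map_single(dev, buf, size, DMA_TO_DEVICE);
dma_sync_single_for_cpu(dev, buf_dma, size, DMA_TO_DEVICE);
/* ... CPU may touch buf here ... */
dma_sync_single_for_device(dev, buf_dma, size, DMA_TO_DEVICE);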
Pad small packets to ETH_ZLEN before transmit.
Signed-off-by: Shinas Rasheed <[email protected]>
---
V1->V2:
- New patch
drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
index 552970c7dec0..2c86b911a380 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
@@ -820,6 +820,9 @@ static netdev_tx_t octep_start_xmit(struct sk_buff *skb,
u16 nr_frags, si;
u16 q_no, wi;
+ if (skb_put_padto(skb, ETH_ZLEN))
+ return NETDEV_TX_OK;
+
q_no = skb_get_queue_mapping(skb);
if (q_no >= oct->num_iqs) {
netdev_err(netdev, "Invalid Tx skb->queue_mapping=%d\n", q_no);
--
2.25.1
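A note on the call's semantics: skb_put_padto() zero-pads the frame up to
the given length (ETH_ZLEN, the 60-byte minimum Ethernet frame size) and,
if the padding allocation fails, frees the skb and returns an error. The
skb is thus consumed on both paths, which is why the handler returns
NETDEV_TX_OK rather than an error code:

	/* Pad short frames to the 60-byte minimum; on failure the skb
	 * has already been freed, so it is consumed either way.
	 */
	if (skb_put_padto(skb, ETH_ZLEN))
		return NETDEV_TX_OK;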
On Tue, 24 Oct 2023 07:51:16 -0700 Shinas Rasheed wrote:
> Pad small packets to ETH_ZLEN before transmit.
The commit message needs to focus on the "why", rather than "what".
--
pw-bot: cr
On Tue, 24 Oct 2023 07:51:18 -0700 Shinas Rasheed wrote:
> iq->host_write_index = wi;
> + if (xmit_more &&
> + (atomic_read(&iq->instr_pending) <
> + (iq->max_count - OCTEP_WAKE_QUEUE_THRESHOLD)) &&
> + iq->fill_cnt < iq->fill_threshold)
> + return NETDEV_TX_OK;
Does this guarantee that a full-sized skb can be accommodated?
If so - consider stopping the queue when the condition
is not true. The recommended way of implementing 'driver flow control'
is to stop the queue once next packet may not fit, and then use
netif_xmit_stopped() when deciding whether we need to flush or we can
trust xmit_more. see
https://www.kernel.org/doc/html/next/networking/driver.html#transmit-path-guidelines
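The pattern from that document looks roughly like the following (a generic
sketch of the guideline for illustration; the xyz_* helpers and
MAX_DESCS_PER_SKB are placeholders, not octeon_ep symbols):

static netdev_tx_t xyz_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

	/* Queue descriptors for this skb, then stop the queue early,
	 * while the *next* full-sized packet might not fit, so the
	 * driver never has to return NETDEV_TX_BUSY.
	 */
	xyz_queue_skb(dev, skb);
	if (xyz_ring_space(dev) < MAX_DESCS_PER_SKB)
		netif_tx_stop_queue(txq);

	/* Trust xmit_more only while the queue is still running;
	 * if it was just stopped, flush the doorbell now.
	 */
	if (!netdev_xmit_more() || netif_xmit_stopped(txq))
		xyz_ring_doorbell(dev);

	return NETDEV_TX_OK;
}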
> /* Flush the hw descriptor before writing to doorbell */
> wmb();
> -
> - /* Ring Doorbell to notify the NIC there is a new packet */
> - writel(1, iq->doorbell_reg);
> - iq->stats.instr_posted++;
> + /* Ring Doorbell to notify the NIC of new packets */
> + writel(iq->fill_cnt, iq->doorbell_reg);
> + iq->stats.instr_posted += iq->fill_cnt;
> + iq->fill_cnt = 0;
> return NETDEV_TX_OK;
Hi Jakub,

Again, thanks for your review.

> Does this guarantee that a full-sized skb can be accommodated?

I assume by full-sized skb you mean a non-linear skb with MAX_SG_FRAGS
elements in the frags array. Yes, this can be accommodated: the hardware
uses separate SG list memory to pick up the skb frags instead of
obtaining them from the same Tx hardware queue. In other words, a single
Tx descriptor is enough even for a full-sized skb, since the hardware
reads the SG frags from separate memory and doesn't require separate
descriptors.

> If so - consider stopping the queue when the condition is not true.

We do stop the queue if the Tx queue is full, as in octep_iq_full_check
earlier on.

> The recommended way of implementing 'driver flow control' is to stop
> the queue once next packet may not fit, and then use
> netif_xmit_stopped() when deciding whether we need to flush or we can
> trust xmit_more. see
> https://www.kernel.org/doc/html/next/networking/driver.html#transmit-path-guidelines

In the skeleton code in that document, as I understand it, each Tx
descriptor holds an skb frag, so it is possible to receive a full-sized
skb, stop the queue, and then observe the queue stopped on receiving
another normal skb. This doesn't arise in our case: even a full-sized
skb uses only a single Tx descriptor, so we can be sure that if the
queue has been stopped, the write index will only be updated once the
posted (and read) Tx descriptors are processed in NAPI context and the
queues are awoken.

Please correct me if I'm wrong anywhere (sorry if so) to further my
understanding, and again thanks for your time!
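To illustrate the single-descriptor point with the layout used in
octep_start_xmit() (a simplified sketch; len and dma stand in for the
values computed for the linear part of the skb):

	/* One ring entry per skb regardless of frag count: the
	 * descriptor carries only a pointer to a per-buffer SG list
	 * that lives outside the Tx ring, so frags never consume
	 * extra ring entries.
	 */
	memset(sglist, 0, OCTEP_SGLIST_SIZE_PER_PKT);
	sglist[0].len[3] = len;		/* length of the skb head */
	sglist[0].dma_ptr[0] = dma;	/* DMA address of the head */
	/* ... each frag fills the next sglist slot, not a ring slot ... */
	hw_desc->dptr = tx_buffer->sglist_dma;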
On Thu, Oct 26, 2023 at 9:58 AM Shinas Rasheed <[email protected]> wrote:
> [...]
Fact that octep_start_xmit() can return NETDEV_TX_BUSY is very suspicious.
I do not think a driver can implement xmit_more and keep
NETDEV_TX_BUSY at the same time.
Please make sure to remove NETDEV_TX_BUSY first, by stopping the queue earlier.
On Thu, 26 Oct 2023 07:57:54 +0000 Shinas Rasheed wrote:
> [...]
Could you please do some training on how to use normal / mailing list
style email at Marvell? Multiple people from Marvell can't quote
replies correctly, it makes it hard to have a conversation and help
y'all.
> -----Original Message-----
> From: Jakub Kicinski <[email protected]>
> Sent: Thursday, October 26, 2023 8:15 PM
> Subject: Re: [EXT] Re: [PATCH net-next v2 3/4] octeon_ep: implement
> xmit_more in transmit
>
> Could you please do some training on how to use normal / mailing list
> style email at Marvell? Multiple people from Marvell can't quote
> replies correctly, it makes it hard to have a conversation and help
> y'all.
Hi Jakub, apologies for the format errors on my part; I will try to rectify them in coming mails. Sorry again.
> -----Original Message-----
> From: Eric Dumazet <[email protected]>
> Sent: Thursday, October 26, 2023 1:59 PM
> Subject: Re: [EXT] Re: [PATCH net-next v2 3/4] octeon_ep: implement
> xmit_more in transmit
>
> Fact that octep_start_xmit() can return NETDEV_TX_BUSY is very suspicious.
>
> I do not think a driver can implement xmit_more and keep
> NETDEV_TX_BUSY at the same time.
>
> Please make sure to remove NETDEV_TX_BUSY first, by stopping the queue
> earlier.
Hi Eric, I think I understand your point and shall submit an updated patch. Thanks for your time!
From: Shinas Rasheed
> Sent: 24 October 2023 15:51
>
> Add xmit_more handling in tx datapath for octeon_ep pf.
>
...
> -
> - /* Ring Doorbell to notify the NIC there is a new packet */
> - writel(1, iq->doorbell_reg);
> - iq->stats.instr_posted++;
> + /* Ring Doorbell to notify the NIC of new packets */
> + writel(iq->fill_cnt, iq->doorbell_reg);
> + iq->stats.instr_posted += iq->fill_cnt;
> + iq->fill_cnt = 0;
> return NETDEV_TX_OK;
Does that really need the count?
A 'doorbell' register usually just tells the MAC engine
to go and look at the transmit ring.
It then continues to process transmits until it fails
to find a packet.
So if the transmit is active you don't need to set the bit.
(Although that is actually rather hard to detect.)
The 'xmit_more' flag is useful if (the equivalent of) writing
the doorbell register is expensive since it can be delayed
to a later frame and only done once - adding a slight latency
to the earlier transmits if the mac engine was idle.
I'm not sure how much (if any) performance gain you actually
get from avoiding the writel().
Single PCIe writes are 'posted' and pretty much completely
asynchronous.
The other problem I've seen is that netdev_xmit_more() is
the state of the queue when the transmit was started, not
the current state.
If a packet is added while the earlier transmit setup code
is running (setting up the descriptors etc) then it isn't set.
So the fast path doesn't get taken.
David
Hi,
I understand the window is closed, but just replying to a pending comment on the thread.
> -----Original Message-----
> From: David Laight <[email protected]>
>
> Does that really need the count?
> A 'doorbell' register usually just tells the MAC engine
> to go and look at the transmit ring.
> It then continues to process transmits until it fails
> to find a packet.
> So if the transmit is active you don't need to set the bit.
> (Although that is actually rather hard to detect.)
The way the octeon hardware works is that it expects the number of newly
queued packets to be written to the doorbell register, which effectively
increments the doorbell count; the hardware decrements it as it reads these
packets. So in essence, the doorbell count also indicates the outstanding
packets still to be read by hardware.
> The 'xmit_more' flag is useful if (the equivalent of) writing
> the doorbell register is expensive since it can be delayed
> to a later frame and only done once - adding a slight latency
> to the earlier transmits if the mac engine was idle.
>
> I'm not sure how much (if any) performance gain you actually
> get from avoiding the writel().
> Single PCIe writes are 'posted' and pretty much completely
> asynchronous.
Can you elaborate on what you are suggesting here? The driver is trying to
make use of the 'xmit_more' hint from the network stack, as any network
driver might opt to do. I think a PCIe post for every single packet would
still be wasteful, as the hardware can bulk read from the queue if we give
it a batch of packets.
> The other problem I've seen is that netdev_xmit_more() is
> the state of the queue when the transmit was started, not
> the current state.
> If a packet is added while the earlier transmit setup code
> is running (setting up the descriptors etc) then it isn't set.
> So the fast path doesn't get taken.
By the next packet the kernel sends, xmit_more should be set, as far as I
understand, right? (The xmit_more bool is set if skb->next is present, when
the transmit path follows dev_hard_start_xmit.)
From: Shinas Rasheed <[email protected]>
> Sent: 30 October 2023 14:15
>
> > Does that really need the count?
> > A 'doorbell' register usually just tells the MAC engine
> > to go and look at the transmit ring.
> > It then continues to process transmits until it fails
> > to find a packet.
> > So if the transmit is active you don't need to set the bit.
> > (Although that is actually rather hard to detect.)
>
> The way the octeon hardware works is that it expects the number of newly
> queued packets to be written to the doorbell register, which effectively
> increments the doorbell count; the hardware decrements it as it reads
> these packets. So in essence, the doorbell count also indicates the
> outstanding packets still to be read by hardware.
Unusual - I wouldn't call that a doorbell register.
> > The 'xmit_more' flag is useful if (the equivalent of) writing
> > the doorbell register is expensive since it can be delayed
> > to a later frame and only done once - adding a slight latency
> > to the earlier transmits if the mac engine was idle.
> >
> > I'm not sure how much (if any) performance gain you actually
> > get from avoiding the writel().
> > Single PCIe writes are 'posted' and pretty much completely
> > asynchronous.
>
> Can you elaborate what you are suggesting here to do? The driver is trying
> to make use of the 'xmit_more' hint from the network stack, as any network
> driver might opt to do.
There are some drivers where waking up the MAC engine is expensive.
If you need to do a PCIe read then they are expensive.
There might also be drivers that need to send a USB message.
I don't actually know which one it was added for.
> I think avoiding continuous PCIe posts for each packet shall still be wasteful
> as the hardware can bulk read from the queue if we give it a batch of packets.
If you do writes for every packet then the hardware can get on with
sending the first packet and might be able to do bulk reads
for the next packet(s) when that finishes.
The extra code you are adding could easily (waving hands)
be more expensive than the posted PCIe write.
(Especially if you have to add an atomic operation.)
Unless, of course, you have to wait for it to send that batch
of packets before you can give it any more.
Which would be rather entirely broken and would really require
you to do the write in the end-of-transmit path.
> > The other problem I've seen is that netdev_xmit_more() is
> > the state of the queue when the transmit was started, not
> > the current state.
> > If a packet is added while the earlier transmit setup code
> > is running (setting up the descriptors etc) then it isn't set.
> > So the fast path doesn't get taken.
>
> By the next packet the kernel sends, the xmit_more should be set
> as far I understand, right? (as the xmit_more bool is set if skb->next
> is present, if the transmit path follows dev_hard_start_xmit).
The loop is something like:
while (get_packet()) {
per_cpu->xmit_more = !queue_empty();
if (transmit_packet() != TX_OK)
break;
}
So if a packet is added while all the transmit setup code is running
it isn't detected.
I managed to repeatedly get that to loop when xmit_more wasn't set
and in a driver where the 'doorbell' write wasn't entirely trivial.
David
Hi David,
> -----Original Message-----
> From: David Laight <[email protected]>
> Sent: Monday, October 30, 2023 9:00 PM
>
> Unusual - I wouldn't call that a doorbell register.
I double checked in earlier implementations as well as the reference manuals.
This is how the hardware register is prescribed to be used.
> If you do writes for every packet then the hardware can get on with
> sending the first packet and might be able to do bulk reads
> for the next packet(s) when that finishes.
>
> The extra code you are adding could easily (waving hands)
> be more expensive than the posted PCIe write.
> (Especially if you have to add an atomic operation.)
>
> Unless, of course, you have to wait for it to send that batch
> of packets before you can give it any more.
> Which would be rather entirely broken and would really require
> you to do the write in the end-of-transmit path.
The atomic operation is replaced in the very next patch. Other than that,
what is it that you suggest we should 'fix' in this implementation of handling xmit_more?
> The loop is something like:
> while (get_packet()) {
> per_cpu->xmit_more = !queue_empty();
> if (transmit_packet() != TX_OK)
> break;
> }
> So if a packet is added while all the transmit setup code is running
> it isn't detected.
> I managed to repeatedly get that to loop when xmit_more wasn't set
> and in a driver where the 'doorbell' write wasn't entirely trivial.
How are you suggesting we handle or diagnose such a case in the driver? Can
you provide an example reference which adequately handles this special case?
When net-next opens again, I can submit the patches again with the changes
you suggested. Thanks for reviewing!
Shinas