There are some optimizations related to IO path.
Yunsheng Lin (6):
net: hns3: batch the page reference count updates
net: hns3: batch tx doorbell operation
net: hns3: optimize the tx clean process
net: hns3: optimize the rx clean process
net: hns3: use writel() to optimize the barrier operation
net: hns3: use napi_consume_skb() when cleaning tx desc
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 225 ++++++++++++---------
drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 20 +-
drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 8 +-
3 files changed, 141 insertions(+), 112 deletions(-)
--
2.7.4
From: Yunsheng Lin <[email protected]>
Currently HNS3_RING_RX_RING_FBDNUM_REG register is read to determine
how many rx desc can be cleaned. To avoid the register read operation
in the critical data path, use the valid bit in the rx desc to determine
if a specific rx desc can be cleaned.
The hns3 driver clear valid bit in the rx desc before notifying the
rx desc to the hw, and hw will only set the valid bit of the rx desc
after corresponding buffer is filled with packet data and other field
in the rx desc is set accordingly.
Add hns3_rx_ring_move_fw() function to clear the valid bit in the rx
desc before moving rx ring's next_to_clean forward to avoid double
cleaning a rx desc, also add a dma_rmb() barrier in hns3_handle_rx_bd()
to make sure valid bit is set before reading other field in the rx desc.
Signed-off-by: Yunsheng Lin <[email protected]>
Signed-off-by: Huazhong Tan <[email protected]>
---
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 61 +++++++++++++------------
1 file changed, 31 insertions(+), 30 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 2db6c03..8490754 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2845,6 +2845,16 @@ static bool hns3_parse_vlan_tag(struct hns3_enet_ring *ring,
}
}
+static void hns3_rx_ring_move_fw(struct hns3_enet_ring *ring)
+{
+ ring->desc[ring->next_to_clean].rx.bd_base_info &=
+ cpu_to_le32(~BIT(HNS3_RXD_VLD_B));
+ ring->next_to_clean += 1;
+
+ if (unlikely(ring->next_to_clean == ring->desc_num))
+ ring->next_to_clean = 0;
+}
+
static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
unsigned char *va)
{
@@ -2880,7 +2890,7 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
__page_frag_cache_drain(desc_cb->priv,
desc_cb->pagecnt_bias);
- ring_ptr_move_fw(ring, next_to_clean);
+ hns3_rx_ring_move_fw(ring);
return 0;
}
u64_stats_update_begin(&ring->syncp);
@@ -2891,7 +2901,7 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
__skb_put(skb, ring->pull_len);
hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len,
desc_cb);
- ring_ptr_move_fw(ring, next_to_clean);
+ hns3_rx_ring_move_fw(ring);
return 0;
}
@@ -2946,7 +2956,7 @@ static int hns3_add_frag(struct hns3_enet_ring *ring)
hns3_nic_reuse_page(skb, ring->frag_num++, ring, 0, desc_cb);
trace_hns3_rx_desc(ring);
- ring_ptr_move_fw(ring, next_to_clean);
+ hns3_rx_ring_move_fw(ring);
ring->pending_buf++;
} while (!(bd_base_info & BIT(HNS3_RXD_FE_B)));
@@ -3088,32 +3098,32 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring)
prefetch(desc);
- length = le16_to_cpu(desc->rx.size);
- bd_base_info = le32_to_cpu(desc->rx.bd_base_info);
+ if (!skb) {
+ bd_base_info = le32_to_cpu(desc->rx.bd_base_info);
+
+ /* Check valid BD */
+ if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B))))
+ return -ENXIO;
- /* Check valid BD */
- if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B))))
- return -ENXIO;
+ dma_rmb();
+ length = le16_to_cpu(desc->rx.size);
- if (!skb) {
ring->va = desc_cb->buf + desc_cb->page_offset;
dma_sync_single_for_cpu(ring_to_dev(ring),
desc_cb->dma + desc_cb->page_offset,
hns3_buf_size(ring),
DMA_FROM_DEVICE);
- }
- /* Prefetch first cache line of first page
- * Idea is to cache few bytes of the header of the packet. Our L1 Cache
- * line size is 64B so need to prefetch twice to make it 128B. But in
- * actual we can have greater size of caches with 128B Level 1 cache
- * lines. In such a case, single fetch would suffice to cache in the
- * relevant part of the header.
- */
- net_prefetch(ring->va);
+ /* Prefetch first cache line of first page.
+ * Idea is to cache few bytes of the header of the packet.
+ * Our L1 Cache line size is 64B so need to prefetch twice to make
+ * it 128B. But in actual we can have greater size of caches with
+ * 128B Level 1 cache lines. In such a case, single fetch would
+ * suffice to cache in the relevant part of the header.
+ */
+ net_prefetch(ring->va);
- if (!skb) {
ret = hns3_alloc_skb(ring, length, ring->va);
skb = ring->skb;
@@ -3153,19 +3163,11 @@ int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget,
#define RCB_NOF_ALLOC_RX_BUFF_ONCE 16
int unused_count = hns3_desc_unused(ring);
int recv_pkts = 0;
- int recv_bds = 0;
- int err, num;
+ int err;
- num = readl_relaxed(ring->tqp->io_base + HNS3_RING_RX_RING_FBDNUM_REG);
- num -= unused_count;
unused_count -= ring->pending_buf;
- if (num <= 0)
- goto out;
-
- rmb(); /* Make sure num taken effect before the other data is touched */
-
- while (recv_pkts < budget && recv_bds < num) {
+ while (recv_pkts < budget) {
/* Reuse or realloc buffers */
if (unused_count >= RCB_NOF_ALLOC_RX_BUFF_ONCE) {
hns3_nic_alloc_rx_buffers(ring, unused_count);
@@ -3183,7 +3185,6 @@ int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget,
recv_pkts++;
}
- recv_bds += ring->pending_buf;
unused_count += ring->pending_buf;
ring->skb = NULL;
ring->pending_buf = 0;
--
2.7.4
From: Yunsheng Lin <[email protected]>
Batch the page reference count updates instead of doing them
one at a time. By doing this we can improve the overall receive
performance by avoid some atomic increment operations when the
rx page is reused.
Signed-off-by: Yunsheng Lin <[email protected]>
Signed-off-by: Huazhong Tan <[email protected]>
---
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 32 ++++++++++++++++++-------
drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 1 +
2 files changed, 25 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 93825a4..3762142 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2302,6 +2302,8 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring,
cb->buf = page_address(p);
cb->length = hns3_page_size(ring);
cb->type = DESC_TYPE_PAGE;
+ page_ref_add(p, USHRT_MAX - 1);
+ cb->pagecnt_bias = USHRT_MAX;
return 0;
}
@@ -2311,8 +2313,8 @@ static void hns3_free_buffer(struct hns3_enet_ring *ring,
{
if (cb->type == DESC_TYPE_SKB)
dev_kfree_skb_any((struct sk_buff *)cb->priv);
- else if (!HNAE3_IS_TX_RING(ring))
- put_page((struct page *)cb->priv);
+ else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias)
+ __page_frag_cache_drain(cb->priv, cb->pagecnt_bias);
memset(cb, 0, sizeof(*cb));
}
@@ -2610,6 +2612,11 @@ static bool hns3_page_is_reusable(struct page *page)
!page_is_pfmemalloc(page);
}
+static bool hns3_can_reuse_page(struct hns3_desc_cb *cb)
+{
+ return (page_count(cb->priv) - cb->pagecnt_bias) == 1;
+}
+
static void hns3_nic_reuse_page(struct sk_buff *skb, int i,
struct hns3_enet_ring *ring, int pull_len,
struct hns3_desc_cb *desc_cb)
@@ -2618,6 +2625,7 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i,
int size = le16_to_cpu(desc->rx.size);
u32 truesize = hns3_buf_size(ring);
+ desc_cb->pagecnt_bias--;
skb_add_rx_frag(skb, i, desc_cb->priv, desc_cb->page_offset + pull_len,
size - pull_len, truesize);
@@ -2625,20 +2633,27 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i,
* when page_offset rollback to zero, flag default unreuse
*/
if (unlikely(!hns3_page_is_reusable(desc_cb->priv)) ||
- (!desc_cb->page_offset && page_count(desc_cb->priv) > 1))
+ (!desc_cb->page_offset && !hns3_can_reuse_page(desc_cb))) {
+ __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias);
return;
+ }
/* Move offset up to the next cache line */
desc_cb->page_offset += truesize;
if (desc_cb->page_offset + truesize <= hns3_page_size(ring)) {
desc_cb->reuse_flag = 1;
- /* Bump ref count on page before it is given */
- get_page(desc_cb->priv);
- } else if (page_count(desc_cb->priv) == 1) {
+ } else if (hns3_can_reuse_page(desc_cb)) {
desc_cb->reuse_flag = 1;
desc_cb->page_offset = 0;
- get_page(desc_cb->priv);
+ } else if (desc_cb->pagecnt_bias) {
+ __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias);
+ return;
+ }
+
+ if (unlikely(!desc_cb->pagecnt_bias)) {
+ page_ref_add(desc_cb->priv, USHRT_MAX);
+ desc_cb->pagecnt_bias = USHRT_MAX;
}
}
@@ -2846,7 +2861,8 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
if (likely(hns3_page_is_reusable(desc_cb->priv)))
desc_cb->reuse_flag = 1;
else /* This page cannot be reused so discard it */
- put_page(desc_cb->priv);
+ __page_frag_cache_drain(desc_cb->priv,
+ desc_cb->pagecnt_bias);
ring_ptr_move_fw(ring, next_to_clean);
return 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 98ca6ea..8f784094 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -287,6 +287,7 @@ struct hns3_desc_cb {
/* desc type, used by the ring user to mark the type of the priv data */
u16 type;
+ u16 pagecnt_bias;
};
enum hns3_pkt_l3type {
--
2.7.4