From: Sunil Goutham <[email protected]>
Below patches attempts to improve performance by reducing
no of atomic operations while allocating new receive buffers
and reducing cache misses by adjusting nicvf structure elements.
Changes from v1:
No changes, resubmitting a fresh as per David's suggestion.
Sunil Goutham (2):
net: thunderx: Set recevie buffer page usage count in bulk
net: thunderx: Adjust nicvf structure to reduce cache misses
drivers/net/ethernet/cavium/thunder/nic.h | 51 ++++++++++++--------
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 31 +++++++++---
2 files changed, 53 insertions(+), 29 deletions(-)
From: Sunil Goutham <[email protected]>
Instead of calling get_page() for every receive buffer carved out
of page, set page's usage count at the end, to reduce no of atomic
calls.
Signed-off-by: Sunil Goutham <[email protected]>
---
drivers/net/ethernet/cavium/thunder/nic.h | 1 +
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 31 ++++++++++++++-----
2 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 092f097..872b22d 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -294,6 +294,7 @@ struct nicvf {
u32 speed;
struct page *rb_page;
u32 rb_page_offset;
+ u16 rb_pageref;
bool rb_alloc_fail;
bool rb_work_scheduled;
struct delayed_work rbdr_work;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 0dd1abf..fa05e34 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -18,6 +18,15 @@
#include "q_struct.h"
#include "nicvf_queues.h"
+static void nicvf_get_page(struct nicvf *nic)
+{
+ if (!nic->rb_pageref || !nic->rb_page)
+ return;
+
+ atomic_add(nic->rb_pageref, &nic->rb_page->_count);
+ nic->rb_pageref = 0;
+}
+
/* Poll a register for a specific value */
static int nicvf_poll_reg(struct nicvf *nic, int qidx,
u64 reg, int bit_pos, int bits, int val)
@@ -81,16 +90,15 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp,
int order = (PAGE_SIZE <= 4096) ? PAGE_ALLOC_COSTLY_ORDER : 0;
/* Check if request can be accomodated in previous allocated page */
- if (nic->rb_page) {
- if ((nic->rb_page_offset + buf_len + buf_len) >
- (PAGE_SIZE << order)) {
- nic->rb_page = NULL;
- } else {
- nic->rb_page_offset += buf_len;
- get_page(nic->rb_page);
- }
+ if (nic->rb_page &&
+ ((nic->rb_page_offset + buf_len) < (PAGE_SIZE << order))) {
+ nic->rb_pageref++;
+ goto ret;
}
+ nicvf_get_page(nic);
+ nic->rb_page = NULL;
+
/* Allocate a new page */
if (!nic->rb_page) {
nic->rb_page = alloc_pages(gfp | __GFP_COMP | __GFP_NOWARN,
@@ -102,7 +110,9 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp,
nic->rb_page_offset = 0;
}
+ret:
*rbuf = (u64 *)((u64)page_address(nic->rb_page) + nic->rb_page_offset);
+ nic->rb_page_offset += buf_len;
return 0;
}
@@ -158,6 +168,9 @@ static int nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
desc = GET_RBDR_DESC(rbdr, idx);
desc->buf_addr = virt_to_phys(rbuf) >> NICVF_RCV_BUF_ALIGN;
}
+
+ nicvf_get_page(nic);
+
return 0;
}
@@ -241,6 +254,8 @@ refill:
new_rb++;
}
+ nicvf_get_page(nic);
+
/* make sure all memory stores are done before ringing doorbell */
smp_wmb();
--
1.7.1
From: Sunil Goutham <[email protected]>
Adjusted nicvf structure such that all elements used in hot
path like napi, xmit e.t.c fall into same cache line. This reduced
no of cache misses and resulted in ~2% increase in no of packets
handled on a core.
Also modified elements with :1 notation to boolean, to be
consistent with other element definitions.
Signed-off-by: Sunil Goutham <[email protected]>
---
drivers/net/ethernet/cavium/thunder/nic.h | 52 ++++++++++++++++------------
1 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 872b22d..83025bb 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -272,46 +272,54 @@ struct nicvf {
struct nicvf *pnicvf;
struct net_device *netdev;
struct pci_dev *pdev;
+ void __iomem *reg_base;
+ struct queue_set *qs;
+ struct nicvf_cq_poll *napi[8];
u8 vf_id;
- u8 node;
- u8 tns_mode:1;
- u8 sqs_mode:1;
- u8 loopback_supported:1;
+ u8 sqs_id;
+ bool sqs_mode;
bool hw_tso;
- u16 mtu;
- struct queue_set *qs;
+
+ /* Receive buffer alloc */
+ u32 rb_page_offset;
+ u16 rb_pageref;
+ bool rb_alloc_fail;
+ bool rb_work_scheduled;
+ struct page *rb_page;
+ struct delayed_work rbdr_work;
+ struct tasklet_struct rbdr_task;
+
+ /* Secondary Qset */
+ u8 sqs_count;
#define MAX_SQS_PER_VF_SINGLE_NODE 5
#define MAX_SQS_PER_VF 11
- u8 sqs_id;
- u8 sqs_count; /* Secondary Qset count */
struct nicvf *snicvf[MAX_SQS_PER_VF];
+
+ /* Queue count */
u8 rx_queues;
u8 tx_queues;
u8 max_queues;
- void __iomem *reg_base;
+
+ u8 node;
+ u8 cpi_alg;
+ u16 mtu;
bool link_up;
u8 duplex;
u32 speed;
- struct page *rb_page;
- u32 rb_page_offset;
- u16 rb_pageref;
- bool rb_alloc_fail;
- bool rb_work_scheduled;
- struct delayed_work rbdr_work;
- struct tasklet_struct rbdr_task;
- struct tasklet_struct qs_err_task;
- struct tasklet_struct cq_task;
- struct nicvf_cq_poll *napi[8];
+ bool tns_mode;
+ bool loopback_supported;
struct nicvf_rss_info rss_info;
- u8 cpi_alg;
+ struct tasklet_struct qs_err_task;
+ struct work_struct reset_task;
+
/* Interrupt coalescing settings */
u32 cq_coalesce_usecs;
-
u32 msg_enable;
+
+ /* Stats */
struct nicvf_hw_stats hw_stats;
struct nicvf_drv_stats drv_stats;
struct bgx_stats bgx_stats;
- struct work_struct reset_task;
/* MSI-X */
bool msix_enabled;
--
1.7.1
From: [email protected]
Date: Mon, 14 Mar 2016 16:36:13 +0530
> Below patches attempts to improve performance by reducing
> no of atomic operations while allocating new receive buffers
> and reducing cache misses by adjusting nicvf structure elements.
>
> Changes from v1:
> No changes, resubmitting a fresh as per David's suggestion.
Series applied, thanks.