From: Sunil Goutham <[email protected]>
The patches below attempt to improve performance by reducing the
number of atomic operations performed while allocating new receive buffers,
and by rearranging the nicvf structure's elements to reduce cache misses.
Sunil Goutham (2):
net: thunderx: Set receive buffer page usage count in bulk
net: thunderx: Adjust nicvf structure to reduce cache misses
drivers/net/ethernet/cavium/thunder/nic.h | 51 ++++++++++++--------
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 31 +++++++++---
2 files changed, 53 insertions(+), 29 deletions(-)
From: Sunil Goutham <[email protected]>
Instead of calling get_page() for every receive buffer carved out
of page, set page's usage count at the end, to reduce no of atomic
calls.
Signed-off-by: Sunil Goutham <[email protected]>
---
drivers/net/ethernet/cavium/thunder/nic.h | 1 +
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 31 ++++++++++++++-----
2 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 00cc915..5628aea 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -285,6 +285,7 @@ struct nicvf {
u32 speed;
struct page *rb_page;
u32 rb_page_offset;
+ u16 rb_pageref;
bool rb_alloc_fail;
bool rb_work_scheduled;
struct delayed_work rbdr_work;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 0dd1abf..fa05e34 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -18,6 +18,15 @@
#include "q_struct.h"
#include "nicvf_queues.h"
+static void nicvf_get_page(struct nicvf *nic)
+{
+ if (!nic->rb_pageref || !nic->rb_page)
+ return;
+
+ atomic_add(nic->rb_pageref, &nic->rb_page->_count);
+ nic->rb_pageref = 0;
+}
+
/* Poll a register for a specific value */
static int nicvf_poll_reg(struct nicvf *nic, int qidx,
u64 reg, int bit_pos, int bits, int val)
@@ -81,16 +90,15 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp,
int order = (PAGE_SIZE <= 4096) ? PAGE_ALLOC_COSTLY_ORDER : 0;
/* Check if request can be accomodated in previous allocated page */
- if (nic->rb_page) {
- if ((nic->rb_page_offset + buf_len + buf_len) >
- (PAGE_SIZE << order)) {
- nic->rb_page = NULL;
- } else {
- nic->rb_page_offset += buf_len;
- get_page(nic->rb_page);
- }
+ if (nic->rb_page &&
+ ((nic->rb_page_offset + buf_len) < (PAGE_SIZE << order))) {
+ nic->rb_pageref++;
+ goto ret;
}
+ nicvf_get_page(nic);
+ nic->rb_page = NULL;
+
/* Allocate a new page */
if (!nic->rb_page) {
nic->rb_page = alloc_pages(gfp | __GFP_COMP | __GFP_NOWARN,
@@ -102,7 +110,9 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp,
nic->rb_page_offset = 0;
}
+ret:
*rbuf = (u64 *)((u64)page_address(nic->rb_page) + nic->rb_page_offset);
+ nic->rb_page_offset += buf_len;
return 0;
}
@@ -158,6 +168,9 @@ static int nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr,
desc = GET_RBDR_DESC(rbdr, idx);
desc->buf_addr = virt_to_phys(rbuf) >> NICVF_RCV_BUF_ALIGN;
}
+
+ nicvf_get_page(nic);
+
return 0;
}
@@ -241,6 +254,8 @@ refill:
new_rb++;
}
+ nicvf_get_page(nic);
+
/* make sure all memory stores are done before ringing doorbell */
smp_wmb();
--
1.7.1
From: Sunil Goutham <[email protected]>
Adjusted the nicvf structure such that all elements used in the hot
path, like napi, xmit, etc., fall into the same cache line. This reduced
the number of cache misses and resulted in a ~2% increase in the number
of packets handled on a core.
Also modified elements with :1 notation to boolean, to be
consistent with other element definitions.
Signed-off-by: Sunil Goutham <[email protected]>
---
drivers/net/ethernet/cavium/thunder/nic.h | 52 ++++++++++++++++------------
1 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 5628aea..c063d92 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -263,46 +263,54 @@ struct nicvf {
struct nicvf *pnicvf;
struct net_device *netdev;
struct pci_dev *pdev;
+ void __iomem *reg_base;
+ struct queue_set *qs;
+ struct nicvf_cq_poll *napi[8];
u8 vf_id;
- u8 node;
- u8 tns_mode:1;
- u8 sqs_mode:1;
- u8 loopback_supported:1;
+ u8 sqs_id;
+ bool sqs_mode;
bool hw_tso;
- u16 mtu;
- struct queue_set *qs;
+
+ /* Receive buffer alloc */
+ u32 rb_page_offset;
+ u16 rb_pageref;
+ bool rb_alloc_fail;
+ bool rb_work_scheduled;
+ struct page *rb_page;
+ struct delayed_work rbdr_work;
+ struct tasklet_struct rbdr_task;
+
+ /* Secondary Qset */
+ u8 sqs_count;
#define MAX_SQS_PER_VF_SINGLE_NODE 5
#define MAX_SQS_PER_VF 11
- u8 sqs_id;
- u8 sqs_count; /* Secondary Qset count */
struct nicvf *snicvf[MAX_SQS_PER_VF];
+
+ /* Queue count */
u8 rx_queues;
u8 tx_queues;
u8 max_queues;
- void __iomem *reg_base;
+
+ u8 node;
+ u8 cpi_alg;
+ u16 mtu;
bool link_up;
u8 duplex;
u32 speed;
- struct page *rb_page;
- u32 rb_page_offset;
- u16 rb_pageref;
- bool rb_alloc_fail;
- bool rb_work_scheduled;
- struct delayed_work rbdr_work;
- struct tasklet_struct rbdr_task;
- struct tasklet_struct qs_err_task;
- struct tasklet_struct cq_task;
- struct nicvf_cq_poll *napi[8];
+ bool tns_mode;
+ bool loopback_supported;
struct nicvf_rss_info rss_info;
- u8 cpi_alg;
+ struct tasklet_struct qs_err_task;
+ struct work_struct reset_task;
+
/* Interrupt coalescing settings */
u32 cq_coalesce_usecs;
-
u32 msg_enable;
+
+ /* Stats */
struct nicvf_hw_stats hw_stats;
struct nicvf_drv_stats drv_stats;
struct bgx_stats bgx_stats;
- struct work_struct reset_task;
/* MSI-X */
bool msix_enabled;
--
1.7.1
From: [email protected]
Date: Mon, 7 Mar 2016 13:05:56 +0530
> From: Sunil Goutham <[email protected]>
>
> Instead of calling get_page() for every receive buffer carved out
> of page, set page's usage count at the end, to reduce no of atomic
> calls.
>
> Signed-off-by: Sunil Goutham <[email protected]>
> ---
> drivers/net/ethernet/cavium/thunder/nic.h | 1 +
> drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 31 ++++++++++++++-----
> 2 files changed, 24 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
> index 00cc915..5628aea 100644
> --- a/drivers/net/ethernet/cavium/thunder/nic.h
> +++ b/drivers/net/ethernet/cavium/thunder/nic.h
> @@ -285,6 +285,7 @@ struct nicvf {
> u32 speed;
> struct page *rb_page;
> u32 rb_page_offset;
> + u16 rb_pageref;
> bool rb_alloc_fail;
> bool rb_work_scheduled;
> struct delayed_work rbdr_work;
> diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
> index 0dd1abf..fa05e34 100644
> --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
> +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
> @@ -18,6 +18,15 @@
> #include "q_struct.h"
> #include "nicvf_queues.h"
>
> +static void nicvf_get_page(struct nicvf *nic)
> +{
> + if (!nic->rb_pageref || !nic->rb_page)
> + return;
> +
> + atomic_add(nic->rb_pageref, &nic->rb_page->_count);
> + nic->rb_pageref = 0;
> +}
> +
> /* Poll a register for a specific value */
> static int nicvf_poll_reg(struct nicvf *nic, int qidx,
> u64 reg, int bit_pos, int bits, int val)
> @@ -81,16 +90,15 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp,
> int order = (PAGE_SIZE <= 4096) ? PAGE_ALLOC_COSTLY_ORDER : 0;
>
> /* Check if request can be accomodated in previous allocated page */
> - if (nic->rb_page) {
> - if ((nic->rb_page_offset + buf_len + buf_len) >
> - (PAGE_SIZE << order)) {
> - nic->rb_page = NULL;
> - } else {
> - nic->rb_page_offset += buf_len;
> - get_page(nic->rb_page);
> - }
> + if (nic->rb_page &&
> + ((nic->rb_page_offset + buf_len) < (PAGE_SIZE << order))) {
> + nic->rb_pageref++;
> + goto ret;
> }
I do not see how this can sanely work.
By deferring the atomic increment of the page count, you create a window of
time during which the consumer can release the page and prematurely free it.
I'm not applying this, as it looks extremely buggy. Sorry.
Hi David,
>> you create a window of time during which the consumer
>> can release the page and prematurely free it.
Okay, but here the consumer i.e HW is notified only after page count
is incremented.
For example if you check 'nicvf_refill_rbdr' fn() only after receive
buffer ring is
refilled with buffers, doorbell is issued to hardware to start using
the new buffers.
So
@@ -241,6 +254,8 @@ refill:
new_rb++;
}
+ nicvf_get_page(nic);
+
calling 'nicvf_get_page' before issuing doorbell ensures page ref
count is properly
set before hw/consumer can use the buffers.
Thanks,
Sunil.
From: Sunil Kovvuri <[email protected]>
Date: Mon, 7 Mar 2016 22:28:39 +0530
> Hi David,
>
>>> you create a window of time during which the consumer
>>> can release the page and prematurely free it.
> Okay, but here the consumer i.e HW is notified only after page count
> is incremented.
> For example if you check 'nicvf_refill_rbdr' fn() only after receive
> buffer ring is
> refilled with buffers, doorbell is issued to hardware to start using
> the new buffers.
>
> So
> @@ -241,6 +254,8 @@ refill:
> new_rb++;
> }
>
> + nicvf_get_page(nic);
> +
> calling 'nicvf_get_page' before issuing doorbell ensures page ref
> count is properly
> set before hw/consumer can use the buffers.
So if you know ahead of time how the page will be split up, just
calculate that when you get the page and increment the page count
appropriately.
That's what we do in the NIU driver.
On Mon, Mar 7, 2016 at 10:34 PM, David Miller <[email protected]> wrote:
> From: Sunil Kovvuri <[email protected]>
> Date: Mon, 7 Mar 2016 22:28:39 +0530
>
>> Hi David,
>>
>>>> you create a window of time during which the consumer
>>>> can release the page and prematurely free it.
>> Okay, but here the consumer i.e HW is notified only after page count
>> is incremented.
>> For example if you check 'nicvf_refill_rbdr' fn() only after receive
>> buffer ring is
>> refilled with buffers, doorbell is issued to hardware to start using
>> the new buffers.
>>
>> So
>> @@ -241,6 +254,8 @@ refill:
>> new_rb++;
>> }
>>
>> + nicvf_get_page(nic);
>> +
>> calling 'nicvf_get_page' before issuing doorbell ensures page ref
>> count is properly
>> set before hw/consumer can use the buffers.
>
> So if you know ahead of time how the page will be split up, just
> calculate that when you get the page and increment the page count
> appropriately.
>
> That's what we do in the NIU driver.
Thanks for the suggestion, will check and get back.
Regards,
Sunil.
Hi David,
>> So if you know ahead of time how the page will be split up, just
>> calculate that when you get the page and increment the page count
>> appropriately.
>>
>> That's what we do in the NIU driver.
>
> Thanks for the suggestion, will check and get back.
>
I looked at the NIU driver and in fn() niu_rbr_refill()
static void niu_rbr_refill(struct niu *np, struct rx_ring_info *rp, gfp_t mask)
{
int index = rp->rbr_index;
rp->rbr_pending++;
if ((rp->rbr_pending % rp->rbr_blocks_per_page) == 0) {
Here it is checked whether rbr_pending is an exact multiple of the page
split count, and hence updating the page count based on a fixed
calculation is correct.
On my platform the driver receives an interrupt when the free buffer count
falls below a threshold,
and by the time SW reads the count of buffers to be refilled it can be any
number, i.e. it
may or may not be an exact multiple of the page split count.
If a fixed calculation were used, then when the interface is being brought
down and unused receive
buffers are freed, one page's reference count would be wrong and that page
would disappear completely. In the
patch I am
updating page->count before issuing the doorbell to HW, irrespective of
whether the full page is utilised
or not. So there is no mismatch while doing put_page() on an unused buffer.
Thanks,
Sunil.
From: Sunil Kovvuri <[email protected]>
Date: Thu, 10 Mar 2016 16:13:28 +0530
> Hi David,
>
>
>>> So if you know ahead of time how the page will be split up, just
>>> calculate that when you get the page and increment the page count
>>> appropriately.
>>>
>>> That's what we do in the NIU driver.
>>
>> Thanks for the suggestion, will check and get back.
>>
>
> I looked at the NIU driver and in fn() niu_rbr_refill()
> static void niu_rbr_refill(struct niu *np, struct rx_ring_info *rp, gfp_t mask)
> {
> int index = rp->rbr_index;
>
> rp->rbr_pending++;
> if ((rp->rbr_pending % rp->rbr_blocks_per_page) == 0) {
>
> Here it's been checked whether rbr_pending is a exact multiple of page
> split count.
> And hence updating page count based on fixed calculation is right.
>
> On my platform driver receives a interrupt when free buffer count
> falls below a threshold
> and by the time SW reads count of buffers to be refilled it can be any
> number i.e
> may or may not be a exact multiple of page split count.
So calculate the modulus on the page split count and optimize the
increment ahead of time when possible, and for the sub page split
pieces do it one at a time.
I don't understand what the problem is.
>
> So calculate the modulus on the page split count and optimize the
> increment ahead of time when possible, and for the sub page split
> pieces do it one at a time.
>
Patch does almost the same with a negligible overhead of a counter
for page->_count increment at a later time but still before HW starts
using buffers.
Difference between NIU driver and this patch is there it's
calculate split count, increment page count and then divide page into
buffers. Here it's divide page into buffers, have a counter which increments
at every split and then at the end do a atomic increment of page->_count.
Any issue with this approach ?
Thanks,
Sunil.
From: Sunil Kovvuri <[email protected]>
Date: Thu, 10 Mar 2016 23:57:48 +0530
> Difference between NIU driver and this patch is there it's
> calculate split count, increment page count and then divide page into
> buffers. Here it's divide page into buffers, have a counter which increments
> at every split and then at the end do a atomic increment of page->_count.
>
> Any issue with this approach ?
I guess not.
On Fri, Mar 11, 2016 at 1:18 AM, David Miller <[email protected]> wrote:
> From: Sunil Kovvuri <[email protected]>
> Date: Thu, 10 Mar 2016 23:57:48 +0530
>
>> Difference between NIU driver and this patch is there it's
>> calculate split count, increment page count and then divide page into
>> buffers. Here it's divide page into buffers, have a counter which increments
>> at every split and then at the end do a atomic increment of page->_count.
>>
>> Any issue with this approach ?
>
> I guess not.
Okay, so can I assume you will consider the patch for merging if no
other comments
are received? Or do you want me to resubmit the patches afresh?
From: Sunil Kovvuri <[email protected]>
Date: Fri, 11 Mar 2016 22:34:19 +0530
> On Fri, Mar 11, 2016 at 1:18 AM, David Miller <[email protected]> wrote:
>> From: Sunil Kovvuri <[email protected]>
>> Date: Thu, 10 Mar 2016 23:57:48 +0530
>>
>>> Difference between NIU driver and this patch is there it's
>>> calculate split count, increment page count and then divide page into
>>> buffers. Here it's divide page into buffers, have a counter which increments
>>> at every split and then at the end do a atomic increment of page->_count.
>>>
>>> Any issue with this approach ?
>>
>> I guess not.
>
> Okay, so can i assume you will consider the patch for merging if no
> other comments
> are received. Or do you want me to resubmit patches a fresh. ?
Please resubmit, thanks a lot.