2002-12-07 22:31:17

by Jeff Garzik

[permalink] [raw]
Subject: [RFC][PATCH] net drivers and cache alignment

===== drivers/net/tg3.c 1.41 vs edited =====
--- 1.41/drivers/net/tg3.c Wed Nov 20 00:49:23 2002
+++ edited/drivers/net/tg3.c Sat Dec 7 17:12:38 2002
@@ -25,6 +25,7 @@
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/tcp.h>
+#include <linux/cache.h>

#include <asm/system.h>
#include <asm/io.h>
===== drivers/net/tg3.h 1.19 vs edited =====
--- 1.19/drivers/net/tg3.h Mon Nov 11 05:27:52 2002
+++ edited/drivers/net/tg3.h Sat Dec 7 17:20:25 2002
@@ -1741,17 +1741,66 @@
* necessary for acquisition of 'tx_lock'.
*/
spinlock_t lock;
- spinlock_t tx_lock;
+ spinlock_t indirect_lock;
+
+ unsigned long regs;
+ struct net_device *dev;
+ struct pci_dev *pdev;
+
+ struct tg3_hw_status *hw_status;
+ dma_addr_t status_mapping;
+
+ u32 msg_enable;
+
+ /* end "general, frequently-used members" cacheline section */
+ ALIGNED_PAD(_pad1_)

u32 tx_prod;
u32 tx_cons;
+ u32 tx_pending;
+
+ spinlock_t tx_lock;
+
+ /* TX descs are only used if TG3_FLAG_HOST_TXDS is set. */
+ struct tg3_tx_buffer_desc *tx_ring;
+ struct tx_ring_info *tx_buffers;
+ dma_addr_t tx_desc_mapping;
+
+ /* end "tx thread" cacheline section */
+ ALIGNED_PAD(_pad2_)
+
u32 rx_rcb_ptr;
u32 rx_std_ptr;
u32 rx_jumbo_ptr;
#if TG3_MINI_RING_WORKS
u32 rx_mini_ptr;
#endif
- spinlock_t indirect_lock;
+ u32 rx_pending;
+#if TG3_MINI_RING_WORKS
+ u32 rx_mini_pending;
+#endif
+ u32 rx_jumbo_pending;
+#if TG3_VLAN_TAG_USED
+ struct vlan_group *vlgrp;
+#endif
+
+ struct tg3_rx_buffer_desc *rx_std;
+ struct ring_info *rx_std_buffers;
+ dma_addr_t rx_std_mapping;
+#if TG3_MINI_RING_WORKS
+ struct tg3_rx_buffer_desc *rx_mini;
+ struct ring_info *rx_mini_buffers;
+ dma_addr_t rx_mini_mapping;
+#endif
+ struct tg3_rx_buffer_desc *rx_jumbo;
+ struct ring_info *rx_jumbo_buffers;
+ dma_addr_t rx_jumbo_mapping;
+
+ struct tg3_rx_buffer_desc *rx_rcb;
+ dma_addr_t rx_rcb_mapping;
+
+ /* end "rx thread" cacheline section */
+ ALIGNED_PAD(_pad3_)

struct net_device_stats net_stats;
struct net_device_stats net_stats_prev;
@@ -1791,8 +1840,6 @@
#define TG3_FLAG_SPLIT_MODE 0x40000000
#define TG3_FLAG_INIT_COMPLETE 0x80000000

- u32 msg_enable;
-
u32 split_mode_max_reqs;
#define SPLIT_MODE_5704_MAX_REQ 3

@@ -1806,13 +1853,6 @@
struct tg3_link_config link_config;
struct tg3_bufmgr_config bufmgr_config;

- u32 rx_pending;
-#if TG3_MINI_RING_WORKS
- u32 rx_mini_pending;
-#endif
- u32 rx_jumbo_pending;
- u32 tx_pending;
-
/* cache h/w values, often passed straight to h/w */
u32 rx_mode;
u32 tx_mode;
@@ -1865,38 +1905,10 @@
(X) == PHY_ID_BCM5703 || (X) == PHY_ID_BCM5704 || \
(X) == PHY_ID_BCM8002 || (X) == PHY_ID_SERDES)

- unsigned long regs;
- struct pci_dev *pdev;
- struct net_device *dev;
-#if TG3_VLAN_TAG_USED
- struct vlan_group *vlgrp;
-#endif
-
- struct tg3_rx_buffer_desc *rx_std;
- struct ring_info *rx_std_buffers;
- dma_addr_t rx_std_mapping;
-#if TG3_MINI_RING_WORKS
- struct tg3_rx_buffer_desc *rx_mini;
- struct ring_info *rx_mini_buffers;
- dma_addr_t rx_mini_mapping;
-#endif
- struct tg3_rx_buffer_desc *rx_jumbo;
- struct ring_info *rx_jumbo_buffers;
- dma_addr_t rx_jumbo_mapping;
-
- struct tg3_rx_buffer_desc *rx_rcb;
- dma_addr_t rx_rcb_mapping;
-
- /* TX descs are only used if TG3_FLAG_HOST_TXDS is set. */
- struct tg3_tx_buffer_desc *tx_ring;
- struct tx_ring_info *tx_buffers;
- dma_addr_t tx_desc_mapping;
-
- struct tg3_hw_status *hw_status;
- dma_addr_t status_mapping;
-
struct tg3_hw_stats *hw_stats;
dma_addr_t stats_mapping;
+
+ /* end "everything else" cacheline(s) section */
};

#endif /* !(_T3_H) */
===== include/linux/cache.h 1.5 vs edited =====
--- 1.5/include/linux/cache.h Tue Aug 27 16:04:10 2002
+++ edited/include/linux/cache.h Sat Dec 7 17:12:13 2002
@@ -53,4 +53,14 @@
#endif
#endif

+/* helper used inside struct definitions to break up struct at
+ * cacheline boundaries.
+ * Note: do not add a semi-colon (';') after an ALIGNED_PAD use.
+ */
+struct dummy_cacheline_struct {
+ int x;
+} ____cacheline_aligned;
+#define ALIGNED_PAD(name) \
+ struct dummy_cacheline_struct name;
+
#endif /* __LINUX_CACHE_H */


Attachments:
patch (4.13 kB)

2002-12-07 22:35:40

by David Miller

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment


Can't the cacheline_aligned attribute be applied to individual
struct members? I remember doing this for thread_struct on
sparc ages ago.

2002-12-07 22:39:34

by Jeff Garzik

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

David S. Miller wrote:
> Can't the cacheline_aligned attribute be applied to individual
> struct members? I remember doing this for thread_struct on
> sparc ages ago.


I was hoping someone who knows gcc better than me knew that, and would
speak up ;-)


2002-12-07 22:59:19

by Jeff Garzik

[permalink] [raw]
Subject: [RFC][PATCH] net drivers and cache alignment

===== drivers/net/tg3.c 1.41 vs edited =====
--- 1.41/drivers/net/tg3.c Wed Nov 20 00:49:23 2002
+++ edited/drivers/net/tg3.c Sat Dec 7 17:12:38 2002
@@ -25,6 +25,7 @@
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/tcp.h>
+#include <linux/cache.h>

#include <asm/system.h>
#include <asm/io.h>
===== drivers/net/tg3.h 1.19 vs edited =====
--- 1.19/drivers/net/tg3.h Mon Nov 11 05:27:52 2002
+++ edited/drivers/net/tg3.h Sat Dec 7 18:01:08 2002
@@ -1728,6 +1728,8 @@
};

struct tg3 {
+ /* begin "general, frequently-used members" cacheline section */
+
/* SMP locking strategy:
*
* lock: Held during all operations except TX packet
@@ -1740,20 +1742,63 @@
* be disabled to take 'lock' but only softirq disabling is
* necessary for acquisition of 'tx_lock'.
*/
- spinlock_t lock;
- spinlock_t tx_lock;
+ spinlock_t lock ____cacheline_aligned;
+ spinlock_t indirect_lock;

- u32 tx_prod;
+ unsigned long regs;
+ struct net_device *dev;
+ struct pci_dev *pdev;
+
+ struct tg3_hw_status *hw_status;
+ dma_addr_t status_mapping;
+
+ u32 msg_enable;
+
+ /* begin "tx thread" cacheline section */
+ u32 tx_prod ____cacheline_aligned;
u32 tx_cons;
- u32 rx_rcb_ptr;
+ u32 tx_pending;
+
+ spinlock_t tx_lock;
+
+ /* TX descs are only used if TG3_FLAG_HOST_TXDS is set. */
+ struct tg3_tx_buffer_desc *tx_ring;
+ struct tx_ring_info *tx_buffers;
+ dma_addr_t tx_desc_mapping;
+
+ /* begin "rx thread" cacheline section */
+ u32 rx_rcb_ptr ____cacheline_aligned;
u32 rx_std_ptr;
u32 rx_jumbo_ptr;
#if TG3_MINI_RING_WORKS
u32 rx_mini_ptr;
#endif
- spinlock_t indirect_lock;
+ u32 rx_pending;
+#if TG3_MINI_RING_WORKS
+ u32 rx_mini_pending;
+#endif
+ u32 rx_jumbo_pending;
+#if TG3_VLAN_TAG_USED
+ struct vlan_group *vlgrp;
+#endif
+
+ struct tg3_rx_buffer_desc *rx_std;
+ struct ring_info *rx_std_buffers;
+ dma_addr_t rx_std_mapping;
+#if TG3_MINI_RING_WORKS
+ struct tg3_rx_buffer_desc *rx_mini;
+ struct ring_info *rx_mini_buffers;
+ dma_addr_t rx_mini_mapping;
+#endif
+ struct tg3_rx_buffer_desc *rx_jumbo;
+ struct ring_info *rx_jumbo_buffers;
+ dma_addr_t rx_jumbo_mapping;

- struct net_device_stats net_stats;
+ struct tg3_rx_buffer_desc *rx_rcb;
+ dma_addr_t rx_rcb_mapping;
+
+ /* begin "everything else" cacheline(s) section */
+ struct net_device_stats net_stats ____cacheline_aligned;
struct net_device_stats net_stats_prev;
unsigned long phy_crc_errors;

@@ -1791,8 +1836,6 @@
#define TG3_FLAG_SPLIT_MODE 0x40000000
#define TG3_FLAG_INIT_COMPLETE 0x80000000

- u32 msg_enable;
-
u32 split_mode_max_reqs;
#define SPLIT_MODE_5704_MAX_REQ 3

@@ -1806,13 +1849,6 @@
struct tg3_link_config link_config;
struct tg3_bufmgr_config bufmgr_config;

- u32 rx_pending;
-#if TG3_MINI_RING_WORKS
- u32 rx_mini_pending;
-#endif
- u32 rx_jumbo_pending;
- u32 tx_pending;
-
/* cache h/w values, often passed straight to h/w */
u32 rx_mode;
u32 tx_mode;
@@ -1864,36 +1900,6 @@
(X) == PHY_ID_BCM5411 || (X) == PHY_ID_BCM5701 || \
(X) == PHY_ID_BCM5703 || (X) == PHY_ID_BCM5704 || \
(X) == PHY_ID_BCM8002 || (X) == PHY_ID_SERDES)
-
- unsigned long regs;
- struct pci_dev *pdev;
- struct net_device *dev;
-#if TG3_VLAN_TAG_USED
- struct vlan_group *vlgrp;
-#endif
-
- struct tg3_rx_buffer_desc *rx_std;
- struct ring_info *rx_std_buffers;
- dma_addr_t rx_std_mapping;
-#if TG3_MINI_RING_WORKS
- struct tg3_rx_buffer_desc *rx_mini;
- struct ring_info *rx_mini_buffers;
- dma_addr_t rx_mini_mapping;
-#endif
- struct tg3_rx_buffer_desc *rx_jumbo;
- struct ring_info *rx_jumbo_buffers;
- dma_addr_t rx_jumbo_mapping;
-
- struct tg3_rx_buffer_desc *rx_rcb;
- dma_addr_t rx_rcb_mapping;
-
- /* TX descs are only used if TG3_FLAG_HOST_TXDS is set. */
- struct tg3_tx_buffer_desc *tx_ring;
- struct tx_ring_info *tx_buffers;
- dma_addr_t tx_desc_mapping;
-
- struct tg3_hw_status *hw_status;
- dma_addr_t status_mapping;

struct tg3_hw_stats *hw_stats;
dma_addr_t stats_mapping;


Attachments:
patch (3.98 kB)

2002-12-07 23:21:45

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

Jeff Garzik wrote:
>
> David S. Miller wrote:
> > Can't the cacheline_aligned attribute be applied to individual
> > struct members? I remember doing this for thread_struct on
> > sparc ages ago.
>
> Looks like it from the 2.4 processor.h code.
>
> Attached is cut #2. Thanks for all the near-instant feedback so far :)
> Andrew, does the attached still need padding on SMP?

It needs padding _only_ on SMP. ____cacheline_aligned_in_smp.

#define offsetof(t, m) ((int)(&((t *)0)->m))

struct foo {
int a;
int b __attribute__((__aligned__(1024)));
int c;
} foo;

main()
{
printf("%d\n", sizeof(struct foo));
printf("%d\n", offsetof(struct foo, a));
printf("%d\n", offsetof(struct foo, b));
printf("%d\n", offsetof(struct foo, c));
}

./a.out
2048
0
1024
1028

So your patch will do what you want it to do. You should just tag the
first member of a group with ____cacheline_aligned_in_smp, and keep an
eye on things with offsetof().

Not sure why sizeof() returned 2048 though.

2002-12-07 23:26:22

by David Miller

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

From: Andrew Morton <[email protected]>
Date: Sat, 07 Dec 2002 15:29:16 -0800

Jeff Garzik wrote:
> Attached is cut #2. Thanks for all the near-instant feedback so far :)
> Andrew, does the attached still need padding on SMP?

It needs padding _only_ on SMP. ____cacheline_aligned_in_smp.

non-smp machines lack L2 caches? That's new to me :-)

More seriously, there are real benefits on non-SMP systems.

2002-12-07 23:29:52

by Jeff Garzik

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

Andrew Morton wrote:
> It needs padding _only_ on SMP. ____cacheline_aligned_in_smp.
[...]
> So your patch will do what you want it to do. You should just tag the
> first member of a group with ____cacheline_aligned_in_smp, and keep an
> eye on things with offsetof().


thanks.

For this case, though, I want to align on cacheline bounaries even on
UP, right? That's why I picked ____cacheline_aligned. It uses
L1_CACHE_BYTES when !CONFIG_SMP. Other uses of ____cacheline_aligned in
the kernel seem to relate to irq matters, just like my groupings in tg3.h.

[obviously benchmarking can answer some of this, but I want to hammer
out silliness first]

Jeff



2002-12-07 23:30:41

by J.A. Magallon

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment


On 2002.12.08 Andrew Morton wrote:
>Jeff Garzik wrote:
>>
>> David S. Miller wrote:
>> > Can't the cacheline_aligned attribute be applied to individual
>> > struct members? I remember doing this for thread_struct on
>> > sparc ages ago.
>>
>> Looks like it from the 2.4 processor.h code.
>>
>> Attached is cut #2. Thanks for all the near-instant feedback so far :)
>> Andrew, does the attached still need padding on SMP?
>

What do you all think about this:

#include <stdio.h>

#define CACHE_LINE_SIZE 128
#define ____cacheline_aligned __attribute__((__aligned__(CACHE_LINE_SIZE)))

#define __cacheline_start struct { } ____cacheline_aligned;

#define offsetof(t, m) ((int)(&((t *)0)->m))

struct S {
__cacheline_start

int x;

__cacheline_start

int y;
int z;
};

int main()
{
struct S s;

printf("%d\n",sizeof(struct S));
printf("%d\n",offsetof(struct S,x));
printf("%d\n",offsetof(struct S,y));
printf("%d\n",offsetof(struct S,z));
}

werewolf:~> vi kk.c
werewolf:~> kk
256
0
128
132

So you don't have to modify any field, just put __cacheline_start where
needed ? (and does not add any extra sizeof(int) overhead).

--
J.A. Magallon <[email protected]> \ Software is like sex:
werewolf.able.es \ It's better when it's free
Mandrake Linux release 9.1 (Cooker) for i586
Linux 2.4.20-jam1 (gcc 3.2 (Mandrake Linux 9.1 3.2-4mdk))

2002-12-07 23:37:56

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

"J.A. Magallon" wrote:
>
> #define __cacheline_start struct { } ____cacheline_aligned;

That will generate a warning on faster^Wolder versions of gcc.

mnm:/home/akpm> gcc t2.c
t2.c:11: warning: unnamed struct/union that defines no instances
t2.c:15: warning: unnamed struct/union that defines no instances
mnm:/home/akpm> gcc -v
Reading specs from /usr/local/lib/gcc-lib/i686-pc-linux-gnu/2.95.3/specs
gcc version 2.95.3 20010315 (release)

2002-12-07 23:34:28

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

"David S. Miller" wrote:
>
> From: Andrew Morton <[email protected]>
> Date: Sat, 07 Dec 2002 15:29:16 -0800
>
> Jeff Garzik wrote:
> > Attached is cut #2. Thanks for all the near-instant feedback so far :)
> > Andrew, does the attached still need padding on SMP?
>
> It needs padding _only_ on SMP. ____cacheline_aligned_in_smp.
>
> non-smp machines lack L2 caches? That's new to me :-)
>
> More seriously, there are real benefits on non-SMP systems.

Then I am most confused. None of these fields will be put under
busmastering or anything like that, so what advantage is there in
spreading them out?

2002-12-07 23:36:07

by Jeff Garzik

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

J.A. Magallon wrote:
> On 2002.12.08 Andrew Morton wrote:
>
>>Jeff Garzik wrote:
>>
>>>David S. Miller wrote:
>>>
>>>>Can't the cacheline_aligned attribute be applied to individual
>>>>struct members? I remember doing this for thread_struct on
>>>>sparc ages ago.
>>>
>>>Looks like it from the 2.4 processor.h code.
>>>
>>>Attached is cut #2. Thanks for all the near-instant feedback so far :)
>>> Andrew, does the attached still need padding on SMP?
>>
>
> What do you all think about this:
>
> #include <stdio.h>
>
> #define CACHE_LINE_SIZE 128
> #define ____cacheline_aligned __attribute__((__aligned__(CACHE_LINE_SIZE)))
>
> #define __cacheline_start struct { } ____cacheline_aligned;


if you can mark struct members with attributes, as it appears you can,
there's no need to define a struct.

Jeff



2002-12-07 23:44:05

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

Andrew Morton wrote:
>
> "David S. Miller" wrote:
> >
> > From: Andrew Morton <[email protected]>
> > Date: Sat, 07 Dec 2002 15:29:16 -0800
> >
> > Jeff Garzik wrote:
> > > Attached is cut #2. Thanks for all the near-instant feedback so far :)
> > > Andrew, does the attached still need padding on SMP?
> >
> > It needs padding _only_ on SMP. ____cacheline_aligned_in_smp.
> >
> > non-smp machines lack L2 caches? That's new to me :-)
> >
> > More seriously, there are real benefits on non-SMP systems.
>
> Then I am most confused. None of these fields will be put under
> busmastering or anything like that, so what advantage is there in
> spreading them out?

Oh I see what you want - to be able to pick up all the operating fields
in a single fetch.

That will increase the overall cache footprint though. I wonder if
it's really a net win, over just keeping it small.

2002-12-07 23:45:39

by J.A. Magallon

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment


On 2002.12.08 Andrew Morton wrote:
>"J.A. Magallon" wrote:
>>
>> #define __cacheline_start struct { } ____cacheline_aligned;
>
>That will generate a warning on faster^Wolder versions of gcc.
>
>mnm:/home/akpm> gcc t2.c
>t2.c:11: warning: unnamed struct/union that defines no instances
>t2.c:15: warning: unnamed struct/union that defines no instances
>mnm:/home/akpm> gcc -v
>Reading specs from /usr/local/lib/gcc-lib/i686-pc-linux-gnu/2.95.3/specs
>gcc version 2.95.3 20010315 (release)
>

And how 'bout this (do not have any gcc oldie available to test):

#define __cacheline_start union { int :0; } ____cacheline_aligned;

It passes gcc-3.2 -Wall...
I think it's nicer to insert __c_s than to go field by field marking
them...

--
J.A. Magallon <[email protected]> \ Software is like sex:
werewolf.able.es \ It's better when it's free
Mandrake Linux release 9.1 (Cooker) for i586
Linux 2.4.20-jam1 (gcc 3.2 (Mandrake Linux 9.1 3.2-4mdk))

2002-12-08 01:07:02

by Daniel Jacobowitz

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

On Sat, Dec 07, 2002 at 03:29:16PM -0800, Andrew Morton wrote:
> Jeff Garzik wrote:
> >
> > David S. Miller wrote:
> > > Can't the cacheline_aligned attribute be applied to individual
> > > struct members? I remember doing this for thread_struct on
> > > sparc ages ago.
> >
> > Looks like it from the 2.4 processor.h code.
> >
> > Attached is cut #2. Thanks for all the near-instant feedback so far :)
> > Andrew, does the attached still need padding on SMP?
>
> It needs padding _only_ on SMP. ____cacheline_aligned_in_smp.
>
> #define offsetof(t, m) ((int)(&((t *)0)->m))
>
> struct foo {
> int a;
> int b __attribute__((__aligned__(1024)));
> int c;
> } foo;
>
> main()
> {
> printf("%d\n", sizeof(struct foo));
> printf("%d\n", offsetof(struct foo, a));
> printf("%d\n", offsetof(struct foo, b));
> printf("%d\n", offsetof(struct foo, c));
> }
>
> ./a.out
> 2048
> 0
> 1024
> 1028
>
> So your patch will do what you want it to do. You should just tag the
> first member of a group with ____cacheline_aligned_in_smp, and keep an
> eye on things with offsetof().
>
> Not sure why sizeof() returned 2048 though.

The structure contains an __aligned__(1024) item. Think about an array
of 'struct foo' items. They have to be 2048 bytes or you won't align
correctly.

C allows for empty space in structure padding, but not in arrays,
AFAIK.


--
Daniel Jacobowitz
MontaVista Software Debian GNU/Linux Developer

2002-12-08 19:56:27

by David Miller

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

From: Andrew Morton <[email protected]>
Date: Sat, 07 Dec 2002 15:42:00 -0800

"David S. Miller" wrote:
> non-smp machines lack L2 caches? That's new to me :-)
>
> More seriously, there are real benefits on non-SMP systems.

Then I am most confused. None of these fields will be put under
busmastering or anything like that, so what advantage is there in
spreading them out?

When you are in the "tx path" you'll take one L2 cache miss
to bring all the necessary information into the cpu's caches.

Otherwise, when data is arbitrarily scattered over multiple L2
cache lines, you'll need to service potentially more L2 cache
misses.

This optimization has nothing to do with false data sharing amoungst
multiple processors. It's about packing the data accesses optimally
for specific code paths.

2002-12-15 18:23:24

by Jes Sorensen

[permalink] [raw]
Subject: Re: [RFC][PATCH] net drivers and cache alignment

>>>>> "Andrew" == Andrew Morton <[email protected]> writes:

Andrew> Andrew Morton wrote:
>> Then I am most confused. None of these fields will be put under
>> busmastering or anything like that, so what advantage is there in
>> spreading them out?

Andrew> Oh I see what you want - to be able to pick up all the
Andrew> operating fields in a single fetch.

Andrew> That will increase the overall cache footprint though. I
Andrew> wonder if it's really a net win, over just keeping it small.

There's another case where it matters, I guess one could look at it as
similar to the SMP case, but between CPU and device. Some devices have
producer indices in host memory which they update whenever it
receiving a packet. By putting that seperate from TX data structures
you avoid the CPU and the NIC fighting over cache lines.

Cheers,
Jes