The structures virtnet_info and receive_queue have byte holes in
middle, and their members could do with some rearranging
(order-of-declaration wise) in order to overcome this.
Rearranging the members helps in:
* elimination the byte holes in the middle of the structures
* reduce the size of the structure (virtnet_info)
* have more members stored in one cache line (as opposed to
unnecessarily crossing the cacheline boundary and spanning
different cachelines)
The analysis was performed using pahole.
These patches may be applied in any order.
Anant Thazhemadam (2):
net: reorder members of virtnet_info for optimization
net: reorder members of receive_queue in virtio_net for optimization
drivers/net/virtio_net.c | 44 +++++++++++++++++++++-------------------
1 file changed, 23 insertions(+), 21 deletions(-)
--
2.25.1
Analysis of the structure virtnet_info using pahole gives the
following stats.
/* size: 256, cachelines: 4, members: 25 */
/* sum members: 245, holes: 3, sum holes: 11 */
/* paddings: 1, sum paddings: 4 */
Reordering the order in which the members of virtnet_info are declared
helps in packing byte holes in the middle of virtnet_info, reduce the
size required by the structure by 8 bytes, and also allows members to be
stored without overstepping the boundaries of a cacheline (for a
cacheline of size 64bytes) unnecessarily.
Analysis using pahole post-reordering of members gives the following
stats.
/* size: 248, cachelines: 4, members: 25 */
/* padding: 3 */
/* paddings: 1, sum paddings: 4 */
/* last cacheline: 56 bytes */
Signed-off-by: Anant Thazhemadam <[email protected]>
---
The complete analysis done by pahole can be found below.
Before the change:
struct virtnet_info {
struct virtio_device * vdev; /* 0 8 */
struct virtqueue * cvq; /* 8 8 */
struct net_device * dev; /* 16 8 */
struct send_queue * sq; /* 24 8 */
struct receive_queue * rq; /* 32 8 */
unsigned int status; /* 40 4 */
u16 max_queue_pairs; /* 44 2 */
u16 curr_queue_pairs; /* 46 2 */
u16 xdp_queue_pairs; /* 48 2 */
bool big_packets; /* 50 1 */
bool mergeable_rx_bufs; /* 51 1 */
bool has_cvq; /* 52 1 */
bool any_header_sg; /* 53 1 */
u8 hdr_len; /* 54 1 */
/* XXX 1 byte hole, try to pack */
struct delayed_work refill; /* 56 88 */
/* XXX last struct has 4 bytes of padding */
/* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */
struct work_struct config_work; /* 144 32 */
bool affinity_hint_set; /* 176 1 */
/* XXX 7 bytes hole, try to pack */
struct hlist_node node; /* 184 16 */
/* --- cacheline 3 boundary (192 bytes) was 8 bytes ago --- */
struct hlist_node node_dead; /* 200 16 */
struct control_buf * ctrl; /* 216 8 */
u8 duplex; /* 224 1 */
/* XXX 3 bytes hole, try to pack */
u32 speed; /* 228 4 */
long unsigned int guest_offloads; /* 232 8 */
long unsigned int guest_offloads_capable; /* 240 8 */
struct failover * failover; /* 248 8 */
/* size: 256, cachelines: 4, members: 25 */
/* sum members: 245, holes: 3, sum holes: 11 */
/* paddings: 1, sum paddings: 4 */
};
After the Change:
struct virtnet_info {
struct virtio_device * vdev; /* 0 8 */
struct virtqueue * cvq; /* 8 8 */
struct net_device * dev; /* 16 8 */
struct send_queue * sq; /* 24 8 */
struct receive_queue * rq; /* 32 8 */
unsigned int status; /* 40 4 */
u16 max_queue_pairs; /* 44 2 */
u16 curr_queue_pairs; /* 46 2 */
u16 xdp_queue_pairs; /* 48 2 */
bool big_packets; /* 50 1 */
bool mergeable_rx_bufs; /* 51 1 */
bool has_cvq; /* 52 1 */
bool any_header_sg; /* 53 1 */
bool affinity_hint_set; /* 54 1 */
u8 hdr_len; /* 55 1 */
struct control_buf * ctrl; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct work_struct config_work; /* 64 32 */
struct hlist_node node; /* 96 16 */
struct hlist_node node_dead; /* 112 16 */
/* --- cacheline 2 boundary (128 bytes) --- */
long unsigned int guest_offloads; /* 128 8 */
long unsigned int guest_offloads_capable; /* 136 8 */
struct failover * failover; /* 144 8 */
struct delayed_work refill; /* 152 88 */
/* XXX last struct has 4 bytes of padding */
/* --- cacheline 3 boundary (192 bytes) was 48 bytes ago --- */
u32 speed; /* 240 4 */
u8 duplex; /* 244 1 */
/* size: 248, cachelines: 4, members: 25 */
/* padding: 3 */
/* paddings: 1, sum paddings: 4 */
/* last cacheline: 56 bytes */
};
It can be seen that the size has reduced by 8 bytes, and the holes have been eliminated
as well. Also, more members of virtnet_info are accomodated within one cacheline
(without unnecessarily crossing over the cacheline boundary).
drivers/net/virtio_net.c | 42 ++++++++++++++++++++--------------------
1 file changed, 21 insertions(+), 21 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 263b005981bd..32747f1980ae 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -137,29 +137,29 @@ struct receive_queue {
struct napi_struct napi;
+ /* Name of this receive queue: input.$index */
+ char name[40];
+
struct bpf_prog __rcu *xdp_prog;
struct virtnet_rq_stats stats;
+ /* RX: fragments + linear part + virtio header */
+ struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+ /* Page frag for packet buffer allocation. */
+ struct page_frag alloc_frag;
+
/* Chain pages by the private ptr. */
struct page *pages;
/* Average packet length for mergeable receive buffers. */
struct ewma_pkt_len mrg_avg_pkt_len;
- /* Page frag for packet buffer allocation. */
- struct page_frag alloc_frag;
-
- /* RX: fragments + linear part + virtio header */
- struct scatterlist sg[MAX_SKB_FRAGS + 2];
+ struct xdp_rxq_info xdp_rxq;
/* Min single buffer size for mergeable buffers case. */
unsigned int min_buf_len;
-
- /* Name of this receive queue: input.$index */
- char name[40];
-
- struct xdp_rxq_info xdp_rxq;
};
/* Control VQ buffers: protected by the rtnl lock */
@@ -202,33 +202,33 @@ struct virtnet_info {
/* Host can handle any s/g split between our header and packet data */
bool any_header_sg;
+ /* Does the affinity hint is set for virtqueues? */
+ bool affinity_hint_set;
+
/* Packet virtio header size */
u8 hdr_len;
- /* Work struct for refilling if we run low on memory. */
- struct delayed_work refill;
+ struct control_buf *ctrl;
/* Work struct for config space updates */
struct work_struct config_work;
- /* Does the affinity hint is set for virtqueues? */
- bool affinity_hint_set;
-
/* CPU hotplug instances for online & dead */
struct hlist_node node;
struct hlist_node node_dead;
- struct control_buf *ctrl;
-
- /* Ethtool settings */
- u8 duplex;
- u32 speed;
-
unsigned long guest_offloads;
unsigned long guest_offloads_capable;
/* failover when STANDBY feature enabled */
struct failover *failover;
+
+ /* Work struct for refilling if we run low on memory. */
+ struct delayed_work refill;
+
+ /* Ethtool settings */
+ u32 speed;
+ u8 duplex;
};
struct padded_vnet_hdr {
--
2.25.1
Analysis of the structure receive_queue using pahole gives the
following stats.
/* size: 1280, cachelines: 20, members: 11 */
/* sum members: 1220, holes: 1, sum holes: 60 */
/* paddings: 2, sum paddings: 44 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 60 */
Reordering the order in which the members of receive_queue are declared
helps in packing byte holes in the middle of receive_queue, and also
allows more members to be fully stored in a cacheline (of size 64bytes)
without overstepping over cachelines unnecessarily.
Analysis using pahole post-reordering of members gives us the following
stats.
/* size: 1280, cachelines: 20, members: 11 */
/* padding: 60 */
/* paddings: 2, sum paddings: 44 */
/* forced alignments: 2 */
Signed-off-by: Anant Thazhemadam <[email protected]>
---
The complete analysis done by pahole can be found below.
Before the change:
struct receive_queue {
struct virtqueue * vq; /* 0 8 */
struct napi_struct napi __attribute__((__aligned__(8))); /* 8 392 */
/* XXX last struct has 4 bytes of padding */
/* --- cacheline 6 boundary (384 bytes) was 16 bytes ago --- */
struct bpf_prog * xdp_prog; /* 400 8 */
struct virtnet_rq_stats stats; /* 408 64 */
/* --- cacheline 7 boundary (448 bytes) was 24 bytes ago --- */
struct page * pages; /* 472 8 */
struct ewma_pkt_len mrg_avg_pkt_len; /* 480 8 */
struct page_frag alloc_frag; /* 488 16 */
struct scatterlist sg[19]; /* 504 608 */
/* --- cacheline 17 boundary (1088 bytes) was 24 bytes ago --- */
unsigned int min_buf_len; /* 1112 4 */
char name[40]; /* 1116 40 */
/* XXX 60 bytes hole, try to pack */
/* --- cacheline 19 boundary (1216 bytes) --- */
struct xdp_rxq_info xdp_rxq __attribute__((__aligned__(64))); /* 1216 64 */
/* XXX last struct has 40 bytes of padding */
/* size: 1280, cachelines: 20, members: 11 */
/* sum members: 1220, holes: 1, sum holes: 60 */
/* paddings: 2, sum paddings: 44 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 60 */
} __attribute__((__aligned__(64)));
After the change:
struct receive_queue {
struct virtqueue * vq; /* 0 8 */
struct napi_struct napi __attribute__((__aligned__(8))); /* 8 392 */
/* XXX last struct has 4 bytes of padding */
/* --- cacheline 6 boundary (384 bytes) was 16 bytes ago --- */
char name[40]; /* 400 40 */
struct bpf_prog * xdp_prog; /* 440 8 */
/* --- cacheline 7 boundary (448 bytes) --- */
struct virtnet_rq_stats stats; /* 448 64 */
/* --- cacheline 8 boundary (512 bytes) --- */
struct scatterlist sg[19]; /* 512 608 */
/* --- cacheline 17 boundary (1088 bytes) was 32 bytes ago --- */
struct page_frag alloc_frag; /* 1120 16 */
struct page * pages; /* 1136 8 */
struct ewma_pkt_len mrg_avg_pkt_len; /* 1144 8 */
/* --- cacheline 18 boundary (1152 bytes) --- */
struct xdp_rxq_info xdp_rxq __attribute__((__aligned__(64))); /* 1152 64 */
/* XXX last struct has 40 bytes of padding */
/* --- cacheline 19 boundary (1216 bytes) --- */
unsigned int min_buf_len; /* 1216 4 */
/* size: 1280, cachelines: 20, members: 11 */
/* padding: 60 */
/* paddings: 2, sum paddings: 44 */
/* forced alignments: 2 */
} __attribute__((__aligned__(64)));
It can be observed that the holes have been eliminated.
Also, more members of virtnet_info are accomodated within a cacheline (instead of
unnecessarily crossing over the cacheline boundary).
There is a padding of 60 performed at the end since the min_buf_len is only of
size 4, and xdp_rxq is of size 64. If declared anywhere else other than at the
end, a 60 bytes hole would open up again.
drivers/net/virtio_net.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f7bd85001cf0..b52db0b4879a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -137,29 +137,29 @@ struct receive_queue {
struct napi_struct napi;
+ /* Name of this receive queue: input.$index */
+ char name[40];
+
struct bpf_prog __rcu *xdp_prog;
struct virtnet_rq_stats stats;
+ /* RX: fragments + linear part + virtio header */
+ struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+ /* Page frag for packet buffer allocation. */
+ struct page_frag alloc_frag;
+
/* Chain pages by the private ptr. */
struct page *pages;
/* Average packet length for mergeable receive buffers. */
struct ewma_pkt_len mrg_avg_pkt_len;
- /* Page frag for packet buffer allocation. */
- struct page_frag alloc_frag;
-
- /* RX: fragments + linear part + virtio header */
- struct scatterlist sg[MAX_SKB_FRAGS + 2];
+ struct xdp_rxq_info xdp_rxq;
/* Min single buffer size for mergeable buffers case. */
unsigned int min_buf_len;
-
- /* Name of this receive queue: input.$index */
- char name[40];
-
- struct xdp_rxq_info xdp_rxq;
};
/* Control VQ buffers: protected by the rtnl lock */
--
2.25.1
From: Anant Thazhemadam <[email protected]>
Date: Wed, 30 Sep 2020 10:47:20 +0530
> The structures virtnet_info and receive_queue have byte holes in
> middle, and their members could do with some rearranging
> (order-of-declaration wise) in order to overcome this.
>
> Rearranging the members helps in:
> * elimination the byte holes in the middle of the structures
> * reduce the size of the structure (virtnet_info)
> * have more members stored in one cache line (as opposed to
> unnecessarily crossing the cacheline boundary and spanning
> different cachelines)
>
> The analysis was performed using pahole.
>
> These patches may be applied in any order.
What effects do these changes have on performance?
The cache locality for various TX and RX paths could be effected.
I'm not applying these patches without some data on the performance
impact.
Thank you.
On Fri, Oct 02, 2020 at 07:06:38PM -0700, David Miller wrote:
> From: Anant Thazhemadam <[email protected]>
> Date: Wed, 30 Sep 2020 10:47:20 +0530
>
> > The structures virtnet_info and receive_queue have byte holes in
> > middle, and their members could do with some rearranging
> > (order-of-declaration wise) in order to overcome this.
> >
> > Rearranging the members helps in:
> > * elimination the byte holes in the middle of the structures
> > * reduce the size of the structure (virtnet_info)
> > * have more members stored in one cache line (as opposed to
> > unnecessarily crossing the cacheline boundary and spanning
> > different cachelines)
> >
> > The analysis was performed using pahole.
> >
> > These patches may be applied in any order.
>
> What effects do these changes have on performance?
>
> The cache locality for various TX and RX paths could be effected.
>
> I'm not applying these patches without some data on the performance
> impact.
>
> Thank you.
Agree wholeheartedly.
--
MST