In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally writing across neighboring fields.
Use flexible arrays instead of zero-element arrays (which look like they
are always overflowing) and split the cross-field memcpy() into two halves
that can be appropriately bounds-checked by the compiler.
We were doing:
#define ETH_HLEN 14
#define VLAN_HLEN 4
...
#define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN)
...
struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi);
...
struct mlx5_wqe_eth_seg *eseg = &wqe->eth;
struct mlx5_wqe_data_seg *dseg = wqe->data;
...
memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE);
target is wqe->eth.inline_hdr.start (which the compiler sees as being
2 bytes in size), but copying 18, intending to write across start
(really vlan_tci, 2 bytes). The remaining 16 bytes get written into
wqe->data[0], covering byte_count (4 bytes), lkey (4 bytes), and addr
(8 bytes).
struct mlx5e_tx_wqe {
struct mlx5_wqe_ctrl_seg ctrl; /* 0 16 */
struct mlx5_wqe_eth_seg eth; /* 16 16 */
struct mlx5_wqe_data_seg data[]; /* 32 0 */
/* size: 32, cachelines: 1, members: 3 */
/* last cacheline: 32 bytes */
};
struct mlx5_wqe_eth_seg {
u8 swp_outer_l4_offset; /* 0 1 */
u8 swp_outer_l3_offset; /* 1 1 */
u8 swp_inner_l4_offset; /* 2 1 */
u8 swp_inner_l3_offset; /* 3 1 */
u8 cs_flags; /* 4 1 */
u8 swp_flags; /* 5 1 */
__be16 mss; /* 6 2 */
__be32 flow_table_metadata; /* 8 4 */
union {
struct {
__be16 sz; /* 12 2 */
u8 start[2]; /* 14 2 */
} inline_hdr; /* 12 4 */
struct {
__be16 type; /* 12 2 */
__be16 vlan_tci; /* 14 2 */
} insert; /* 12 4 */
__be32 trailer; /* 12 4 */
}; /* 12 4 */
/* size: 16, cachelines: 1, members: 9 */
/* last cacheline: 16 bytes */
};
struct mlx5_wqe_data_seg {
__be32 byte_count; /* 0 4 */
__be32 lkey; /* 4 4 */
__be64 addr; /* 8 8 */
/* size: 16, cachelines: 1, members: 3 */
/* last cacheline: 16 bytes */
};
So, split the memcpy() so the compiler can reason about the buffer
sizes.
"pahole" shows no size nor member offset changes to struct mlx5e_tx_wqe
nor struct mlx5e_umr_wqe. "objdump -d" shows no meaningful object
code changes (i.e. only source line number induced differences and
optimizations).
Cc: Saeed Mahameed <[email protected]>
Cc: Leon Romanovsky <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Jakub Kicinski <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Kees Cook <[email protected]>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 ++--
drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 4 +++-
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 35668986878a..40af561c98d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -201,7 +201,7 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
struct mlx5e_tx_wqe {
struct mlx5_wqe_ctrl_seg ctrl;
struct mlx5_wqe_eth_seg eth;
- struct mlx5_wqe_data_seg data[0];
+ struct mlx5_wqe_data_seg data[];
};
struct mlx5e_rx_wqe_ll {
@@ -217,7 +217,7 @@ struct mlx5e_umr_wqe {
struct mlx5_wqe_ctrl_seg ctrl;
struct mlx5_wqe_umr_ctrl_seg uctrl;
struct mlx5_mkey_seg mkc;
- struct mlx5_mtt inline_mtts[0];
+ struct mlx5_mtt inline_mtts[];
};
extern const char mlx5e_self_tests[][ETH_GSTRING_LEN];
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 2f0df5cc1a2d..efae2444c26f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -341,8 +341,10 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
/* copy the inline part if required */
if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
- memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE);
+ memcpy(eseg->inline_hdr.start, xdptxd->data, sizeof(eseg->inline_hdr.start));
eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+ memcpy(dseg, xdptxd->data + sizeof(eseg->inline_hdr.start),
+ MLX5E_XDP_MIN_INLINE - sizeof(eseg->inline_hdr.start));
dma_len -= MLX5E_XDP_MIN_INLINE;
dma_addr += MLX5E_XDP_MIN_INLINE;
dseg++;
--
2.30.2
On Fri, 2021-08-06 at 14:50 -0700, Kees Cook wrote:
> In preparation for FORTIFY_SOURCE performing compile-time and run-
> time
> field bounds checking for memcpy(), memmove(), and memset(), avoid
> intentionally writing across neighboring fields.
>
> Use flexible arrays instead of zero-element arrays (which look like
> they
> are always overflowing) and split the cross-field memcpy() into two
> halves
> that can be appropriately bounds-checked by the compiler.
>
> We were doing:
>
> #define ETH_HLEN 14
> #define VLAN_HLEN 4
> ...
> #define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN)
> ...
> struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi);
> ...
> struct mlx5_wqe_eth_seg *eseg = &wqe->eth;
> struct mlx5_wqe_data_seg *dseg = wqe->data;
> ...
> memcpy(eseg->inline_hdr.start, xdptxd->data,
> MLX5E_XDP_MIN_INLINE);
>
> target is wqe->eth.inline_hdr.start (which the compiler sees as being
> 2 bytes in size), but copying 18, intending to write across start
> (really vlan_tci, 2 bytes). The remaining 16 bytes get written into
> wqe->data[0], covering byte_count (4 bytes), lkey (4 bytes), and addr
> (8 bytes).
>
> struct mlx5e_tx_wqe {
> struct mlx5_wqe_ctrl_seg ctrl; /* 0
> 16 */
> struct mlx5_wqe_eth_seg eth; /* 16
> 16 */
> struct mlx5_wqe_data_seg data[]; /* 32
> 0 */
>
> /* size: 32, cachelines: 1, members: 3 */
> /* last cacheline: 32 bytes */
> };
>
> struct mlx5_wqe_eth_seg {
> u8 swp_outer_l4_offset; /* 0
> 1 */
> u8 swp_outer_l3_offset; /* 1
> 1 */
> u8 swp_inner_l4_offset; /* 2
> 1 */
> u8 swp_inner_l3_offset; /* 3
> 1 */
> u8 cs_flags; /* 4
> 1 */
> u8 swp_flags; /* 5
> 1 */
> __be16 mss; /* 6
> 2 */
> __be32 flow_table_metadata; /* 8
> 4 */
> union {
> struct {
> __be16 sz; /* 12
> 2 */
> u8 start[2]; /* 14
> 2 */
> } inline_hdr; /* 12
> 4 */
> struct {
> __be16 type; /* 12
> 2 */
> __be16 vlan_tci; /* 14
> 2 */
> } insert; /* 12
> 4 */
> __be32 trailer; /* 12
> 4 */
> }; /* 12
> 4 */
>
> /* size: 16, cachelines: 1, members: 9 */
> /* last cacheline: 16 bytes */
> };
>
> struct mlx5_wqe_data_seg {
> __be32 byte_count; /* 0
> 4 */
> __be32 lkey; /* 4
> 4 */
> __be64 addr; /* 8
> 8 */
>
> /* size: 16, cachelines: 1, members: 3 */
> /* last cacheline: 16 bytes */
> };
>
> So, split the memcpy() so the compiler can reason about the buffer
> sizes.
>
> "pahole" shows no size nor member offset changes to struct
> mlx5e_tx_wqe
> nor struct mlx5e_umr_wqe. "objdump -d" shows no meaningful object
> code changes (i.e. only source line number induced differences and
> optimizations).
>
>
spiting the memcpy doesn't induce any performance degradation ? extra
instruction to copy the 1st 2 bytes ?
[...]
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
why only here ? mlx5 has at least 3 other places where we use this
unbound memcpy ..
> @@ -341,8 +341,10 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq,
> struct mlx5e_xmit_data *xdptxd,
>
> /* copy the inline part if required */
> if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
> - memcpy(eseg->inline_hdr.start, xdptxd->data,
> MLX5E_XDP_MIN_INLINE);
> + memcpy(eseg->inline_hdr.start, xdptxd->data,
> sizeof(eseg->inline_hdr.start));
> eseg->inline_hdr.sz =
> cpu_to_be16(MLX5E_XDP_MIN_INLINE);
> + memcpy(dseg, xdptxd->data + sizeof(eseg-
> >inline_hdr.start),
> + MLX5E_XDP_MIN_INLINE - sizeof(eseg-
> >inline_hdr.start));
> dma_len -= MLX5E_XDP_MIN_INLINE;
> dma_addr += MLX5E_XDP_MIN_INLINE;
> dseg++;
On Fri, 2021-08-06 at 15:45 -0700, Kees Cook wrote:
> On Fri, Aug 06, 2021 at 03:17:56PM -0700, Saeed Mahameed wrote:
> > On Fri, 2021-08-06 at 14:50 -0700, Kees Cook wrote:
> > > [...]
> > > So, split the memcpy() so the compiler can reason about the buffer
> > > sizes.
> > >
> > > "pahole" shows no size nor member offset changes to struct > >
> > > mlx5e_tx_wqe
> > > nor struct mlx5e_umr_wqe. "objdump -d" shows no meaningful object
> > > code changes (i.e. only source line number induced differences and
> > > optimizations).
> >
> > spiting the memcpy doesn't induce any performance degradation ? extra
> > instruction to copy the 1st 2 bytes ?
>
> Not meaningfully, but strictly speaking, yes, it's a different series
> of
> instructions.
>
> > [...]
> > > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> > > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> >
> > why only here ? mlx5 has at least 3 other places where we use this
> > unbound memcpy ..
>
> Can you point them out? I've been fixing only the ones I've been able
> to
> find through instrumentation (generally speaking, those with fixed
> sizes).
>
we will need to examine each change carefully to look for performance
degradation and maybe run some micro-benchmark tests in house before i
can ack this patch.
$ git grep -n "eseg->inline_hdr.start"
drivers/infiniband/hw/mlx5/wr.c:129: copysz = min_t(u64,
*cur_edge - (void *)eseg->inline_hdr.start,
drivers/infiniband/hw/mlx5/wr.c:131: memcpy(eseg-
>inline_hdr.start, pdata, copysz);
drivers/infiniband/hw/mlx5/wr.c:133:
sizeof(eseg->inline_hdr.start) + copysz, 16);
drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c:344:
memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE);
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c:510:
mlx5e_insert_vlan(eseg->inline_hdr.start, skb, attr->ihs);
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c:514:
memcpy(eseg->inline_hdr.start, skb->data, attr->ihs);
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c:1033:
memcpy(eseg->inline_hdr.start, skb->data, attr.ihs);
On Fri, Aug 06, 2021 at 03:17:56PM -0700, Saeed Mahameed wrote:
> On Fri, 2021-08-06 at 14:50 -0700, Kees Cook wrote:
> > [...]
> > So, split the memcpy() so the compiler can reason about the buffer
> > sizes.
> >
> > "pahole" shows no size nor member offset changes to struct > > mlx5e_tx_wqe
> > nor struct mlx5e_umr_wqe. "objdump -d" shows no meaningful object
> > code changes (i.e. only source line number induced differences and
> > optimizations).
>
> spiting the memcpy doesn't induce any performance degradation ? extra
> instruction to copy the 1st 2 bytes ?
Not meaningfully, but strictly speaking, yes, it's a different series of
instructions.
> [...]
> > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
>
> why only here ? mlx5 has at least 3 other places where we use this
> unbound memcpy ..
Can you point them out? I've been fixing only the ones I've been able to
find through instrumentation (generally speaking, those with fixed
sizes).
--
Kees Cook