Reallocate the skb if there is not enough space to manage the A-MSDU rx packets.
Do not always copy the first part of received frames if A-MSDU is enabled
for SG capable devices
Changes since v2:
- simplify mt76u_build_rx_skb
- add patch 2/3: mt76u: introduce mt76u_ep data structure
- align usb buffer size to usb max endpoint length
- set buf_size to PAGE_SIZE even for sg case
Changes since v1:
- do not allocate multiple page buffers but rely on fragmented skbs
if there is not enough space to manage the A-MSDU rx packets
Lorenzo Bianconi (3):
mt76: usb: fix rx A-MSDU support
mt76: mt76u: introduce mt76u_ep data structure
mt76: usb: do not always copy the first part of received frames
drivers/net/wireless/mediatek/mt76/mt76.h | 17 +++--
drivers/net/wireless/mediatek/mt76/usb.c | 75 +++++++++++++++++------
2 files changed, 67 insertions(+), 25 deletions(-)
--
2.21.0
Commit f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes
for rx") breaks A-MSDU support. When A-MSDU is enabled the device can
receive frames up to q->buf_size, but they will be discarded in
mt76u_process_rx_entry since there is not enough room for
skb_shared_info. Fix the issue by reallocating the skb, copying the
first 128B of the received frame into the linear area and attaching the
remaining part as a page fragment (skb_add_rx_frag).
Fixes: f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes for rx")
Signed-off-by: Lorenzo Bianconi <[email protected]>
---
drivers/net/wireless/mediatek/mt76/mt76.h | 1 +
drivers/net/wireless/mediatek/mt76/usb.c | 49 ++++++++++++++++++-----
2 files changed, 41 insertions(+), 9 deletions(-)
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 8ecbf81a906f..889b76deb703 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -30,6 +30,7 @@
#define MT_TX_RING_SIZE 256
#define MT_MCU_RING_SIZE 32
#define MT_RX_BUF_SIZE 2048
+#define MT_SKB_HEAD_LEN 128
struct mt76_dev;
struct mt76_wcid;
diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
index bbaa1365bbda..12d60d31cb51 100644
--- a/drivers/net/wireless/mediatek/mt76/usb.c
+++ b/drivers/net/wireless/mediatek/mt76/usb.c
@@ -429,6 +429,45 @@ static int mt76u_get_rx_entry_len(u8 *data, u32 data_len)
return dma_len;
}
+static struct sk_buff *
+mt76u_build_rx_skb(u8 *data, int len, int buf_size)
+{
+ struct sk_buff *skb;
+
+ if (SKB_WITH_OVERHEAD(buf_size) < MT_DMA_HDR_LEN + len) {
+ struct page *page;
+ int offset;
+
+ /* slow path, not enough space for data and
+ * skb_shared_info
+ */
+ skb = alloc_skb(MT_SKB_HEAD_LEN, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ skb_put_data(skb, data + MT_DMA_HDR_LEN, MT_SKB_HEAD_LEN);
+ data += (MT_SKB_HEAD_LEN + MT_DMA_HDR_LEN);
+ page = virt_to_head_page(data);
+ offset = data - (u8 *)page_address(page);
+
+ skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+ page, offset, len - MT_SKB_HEAD_LEN,
+ buf_size);
+
+ return skb;
+ }
+
+ /* fast path */
+ skb = build_skb(data, buf_size);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, MT_DMA_HDR_LEN);
+ __skb_put(skb, len);
+
+ return skb;
+}
+
static int
mt76u_process_rx_entry(struct mt76_dev *dev, struct urb *urb)
{
@@ -446,19 +485,11 @@ mt76u_process_rx_entry(struct mt76_dev *dev, struct urb *urb)
return 0;
data_len = min_t(int, len, data_len - MT_DMA_HDR_LEN);
- if (MT_DMA_HDR_LEN + data_len > SKB_WITH_OVERHEAD(q->buf_size)) {
- dev_err_ratelimited(dev->dev, "rx data too big %d\n", data_len);
- return 0;
- }
-
- skb = build_skb(data, q->buf_size);
+ skb = mt76u_build_rx_skb(data, data_len, q->buf_size);
if (!skb)
return 0;
- skb_reserve(skb, MT_DMA_HDR_LEN);
- __skb_put(skb, data_len);
len -= data_len;
-
while (len > 0 && nsgs < urb->num_sgs) {
data_len = min_t(int, len, urb->sg[nsgs].length);
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
--
2.21.0
Set the usb buffer size taking into account skb_shared_info, in order to
avoid always copying the first part of received frames when A-MSDU is
enabled on SG capable devices. Moreover, align the usb buffer size to
max_ep boundaries and set buf_size to PAGE_SIZE even for the sg case.
Signed-off-by: Lorenzo Bianconi <[email protected]>
---
drivers/net/wireless/mediatek/mt76/usb.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
index 1ee54a9b302e..2ee3f8fa1483 100644
--- a/drivers/net/wireless/mediatek/mt76/usb.c
+++ b/drivers/net/wireless/mediatek/mt76/usb.c
@@ -289,8 +289,10 @@ static int
mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
int nsgs, gfp_t gfp)
{
- int i;
+ int i, data_size;
+ data_size = rounddown(SKB_WITH_OVERHEAD(q->buf_size),
+ dev->usb.in_ep[MT_EP_IN_PKT_RX].max_packet);
for (i = 0; i < nsgs; i++) {
struct page *page;
void *data;
@@ -302,7 +304,7 @@ mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
page = virt_to_head_page(data);
offset = data - page_address(page);
- sg_set_page(&urb->sg[i], page, q->buf_size, offset);
+ sg_set_page(&urb->sg[i], page, data_size, offset);
}
if (i < nsgs) {
@@ -314,7 +316,7 @@ mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
}
urb->num_sgs = max_t(int, i, urb->num_sgs);
- urb->transfer_buffer_length = urb->num_sgs * q->buf_size,
+ urb->transfer_buffer_length = urb->num_sgs * data_size;
sg_init_marker(urb->sg, urb->num_sgs);
return i ? : -ENOMEM;
@@ -611,8 +613,9 @@ static int mt76u_alloc_rx(struct mt76_dev *dev)
if (!q->entry)
return -ENOMEM;
- q->buf_size = dev->usb.sg_en ? MT_RX_BUF_SIZE : PAGE_SIZE;
q->ndesc = MT_NUM_RX_ENTRIES;
+ q->buf_size = PAGE_SIZE;
+
for (i = 0; i < q->ndesc; i++) {
err = mt76u_rx_urb_alloc(dev, &q->entry[i]);
if (err < 0)
--
2.21.0
Introduce the mt76u_ep data structure as a container for usb endpoint info.
This is a preliminary patch to compute the proper usb buffer size and avoid
always copying the first part of received frames.
Signed-off-by: Lorenzo Bianconi <[email protected]>
---
drivers/net/wireless/mediatek/mt76/mt76.h | 16 ++++++++++------
drivers/net/wireless/mediatek/mt76/usb.c | 15 +++++++++------
2 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 889b76deb703..1c51d6d48e60 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -382,6 +382,11 @@ enum mt76u_out_ep {
__MT_EP_OUT_MAX,
};
+struct mt76u_ep {
+ u16 max_packet;
+ u8 ep;
+};
+
#define MT_SG_MAX_SIZE 8
#define MT_NUM_TX_ENTRIES 256
#define MT_NUM_RX_ENTRIES 128
@@ -393,10 +398,8 @@ struct mt76_usb {
struct tasklet_struct rx_tasklet;
struct delayed_work stat_work;
- u8 out_ep[__MT_EP_OUT_MAX];
- u16 out_max_packet;
- u8 in_ep[__MT_EP_IN_MAX];
- u16 in_max_packet;
+ struct mt76u_ep out_ep[__MT_EP_OUT_MAX];
+ struct mt76u_ep in_ep[__MT_EP_IN_MAX];
bool sg_en;
struct mt76u_mcu {
@@ -786,9 +789,10 @@ mt76u_bulk_msg(struct mt76_dev *dev, void *data, int len, int *actual_len,
unsigned int pipe;
if (actual_len)
- pipe = usb_rcvbulkpipe(udev, usb->in_ep[MT_EP_IN_CMD_RESP]);
+ pipe = usb_rcvbulkpipe(udev, usb->in_ep[MT_EP_IN_CMD_RESP].ep);
else
- pipe = usb_sndbulkpipe(udev, usb->out_ep[MT_EP_OUT_INBAND_CMD]);
+ pipe = usb_sndbulkpipe(udev,
+ usb->out_ep[MT_EP_OUT_INBAND_CMD].ep);
return usb_bulk_msg(udev, pipe, data, len, actual_len, timeout);
}
diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
index 12d60d31cb51..1ee54a9b302e 100644
--- a/drivers/net/wireless/mediatek/mt76/usb.c
+++ b/drivers/net/wireless/mediatek/mt76/usb.c
@@ -260,19 +260,22 @@ mt76u_set_endpoints(struct usb_interface *intf,
struct usb_host_interface *intf_desc = intf->cur_altsetting;
struct usb_endpoint_descriptor *ep_desc;
int i, in_ep = 0, out_ep = 0;
+ struct mt76u_ep *ep;
for (i = 0; i < intf_desc->desc.bNumEndpoints; i++) {
ep_desc = &intf_desc->endpoint[i].desc;
if (usb_endpoint_is_bulk_in(ep_desc) &&
in_ep < __MT_EP_IN_MAX) {
- usb->in_ep[in_ep] = usb_endpoint_num(ep_desc);
- usb->in_max_packet = usb_endpoint_maxp(ep_desc);
+ ep = &usb->in_ep[in_ep];
+ ep->max_packet = usb_endpoint_maxp(ep_desc);
+ ep->ep = usb_endpoint_num(ep_desc);
in_ep++;
} else if (usb_endpoint_is_bulk_out(ep_desc) &&
out_ep < __MT_EP_OUT_MAX) {
- usb->out_ep[out_ep] = usb_endpoint_num(ep_desc);
- usb->out_max_packet = usb_endpoint_maxp(ep_desc);
+ ep = &usb->out_ep[out_ep];
+ ep->max_packet = usb_endpoint_maxp(ep_desc);
+ ep->ep = usb_endpoint_num(ep_desc);
out_ep++;
}
}
@@ -386,9 +389,9 @@ mt76u_fill_bulk_urb(struct mt76_dev *dev, int dir, int index,
unsigned int pipe;
if (dir == USB_DIR_IN)
- pipe = usb_rcvbulkpipe(udev, dev->usb.in_ep[index]);
+ pipe = usb_rcvbulkpipe(udev, dev->usb.in_ep[index].ep);
else
- pipe = usb_sndbulkpipe(udev, dev->usb.out_ep[index]);
+ pipe = usb_sndbulkpipe(udev, dev->usb.out_ep[index].ep);
urb->dev = udev;
urb->pipe = pipe;
--
2.21.0
On Thu, Jun 13, 2019 at 11:43:11PM +0200, Lorenzo Bianconi wrote:
> Commit f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes
> for rx") breaks A-MSDU support. When A-MSDU is enable the device can
> receive frames up to q->buf_size but they will be discarded in
> mt76u_process_rx_entry since there is no enough room for
> skb_shared_info. Fix the issue reallocating the skb and copying in the
> linear area the first 128B of the received frames and in the frag_list
> the remaining part.
>
> Fixes: f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes for rx")
> Signed-off-by: Lorenzo Bianconi <[email protected]>
> ---
> drivers/net/wireless/mediatek/mt76/mt76.h | 1 +
> drivers/net/wireless/mediatek/mt76/usb.c | 49 ++++++++++++++++++-----
> 2 files changed, 41 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
> index 8ecbf81a906f..889b76deb703 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt76.h
> +++ b/drivers/net/wireless/mediatek/mt76/mt76.h
> @@ -30,6 +30,7 @@
> #define MT_TX_RING_SIZE 256
> #define MT_MCU_RING_SIZE 32
> #define MT_RX_BUF_SIZE 2048
> +#define MT_SKB_HEAD_LEN 128
>
> struct mt76_dev;
> struct mt76_wcid;
> diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
> index bbaa1365bbda..12d60d31cb51 100644
> --- a/drivers/net/wireless/mediatek/mt76/usb.c
> +++ b/drivers/net/wireless/mediatek/mt76/usb.c
> @@ -429,6 +429,45 @@ static int mt76u_get_rx_entry_len(u8 *data, u32 data_len)
> return dma_len;
> }
>
> +static struct sk_buff *
> +mt76u_build_rx_skb(u8 *data, int len, int buf_size)
> +{
> + struct sk_buff *skb;
> +
> + if (SKB_WITH_OVERHEAD(buf_size) < MT_DMA_HDR_LEN + len) {
> + struct page *page;
> + int offset;
> +
> + /* slow path, not enough space for data and
> + * skb_shared_info
> + */
> + skb = alloc_skb(MT_SKB_HEAD_LEN, GFP_ATOMIC);
> + if (!skb)
> + return NULL;
> +
> + skb_put_data(skb, data + MT_DMA_HDR_LEN, MT_SKB_HEAD_LEN);
I looked at how rx A-MSDU is processed in mac80211: it is decomposed and
copied into newly allocated individual skbs, see ieee80211_amsdu_to_8023s().
So copying the L3 & L4 headers doesn't do anything good here; it actually
seems better to keep them in the frag, as __ieee80211_amsdu_copy_frag() does
some magic to avoid the copy.
Stanislaw
On Thu, Jun 13, 2019 at 11:43:13PM +0200, Lorenzo Bianconi wrote:
> Set usb buffer size taking into account skb_shared_info in order to
> not always copy the first part of received frames if A-MSDU is enabled
> for SG capable devices. Moreover align usb buffer size to max_ep
> boundaries and set buf_size to PAGE_SIZE even for sg case
I think this should not be applied to wireless-drivers; only the first patch,
which fixes the bug, should go there, and optimizations should be done in -next.
> + int i, data_size;
>
> + data_size = rounddown(SKB_WITH_OVERHEAD(q->buf_size),
> + dev->usb.in_ep[MT_EP_IN_PKT_RX].max_packet);
> for (i = 0; i < nsgs; i++) {
> struct page *page;
> void *data;
> @@ -302,7 +304,7 @@ mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
>
> page = virt_to_head_page(data);
> offset = data - page_address(page);
> - sg_set_page(&urb->sg[i], page, q->buf_size, offset);
> + sg_set_page(&urb->sg[i], page, data_size, offset);
<snip>
> - q->buf_size = dev->usb.sg_en ? MT_RX_BUF_SIZE : PAGE_SIZE;
> q->ndesc = MT_NUM_RX_ENTRIES;
> + q->buf_size = PAGE_SIZE;
> +
This should be accompanied by a decrease of MT_SG_MAX_SIZE to the value that
is actually needed, which currently is 2 for 4k A-MSDU.
However, I don't think allocating 2 pages to avoid copying the ieee80211 header
and SNAP is worth doing. For me the best approach would be to allocate 1 page
for 4k A-MSDU, 2 for 8k and 3 for 12k (still using sg, but without the
data_size change that is meant to avoid the 32B copy).
Stanislaw
> On Thu, Jun 13, 2019 at 11:43:11PM +0200, Lorenzo Bianconi wrote:
> > Commit f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes
> > for rx") breaks A-MSDU support. When A-MSDU is enable the device can
> > receive frames up to q->buf_size but they will be discarded in
> > mt76u_process_rx_entry since there is no enough room for
> > skb_shared_info. Fix the issue reallocating the skb and copying in the
> > linear area the first 128B of the received frames and in the frag_list
> > the remaining part.
> >
> > Fixes: f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes for rx")
> > Signed-off-by: Lorenzo Bianconi <[email protected]>
> > ---
> > drivers/net/wireless/mediatek/mt76/mt76.h | 1 +
> > drivers/net/wireless/mediatek/mt76/usb.c | 49 ++++++++++++++++++-----
> > 2 files changed, 41 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
> > index 8ecbf81a906f..889b76deb703 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt76.h
> > +++ b/drivers/net/wireless/mediatek/mt76/mt76.h
> > @@ -30,6 +30,7 @@
> > #define MT_TX_RING_SIZE 256
> > #define MT_MCU_RING_SIZE 32
> > #define MT_RX_BUF_SIZE 2048
> > +#define MT_SKB_HEAD_LEN 128
> >
> > struct mt76_dev;
> > struct mt76_wcid;
> > diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
> > index bbaa1365bbda..12d60d31cb51 100644
> > --- a/drivers/net/wireless/mediatek/mt76/usb.c
> > +++ b/drivers/net/wireless/mediatek/mt76/usb.c
> > @@ -429,6 +429,45 @@ static int mt76u_get_rx_entry_len(u8 *data, u32 data_len)
> > return dma_len;
> > }
> >
> > +static struct sk_buff *
> > +mt76u_build_rx_skb(u8 *data, int len, int buf_size)
> > +{
> > + struct sk_buff *skb;
> > +
> > + if (SKB_WITH_OVERHEAD(buf_size) < MT_DMA_HDR_LEN + len) {
> > + struct page *page;
> > + int offset;
> > +
> > + /* slow path, not enough space for data and
> > + * skb_shared_info
> > + */
> > + skb = alloc_skb(MT_SKB_HEAD_LEN, GFP_ATOMIC);
> > + if (!skb)
> > + return NULL;
> > +
> > + skb_put_data(skb, data + MT_DMA_HDR_LEN, MT_SKB_HEAD_LEN);
>
> I looked how rx amsdu is processed in mac80211 and it is decomposed and
> copied into newly allocated individual skb's, see ieee80211_amsdu_to_8023s()
>
> So copy L3 & L4 headers doesn't do anything good here, actually seems to
> be better to have them in frag as __ieee80211_amsdu_copy_frag() do some
> magic to avid copy.
Looking at __ieee80211_amsdu_copy() now I got why other drivers copy hdrlen +
8, thx :)
In our case reuse_frag is true in __ieee80211_amsdu_copy, so we will end up
copying 32B + ether_len. Anyway I think 32 is a little bit too low and we could get
better performances increasing it a little bit.
A typical use case (e.g IPv6 + TCP):
IPv6 = 40B, TCP = 32B --> so 72B..I guess 128B is a good value :)
@Felix, Johannes: what do you think?
Regarding the patch I guess let's apply it as it is in order to fix the pending
issue and then we will figure out how to proceed (copy hdr_len + 3 or increase
the value in __ieee80211_amsdu_copy)
Regards,
Lorenzo
>
> Stanislaw
On Fri, 2019-06-14 at 12:11 +0200, Lorenzo Bianconi wrote:
> Looking at __ieee80211_amsdu_copy() now I got why other drivers copy hdrlen +
> 8, thx :)
> In our case reuse_frag is true in __ieee80211_amsdu_copy, so we will end up
> copying 32B + ether_len. Anyway I think 32 is a little bit too low and we could get
> better performances increasing it a little bit.
> A typical use case (e.g IPv6 + TCP):
>
> IPv6 = 40B, TCP = 32B --> so 72B..I guess 128B is a good value :)
> @Felix, Johannes: what do you think?
I think while we might *allocate* more, I don't think we should *copy*
more, since then the TCP payload will no longer be in pages.
It'd probably be better to implement leaving enough tailroom (allocate
128), but copying nothing, unless the *entire* packet fits.
johannes
> On Thu, Jun 13, 2019 at 11:43:13PM +0200, Lorenzo Bianconi wrote:
> > Set usb buffer size taking into account skb_shared_info in order to
> > not always copy the first part of received frames if A-MSDU is enabled
> > for SG capable devices. Moreover align usb buffer size to max_ep
> > boundaries and set buf_size to PAGE_SIZE even for sg case
>
> I think this should not be applied to wirless-drivers, only first patch
> that fix the bug and optimizations should be done in -next.
ack, right. I think patch 2/3 and 3/3 can go directly in Felix's tree
>
> > + int i, data_size;
> >
> > + data_size = rounddown(SKB_WITH_OVERHEAD(q->buf_size),
> > + dev->usb.in_ep[MT_EP_IN_PKT_RX].max_packet);
> > for (i = 0; i < nsgs; i++) {
> > struct page *page;
> > void *data;
> > @@ -302,7 +304,7 @@ mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
> >
> > page = virt_to_head_page(data);
> > offset = data - page_address(page);
> > - sg_set_page(&urb->sg[i], page, q->buf_size, offset);
> > + sg_set_page(&urb->sg[i], page, data_size, offset);
> <snip>
> > - q->buf_size = dev->usb.sg_en ? MT_RX_BUF_SIZE : PAGE_SIZE;
> > q->ndesc = MT_NUM_RX_ENTRIES;
> > + q->buf_size = PAGE_SIZE;
> > +
>
> This should be associated with decrease of MT_SG_MAX_SIZE to value that
> is actually needed and currently this is 2 for 4k AMSDU.
MT_SG_MAX_SIZE is used even on tx side and I do not think we will end up with a
huge difference here
>
> However I don't think allocating 2 pages to avoid ieee80211 header and SNAP
> copy is worth to do. For me best approach would be allocate 1 page for
> 4k AMSDU, 2 for 8k and 3 for 12k (still using sg, but without data_size
> change to avoid 32B copying).
From my point of view it is better to avoid copying if it is possible. Are you
sure there is no difference?
Regards,
Lorenzo
>
> Stanislaw
On Fri, Jun 14, 2019 at 12:22:48PM +0200, Lorenzo Bianconi wrote:
> > On Thu, Jun 13, 2019 at 11:43:13PM +0200, Lorenzo Bianconi wrote:
> > > Set usb buffer size taking into account skb_shared_info in order to
> > > not always copy the first part of received frames if A-MSDU is enabled
> > > for SG capable devices. Moreover align usb buffer size to max_ep
> > > boundaries and set buf_size to PAGE_SIZE even for sg case
> >
> > I think this should not be applied to wirless-drivers, only first patch
> > that fix the bug and optimizations should be done in -next.
>
> ack, right. I think patch 2/3 and 3/3 can go directly in Felix's tree
>
> >
> > > + int i, data_size;
> > >
> > > + data_size = rounddown(SKB_WITH_OVERHEAD(q->buf_size),
> > > + dev->usb.in_ep[MT_EP_IN_PKT_RX].max_packet);
> > > for (i = 0; i < nsgs; i++) {
> > > struct page *page;
> > > void *data;
> > > @@ -302,7 +304,7 @@ mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
> > >
> > > page = virt_to_head_page(data);
> > > offset = data - page_address(page);
> > > - sg_set_page(&urb->sg[i], page, q->buf_size, offset);
> > > + sg_set_page(&urb->sg[i], page, data_size, offset);
> > <snip>
> > > - q->buf_size = dev->usb.sg_en ? MT_RX_BUF_SIZE : PAGE_SIZE;
> > > q->ndesc = MT_NUM_RX_ENTRIES;
> > > + q->buf_size = PAGE_SIZE;
> > > +
> >
> > This should be associated with decrease of MT_SG_MAX_SIZE to value that
> > is actually needed and currently this is 2 for 4k AMSDU.
>
> MT_SG_MAX_SIZE is used even on tx side and I do not think we will end up with a
> huge difference here
So use a different value as the argument for mt76u_fill_rx_sg() in
mt76u_rx_urb_alloc(). After changing buf_size to PAGE_SIZE we will
allocate 8 pages per rx queue entry, but only 2 pages will be used
(with the data_size change; 1 without it). Or am I wrong?
> > However I don't think allocating 2 pages to avoid ieee80211 header and SNAP
> > copy is worth to do. For me best approach would be allocate 1 page for
> > 4k AMSDU, 2 for 8k and 3 for 12k (still using sg, but without data_size
> > change to avoid 32B copying).
>
> From my point of view it is better to avoid copying if it is possible. Are you
> sure there is no difference?
I do not understand what you mean by difference here.
Stanislaw
On Fri, Jun 14, 2019 at 12:11:17PM +0200, Lorenzo Bianconi wrote:
> > On Thu, Jun 13, 2019 at 11:43:11PM +0200, Lorenzo Bianconi wrote:
> > > Commit f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes
> > > for rx") breaks A-MSDU support. When A-MSDU is enable the device can
> > > receive frames up to q->buf_size but they will be discarded in
> > > mt76u_process_rx_entry since there is no enough room for
> > > skb_shared_info. Fix the issue reallocating the skb and copying in the
> > > linear area the first 128B of the received frames and in the frag_list
> > > the remaining part.
> > >
> > > Fixes: f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes for rx")
> > > Signed-off-by: Lorenzo Bianconi <[email protected]>
> > > ---
> > > drivers/net/wireless/mediatek/mt76/mt76.h | 1 +
> > > drivers/net/wireless/mediatek/mt76/usb.c | 49 ++++++++++++++++++-----
> > > 2 files changed, 41 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
> > > index 8ecbf81a906f..889b76deb703 100644
> > > --- a/drivers/net/wireless/mediatek/mt76/mt76.h
> > > +++ b/drivers/net/wireless/mediatek/mt76/mt76.h
> > > @@ -30,6 +30,7 @@
> > > #define MT_TX_RING_SIZE 256
> > > #define MT_MCU_RING_SIZE 32
> > > #define MT_RX_BUF_SIZE 2048
> > > +#define MT_SKB_HEAD_LEN 128
> > >
> > > struct mt76_dev;
> > > struct mt76_wcid;
> > > diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
> > > index bbaa1365bbda..12d60d31cb51 100644
> > > --- a/drivers/net/wireless/mediatek/mt76/usb.c
> > > +++ b/drivers/net/wireless/mediatek/mt76/usb.c
> > > @@ -429,6 +429,45 @@ static int mt76u_get_rx_entry_len(u8 *data, u32 data_len)
> > > return dma_len;
> > > }
> > >
> > > +static struct sk_buff *
> > > +mt76u_build_rx_skb(u8 *data, int len, int buf_size)
> > > +{
> > > + struct sk_buff *skb;
> > > +
> > > + if (SKB_WITH_OVERHEAD(buf_size) < MT_DMA_HDR_LEN + len) {
> > > + struct page *page;
> > > + int offset;
> > > +
> > > + /* slow path, not enough space for data and
> > > + * skb_shared_info
> > > + */
> > > + skb = alloc_skb(MT_SKB_HEAD_LEN, GFP_ATOMIC);
> > > + if (!skb)
> > > + return NULL;
> > > +
> > > + skb_put_data(skb, data + MT_DMA_HDR_LEN, MT_SKB_HEAD_LEN);
> >
> > I looked how rx amsdu is processed in mac80211 and it is decomposed and
> > copied into newly allocated individual skb's, see ieee80211_amsdu_to_8023s()
> >
> > So copy L3 & L4 headers doesn't do anything good here, actually seems to
> > be better to have them in frag as __ieee80211_amsdu_copy_frag() do some
> > magic to avid copy.
>
> Looking at __ieee80211_amsdu_copy() now I got why other drivers copy hdrlen +
> 8, thx :)
> In our case reuse_frag is true in __ieee80211_amsdu_copy, so we will end up
I don't think reuse_frag is true in our case since skb->head_frag is
not set when we use alloc_skb().
Stanislaw
On Fri, Jun 14, 2019 at 12:20:59PM +0200, Johannes Berg wrote:
> On Fri, 2019-06-14 at 12:11 +0200, Lorenzo Bianconi wrote:
>
> > Looking at __ieee80211_amsdu_copy() now I got why other drivers copy hdrlen +
> > 8, thx :)
> > In our case reuse_frag is true in __ieee80211_amsdu_copy, so we will end up
> > copying 32B + ether_len. Anyway I think 32 is a little bit too low and we could get
> > better performances increasing it a little bit.
> > A typical use case (e.g IPv6 + TCP):
> >
> > IPv6 = 40B, TCP = 32B --> so 72B..I guess 128B is a good value :)
> > @Felix, Johannes: what do you think?
>
> I think while we might *allocate* more, I don't think we should *copy*
> more, since then the TCP payload will no longer be in pages.
>
> It'd probably be better to implement leaving enough tailroom (allocate
> 128), but copying nothing, unless the *entire* packet fits.
iwl4965 puts the entire packet in a fragment in il4965_pass_packet_to_mac80211().
Initially I thought this was a bug, since mac80211 requires the header to be
in the linear area, but it looks like ieee80211_rx_monitor() copies the header
before the rest of mac80211 checks it, so 4965 is fine.
Anyway, I think the driver should put the ieee80211 header in the linear area,
and the iwlwifi & mt7601u implementation is somewhat optimal.
Stanislaw
On Fri, 2019-06-14 at 13:31 +0200, Stanislaw Gruszka wrote:
> On Fri, Jun 14, 2019 at 12:20:59PM +0200, Johannes Berg wrote:
> > On Fri, 2019-06-14 at 12:11 +0200, Lorenzo Bianconi wrote:
> >
> > > Looking at __ieee80211_amsdu_copy() now I got why other drivers copy hdrlen +
> > > 8, thx :)
> > > In our case reuse_frag is true in __ieee80211_amsdu_copy, so we will end up
> > > copying 32B + ether_len. Anyway I think 32 is a little bit too low and we could get
> > > better performances increasing it a little bit.
> > > A typical use case (e.g IPv6 + TCP):
> > >
> > > IPv6 = 40B, TCP = 32B --> so 72B..I guess 128B is a good value :)
> > > @Felix, Johannes: what do you think?
> >
> > I think while we might *allocate* more, I don't think we should *copy*
> > more, since then the TCP payload will no longer be in pages.
> >
> > It'd probably be better to implement leaving enough tailroom (allocate
> > 128), but copying nothing, unless the *entire* packet fits.
>
> iwl4965 put entire packet in fragment in il4965_pass_packet_to_mac80211() .
> Initially I thought this is a bug, since mac80211 require header be
> in the linear area, but looks like ieee80211_rx_monitor() copy header
> before rest of mac80211 check it, so 4965 is fine.
Mac80211 should not assume anything about header being present or not,
just like the rest of the network stack.
> Anyway I think the driver should put ieee80211 header in linear area
> and iwlwifi & mt7601u implementation is somewhat optimal.
Yes, it's just an optimisation to do the copy-break (copy small packets
completely) or to copy the header already (since we may have better ways
to do so than skb_copy_bits if we still have a virt pointer to the page,
rather than just the page pointer).
johannes
> On Fri, Jun 14, 2019 at 12:11:17PM +0200, Lorenzo Bianconi wrote:
> > > On Thu, Jun 13, 2019 at 11:43:11PM +0200, Lorenzo Bianconi wrote:
> > > > Commit f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes
> > > > for rx") breaks A-MSDU support. When A-MSDU is enable the device can
> > > > receive frames up to q->buf_size but they will be discarded in
> > > > mt76u_process_rx_entry since there is no enough room for
> > > > skb_shared_info. Fix the issue reallocating the skb and copying in the
> > > > linear area the first 128B of the received frames and in the frag_list
> > > > the remaining part.
> > > >
> > > > Fixes: f8f527b16db5 ("mt76: usb: use EP max packet aligned buffer sizes for rx")
> > > > Signed-off-by: Lorenzo Bianconi <[email protected]>
> > > > ---
> > > > drivers/net/wireless/mediatek/mt76/mt76.h | 1 +
> > > > drivers/net/wireless/mediatek/mt76/usb.c | 49 ++++++++++++++++++-----
> > > > 2 files changed, 41 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
> > > > index 8ecbf81a906f..889b76deb703 100644
> > > > --- a/drivers/net/wireless/mediatek/mt76/mt76.h
> > > > +++ b/drivers/net/wireless/mediatek/mt76/mt76.h
> > > > @@ -30,6 +30,7 @@
> > > > #define MT_TX_RING_SIZE 256
> > > > #define MT_MCU_RING_SIZE 32
> > > > #define MT_RX_BUF_SIZE 2048
> > > > +#define MT_SKB_HEAD_LEN 128
> > > >
> > > > struct mt76_dev;
> > > > struct mt76_wcid;
> > > > diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
> > > > index bbaa1365bbda..12d60d31cb51 100644
> > > > --- a/drivers/net/wireless/mediatek/mt76/usb.c
> > > > +++ b/drivers/net/wireless/mediatek/mt76/usb.c
> > > > @@ -429,6 +429,45 @@ static int mt76u_get_rx_entry_len(u8 *data, u32 data_len)
> > > > return dma_len;
> > > > }
> > > >
> > > > +static struct sk_buff *
> > > > +mt76u_build_rx_skb(u8 *data, int len, int buf_size)
> > > > +{
> > > > + struct sk_buff *skb;
> > > > +
> > > > + if (SKB_WITH_OVERHEAD(buf_size) < MT_DMA_HDR_LEN + len) {
> > > > + struct page *page;
> > > > + int offset;
> > > > +
> > > > + /* slow path, not enough space for data and
> > > > + * skb_shared_info
> > > > + */
> > > > + skb = alloc_skb(MT_SKB_HEAD_LEN, GFP_ATOMIC);
> > > > + if (!skb)
> > > > + return NULL;
> > > > +
> > > > + skb_put_data(skb, data + MT_DMA_HDR_LEN, MT_SKB_HEAD_LEN);
> > >
> > > I looked how rx amsdu is processed in mac80211 and it is decomposed and
> > > copied into newly allocated individual skb's, see ieee80211_amsdu_to_8023s()
> > >
> > > So copy L3 & L4 headers doesn't do anything good here, actually seems to
> > > be better to have them in frag as __ieee80211_amsdu_copy_frag() do some
> > > magic to avid copy.
> >
> > Looking at __ieee80211_amsdu_copy() now I got why other drivers copy hdrlen +
> > 8, thx :)
> > In our case reuse_frag is true in __ieee80211_amsdu_copy, so we will end up
>
> I don't think reuse_frag is true in our case since skb->head_frag is
> not set when we use alloc_skb().
Oh, right. In this case it is probably better to use netdev_alloc_skb().
I will repost using the approach used in mt7601u since for the moment it will
not make any difference to copy more data.
Regards,
Lorenzo
>
> Stanislaw
> On Fri, Jun 14, 2019 at 12:22:48PM +0200, Lorenzo Bianconi wrote:
> > > On Thu, Jun 13, 2019 at 11:43:13PM +0200, Lorenzo Bianconi wrote:
> > > > Set usb buffer size taking into account skb_shared_info in order to
> > > > not always copy the first part of received frames if A-MSDU is enabled
> > > > for SG capable devices. Moreover align usb buffer size to max_ep
> > > > boundaries and set buf_size to PAGE_SIZE even for sg case
> > >
> > > I think this should not be applied to wirless-drivers, only first patch
> > > that fix the bug and optimizations should be done in -next.
> >
> > ack, right. I think patch 2/3 and 3/3 can go directly in Felix's tree
> >
> > >
> > > > + int i, data_size;
> > > >
> > > > + data_size = rounddown(SKB_WITH_OVERHEAD(q->buf_size),
> > > > + dev->usb.in_ep[MT_EP_IN_PKT_RX].max_packet);
> > > > for (i = 0; i < nsgs; i++) {
> > > > struct page *page;
> > > > void *data;
> > > > @@ -302,7 +304,7 @@ mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
> > > >
> > > > page = virt_to_head_page(data);
> > > > offset = data - page_address(page);
> > > > - sg_set_page(&urb->sg[i], page, q->buf_size, offset);
> > > > + sg_set_page(&urb->sg[i], page, data_size, offset);
> > > <snip>
> > > > - q->buf_size = dev->usb.sg_en ? MT_RX_BUF_SIZE : PAGE_SIZE;
> > > > q->ndesc = MT_NUM_RX_ENTRIES;
> > > > + q->buf_size = PAGE_SIZE;
> > > > +
> > >
> > > This should be associated with decrease of MT_SG_MAX_SIZE to value that
> > > is actually needed and currently this is 2 for 4k AMSDU.
> >
> > MT_SG_MAX_SIZE is used even on tx side and I do not think we will end up with a
> > huge difference here
>
> So use different value as argument for mt76u_fill_rx_sg() in
> mt76u_rx_urb_alloc(). After changing buf_size to PAGE_SIZE we will
> allocate 8 pages per rx queue entry, but only 2 pages will be used
> (with data_size change, 1 without data_size change). Or I'm wrong?
yes, it is right (we will use two pages with data_size change). Maybe better to
use 4 pages for each rx queue entry? (otherwise we will probably change it in
the future)
>
> > > However I don't think allocating 2 pages to avoid ieee80211 header and SNAP
> > > copy is worth to do. For me best approach would be allocate 1 page for
> > > 4k AMSDU, 2 for 8k and 3 for 12k (still using sg, but without data_size
> > > change to avoid 32B copying).
> >
> > From my point of view it is better to avoid copying if it is possible. Are you
> > sure there is no difference?
>
> I do not understand what you mean by difference here.
tpt differences, not sure if there are any
Regards,
Lorenzo
>
> Stanislaw
On Fri, Jun 14, 2019 at 02:46:36PM +0200, Lorenzo Bianconi wrote:
> > >
> > > ack, right. I think patch 2/3 and 3/3 can go directly in Felix's tree
> > >
> > > >
> > > > > + int i, data_size;
> > > > >
> > > > > + data_size = rounddown(SKB_WITH_OVERHEAD(q->buf_size),
> > > > > + dev->usb.in_ep[MT_EP_IN_PKT_RX].max_packet);
> > > > > for (i = 0; i < nsgs; i++) {
> > > > > struct page *page;
> > > > > void *data;
> > > > > @@ -302,7 +304,7 @@ mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
> > > > >
> > > > > page = virt_to_head_page(data);
> > > > > offset = data - page_address(page);
> > > > > - sg_set_page(&urb->sg[i], page, q->buf_size, offset);
> > > > > + sg_set_page(&urb->sg[i], page, data_size, offset);
> > > > <snip>
> > > > > - q->buf_size = dev->usb.sg_en ? MT_RX_BUF_SIZE : PAGE_SIZE;
> > > > > q->ndesc = MT_NUM_RX_ENTRIES;
> > > > > + q->buf_size = PAGE_SIZE;
> > > > > +
> > > >
> > > > This should be associated with decrease of MT_SG_MAX_SIZE to value that
> > > > is actually needed and currently this is 2 for 4k AMSDU.
> > >
> > > MT_SG_MAX_SIZE is used even on tx side and I do not think we will end up with a
> > > huge difference here
> >
> > So use different value as argument for mt76u_fill_rx_sg() in
> > mt76u_rx_urb_alloc(). After changing buf_size to PAGE_SIZE we will
> > allocate 8 pages per rx queue entry, but only 2 pages will be used
> > (with data_size change, 1 without data_size change). Or I'm wrong?
>
> yes, it is right (we will use two pages with data_size change). Maybe better to
> use 4 pages for each rx queue entry? (otherwise we will probably change it in
> the future)
We should not allocate more than is required. If support for bigger
rx AMSDUs is added and announced in vht/ht capabilities to remote
stations, then an increase in the number of segments will be needed.
> > > > However I don't think allocating 2 pages to avoid ieee80211 header and SNAP
> > > > copy is worth to do. For me best approach would be allocate 1 page for
> > > > 4k AMSDU, 2 for 8k and 3 for 12k (still using sg, but without data_size
> > > > change to avoid 32B copying).
> > >
> > > From my point of view it is better to avoid copying if it is possible. Are you
> > > sure there is no difference?
> >
> > I do not understand what you mean by difference here.
>
> tpt differences, not sure if there are any
I would not expect any measurable difference in tpt nor in cpu usage
either way.
But I think, if some AMSDU subframe is split into two fragments, the
data will most likely need to be linearised/copied at some point before
being passed to the application, which will outweigh any benefit of avoiding
copying the 802.11 header. Though, I don't think this will be visible in
benchmarking.
Stanislaw
>
> On Fri, Jun 14, 2019 at 12:20:59PM +0200, Johannes Berg wrote:
> > On Fri, 2019-06-14 at 12:11 +0200, Lorenzo Bianconi wrote:
> >
> > > Looking at __ieee80211_amsdu_copy() now I got why other drivers copy hdrlen +
> > > 8, thx :)
> > > In our case reuse_frag is true in __ieee80211_amsdu_copy, so we will end up
> > > copying 32B + ether_len. Anyway I think 32 is a little bit too low and we could get
> > > better performances increasing it a little bit.
> > > A typical use case (e.g IPv6 + TCP):
> > >
> > > IPv6 = 40B, TCP = 32B --> so 72B..I guess 128B is a good value :)
> > > @Felix, Johannes: what do you think?
> >
> > I think while we might *allocate* more, I don't think we should *copy*
> > more, since then the TCP payload will no longer be in pages.
> >
> > It'd probably be better to implement leaving enough tailroom (allocate
> > 128), but copying nothing, unless the *entire* packet fits.
>
> iwl4965 put entire packet in fragment in il4965_pass_packet_to_mac80211() .
> Initially I thought this is a bug, since mac80211 require header be
> in the linear area, but looks like ieee80211_rx_monitor() copy header
> before rest of mac80211 check it, so 4965 is fine.
>
> Anyway I think the driver should put ieee80211 header in linear area
> and iwlwifi & mt7601u implementation is somewhat optimal.
Actually the case is a little bit different for mt76 since we need
even mt76x02_rxwi in the linear area of the received skb.
Taking that into account the requested size to copy will be:
32 + 802.11 hdr + SNAP hdr = ~ 70B
Moreover to pass rxwi size to usb module we need to add a field in
mt76_driver_ops (e.g rxwi_size).
I will carry out some tests and if there are no differences I will
post a single patch to wireless-drivers using 128B as default size
Regards,
Lorenzo
>
> Stanislaw
> On Fri, Jun 14, 2019 at 02:46:36PM +0200, Lorenzo Bianconi wrote:
> > > >
> > > > ack, right. I think patch 2/3 and 3/3 can go directly in Felix's tree
> > > >
> > > > >
> > > > > > + int i, data_size;
> > > > > >
> > > > > > + data_size = rounddown(SKB_WITH_OVERHEAD(q->buf_size),
> > > > > > + dev->usb.in_ep[MT_EP_IN_PKT_RX].max_packet);
> > > > > > for (i = 0; i < nsgs; i++) {
> > > > > > struct page *page;
> > > > > > void *data;
> > > > > > @@ -302,7 +304,7 @@ mt76u_fill_rx_sg(struct mt76_dev *dev, struct mt76_queue *q, struct urb *urb,
> > > > > >
> > > > > > page = virt_to_head_page(data);
> > > > > > offset = data - page_address(page);
> > > > > > - sg_set_page(&urb->sg[i], page, q->buf_size, offset);
> > > > > > + sg_set_page(&urb->sg[i], page, data_size, offset);
> > > > > <snip>
> > > > > > - q->buf_size = dev->usb.sg_en ? MT_RX_BUF_SIZE : PAGE_SIZE;
> > > > > > q->ndesc = MT_NUM_RX_ENTRIES;
> > > > > > + q->buf_size = PAGE_SIZE;
> > > > > > +
> > > > >
> > > > > This should be associated with decrease of MT_SG_MAX_SIZE to value that
> > > > > is actually needed and currently this is 2 for 4k AMSDU.
> > > >
> > > > MT_SG_MAX_SIZE is used even on tx side and I do not think we will end up with a
> > > > huge difference here
> > >
> > > So use different value as argument for mt76u_fill_rx_sg() in
> > > mt76u_rx_urb_alloc(). After changing buf_size to PAGE_SIZE we will
> > > allocate 8 pages per rx queue entry, but only 2 pages will be used
> > > (with data_size change, 1 without data_size change). Or I'm wrong?
> >
> > yes, it is right (we will use two pages with data_size change). Maybe better to
> > use 4 pages for each rx queue entry? (otherwise we will probably change it in
> > the future)
>
> We should not allocate more than is required. If support for bigger
> rx AMSDUs will be added and announced in vht/ht capabilities to remote
> stations, then increase of number of segments will be needed.
>
> > > > > However I don't think allocating 2 pages to avoid ieee80211 header and SNAP
> > > > > copy is worth to do. For me best approach would be allocate 1 page for
> > > > > 4k AMSDU, 2 for 8k and 3 for 12k (still using sg, but without data_size
> > > > > change to avoid 32B copying).
> > > >
> > > > From my point of view it is better to avoid copying if it is possible. Are you
> > > > sure there is no difference?
> > >
> > > I do not understand what you mean by difference here.
> >
> > tpt differences, not sure if there are any
>
> I would not expect any measurable difference in tpt nor in cpu usage
> either way.
>
> But I think, if some AMSDU subframe will be spited into two fragments,
> data most likely will need to be linearised/copied, at some point before
> passed to application, what will overcome any benefit of avoiding coping
> 802.11 header. Thought, I don't think this somehow will be visible in
> benchmarking.
Sorry for the late reply. I think so.
I will post a v4 soon.
Regards,
Lorenzo
>
> Stanislaw