2016-02-05 10:41:11

by Felix Fietkau

[permalink] [raw]
Subject: [RFC v2] mac80211: add A-MSDU tx support

Requires software tx queueing support. frag_list support (for zero-copy)
is optional.

Signed-off-by: Felix Fietkau <[email protected]>
---
include/net/mac80211.h | 14 +++++
net/mac80211/agg-tx.c | 5 ++
net/mac80211/debugfs.c | 2 +
net/mac80211/ieee80211_i.h | 1 +
net/mac80211/tx.c | 151 +++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 173 insertions(+)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5714774..f50bbd2 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -709,6 +709,7 @@ enum mac80211_tx_info_flags {
* @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
* frame (PS-Poll or uAPSD).
* @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
+ * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
*
* These flags are used in tx_info->control.flags.
*/
@@ -716,6 +717,7 @@ enum mac80211_tx_control_flags {
IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0),
IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1),
IEEE80211_TX_CTRL_RATE_INJECT = BIT(2),
+ IEEE80211_TX_CTRL_AMSDU = BIT(3),
};

/*
@@ -1964,6 +1966,12 @@ struct ieee80211_txq {
* order and does not need to manage its own reorder buffer or BA session
* timeout.
*
+ * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
+ * A-MSDU frames. Requires software tx queueing support.
+ *
+ * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
+ * skbs, needed for zero-copy software A-MSDU.
+ *
* @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
*/
enum ieee80211_hw_flags {
@@ -2001,6 +2009,8 @@ enum ieee80211_hw_flags {
IEEE80211_HW_BEACON_TX_STATUS,
IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
+ IEEE80211_HW_TX_AMSDU,
+ IEEE80211_HW_TX_FRAG_LIST,

/* keep last, obviously */
NUM_IEEE80211_HW_FLAGS
@@ -2073,6 +2083,9 @@ enum ieee80211_hw_flags {
* size is smaller (an example is LinkSys WRT120N with FW v1.0.07
* build 002 Jun 18 2012).
*
+ * @max_tx_amsdu_subframes: maximum number of subframes used in software
+ * A-MSDU aggregation
+ *
* @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
* (if %IEEE80211_HW_QUEUE_CONTROL is set)
*
@@ -2127,6 +2140,7 @@ struct ieee80211_hw {
u8 max_rate_tries;
u8 max_rx_aggregation_subframes;
u8 max_tx_aggregation_subframes;
+ u8 max_tx_amsdu_subframes;
u8 offchannel_tx_hw_queue;
u8 radiotap_mcs_details;
u16 radiotap_vht_details;
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 4932e9f..42fa810 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
size_t len)
{
struct tid_ampdu_tx *tid_tx;
+ struct ieee80211_txq *txq;
u16 capab, tid;
u8 buf_size;
bool amsdu;
@@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);

+ txq = sta->sta.txq[tid];
+ if (!amsdu && txq)
+ set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
+
mutex_lock(&sta->ampdu_mlme.mtx);

tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index e433d0c..847779d 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -127,6 +127,8 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
FLAG(BEACON_TX_STATUS),
FLAG(NEEDS_UNIQUE_STA_ADDR),
FLAG(SUPPORTS_REORDERING_BUFFER),
+ FLAG(TX_AMSDU),
+ FLAG(TX_FRAG_LIST),

/* keep last for the build bug below */
(void *)0x1
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a49c103..e68d8db 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -799,6 +799,7 @@ struct mac80211_qos_map {
enum txq_info_flags {
IEEE80211_TXQ_STOP,
IEEE80211_TXQ_AMPDU,
+ IEEE80211_TXQ_NO_AMSDU,
};

struct txq_info {
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a5aa275..f681048 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
out:
spin_unlock_bh(&txqi->queue.lock);

+ if (skb && skb_has_frag_list(skb) &&
+ !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+ skb_linearize(skb);
+
return skb;
}
EXPORT_SYMBOL(ieee80211_tx_dequeue);
@@ -2763,6 +2767,149 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
kfree_rcu(fast_tx, rcu_head);
}

+static int ieee80211_amsdu_pad(struct sk_buff *skb, int subframe_len)
+{
+ int amsdu_len = subframe_len + sizeof(struct ethhdr);
+ int padding = (4 - amsdu_len) & 3;
+
+ if (padding)
+ memset(skb_put(skb, padding), 0, padding);
+
+ return padding;
+}
+
+static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_fast_tx *fast_tx,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr;
+ struct ethhdr amsdu_hdr;
+ int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
+ int subframe_len = skb->len - hdr_len;
+ void *data;
+ u8 *qc;
+
+ if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
+ return true;
+
+ if (skb_headroom(skb) < sizeof(amsdu_hdr) || skb_tailroom(skb) < 3) {
+ I802_DEBUG_INC(local->tx_expand_skb_head);
+
+ if (pskb_expand_head(skb, sizeof(amsdu_hdr), 3, GFP_ATOMIC)) {
+ wiphy_debug(local->hw.wiphy,
+ "failed to reallocate TX buffer\n");
+ return false;
+ }
+ }
+
+ subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
+
+ amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
+ memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
+ memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+
+ data = skb_push(skb, sizeof(amsdu_hdr));
+ memmove(data, data + sizeof(amsdu_hdr), hdr_len);
+ memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+
+ hdr = data;
+ qc = ieee80211_get_qos_ctl(hdr);
+ *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+
+ info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
+
+ return true;
+}
+
+static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_fast_tx *fast_tx,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ struct ieee80211_txq *txq = sta->sta.txq[tid];
+ struct txq_info *txqi;
+ struct sk_buff **frag_tail, *head;
+ int subframe_len = skb->len - ETH_ALEN;
+ int max_amsdu_len;
+ __be16 len;
+ void *data;
+ bool ret = false;
+ int n = 1;
+
+ if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
+ return false;
+
+ if (!txq)
+ return false;
+
+ txqi = to_txq_info(txq);
+ if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
+ return false;
+
+ /*
+ * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
+ * sessions are started/stopped without txq flush, use the limit here
+ * to avoid having to de-aggregate later.
+ */
+ max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);
+
+ spin_lock_bh(&txqi->queue.lock);
+
+ head = skb_peek_tail(&txqi->queue);
+ if (!head)
+ goto out;
+
+ if (skb->len + head->len > max_amsdu_len)
+ goto out;
+
+ if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
+ goto out;
+
+ frag_tail = &skb_shinfo(head)->frag_list;
+ while (*frag_tail) {
+ frag_tail = &(*frag_tail)->next;
+ n++;
+ }
+
+ if (local->hw.max_tx_amsdu_subframes &&
+ n > local->hw.max_tx_amsdu_subframes)
+ goto out;
+
+ if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 3) {
+ I802_DEBUG_INC(local->tx_expand_skb_head);
+
+ if (pskb_expand_head(skb, 8, 3, GFP_ATOMIC)) {
+ wiphy_debug(local->hw.wiphy,
+ "failed to reallocate TX buffer\n");
+ goto out;
+ }
+ }
+
+ subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
+
+ ret = true;
+ data = skb_push(skb, ETH_ALEN + 2);
+ memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
+
+ data += 2 * ETH_ALEN;
+ len = cpu_to_be16(subframe_len);
+ memcpy(data, &len, 2);
+ memcpy(data + 2, rfc1042_header, ETH_ALEN);
+
+ head->len += skb->len;
+ head->data_len += skb->len;
+ *frag_tail = skb;
+
+out:
+ spin_unlock_bh(&txqi->queue.lock);
+
+ return ret;
+}
+
static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
struct net_device *dev, struct sta_info *sta,
struct ieee80211_fast_tx *fast_tx,
@@ -2817,6 +2964,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,

ieee80211_tx_stats(dev, skb->len + extra_head);

+ if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
+ ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
+ return true;
+
/* will not be crypto-handled beyond what we do here, so use false
* as the may-encrypt argument for the resize to not account for
* more room than we already have in 'extra_head'
--
2.2.2



2016-02-07 13:21:54

by Felix Fietkau

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On 2016-02-07 12:56, Emmanuel Grumbach wrote:
>>> well.. Yes, you can't assume that you'll have one descriptor for one
>>> MSDU payload (unless the driver doesn't advertise SG to the
>>> netstack).
>> Okay, please make a suggestion describing the exact kinds of limits you
>> would need for iwlwifi.
>
> Are athX devices able to handle MPDUs with any number of frags? Say if
> you have 30 different physically contiguous fragments, the DMA would
> be able to load all these into one single packet and send it to the
> air?
I think athX devices have no limitations there. I'm not testing this
with atheros devices though - ath9k does not have mac80211
per-sta-per-tid queueing support yet. I'm working with MediaTek MT76x2
chipsets with my mt76 driver, which I will upstream soon.

> iwlwifi currently has the limitation of 20 Transmit Buffers (BTs)
> which I mentioned earlier. I guess it'd be nice if the driver would be
> able to advertise how many fragments it can handle. Then, you'd need
> to stop the A-MSDU building if you'd cross this boundary?
>
> You can look at skb_shinfo(skb)->nr_frags to know how many frags you
> have for each skb. On top of that, you need 1 frag for each subframe
> (subframe header).
I implemented all of your suggestions in RFC v3, let me know if
anything's missing.

- Felix

2016-02-07 07:25:33

by Emmanuel Grumbach

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On Fri, Feb 5, 2016 at 12:41 PM, Felix Fietkau <[email protected]> wrote:
> Requires software tx queueing support. frag_list support (for zero-copy)
> is optional.
>
> Signed-off-by: Felix Fietkau <[email protected]>

Looks nice!
This would allow us to create aggregates of TCP Acks, the problem is
that when you are mostly receiving data, the hardware queues are
pretty much empty (nothing besides the TCP Acks which should go out
quickly) so that packets don't pile up in the software queues and
hence you don't have enough material to build an A-MSDU.
I guess that for AP oriented devices, this is ideal solution since you
can't rely on TSO (packets are not locally generated) and this allows
to build an A-MSDU without adding more latency since you build an
A-MSDU with packets that are already in the queue waiting instead of
delaying transmission of the first packet.
IIRC, the latter was the approach chose by the new Marvell driver
posted a few weeks ago. This approach is better in my eyes.
For iwlwifi which is much more station oriented (of GO which is
basically an AP with locally generated traffic), I took the TSO
approach. I guess we could try to change iwlwifi to use your tx
queues, and check how that works. This would allow us to have A-MSDU
on bridged traffic as well, although this use case is much less common
for Intel devices.

One small question below.

[snip]

> +
> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
> + struct sta_info *sta,
> + struct ieee80211_fast_tx *fast_tx,
> + struct sk_buff *skb)
> +{
> + struct ieee80211_local *local = sdata->local;
> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
> + struct ieee80211_txq *txq = sta->sta.txq[tid];
> + struct txq_info *txqi;
> + struct sk_buff **frag_tail, *head;
> + int subframe_len = skb->len - ETH_ALEN;
> + int max_amsdu_len;
> + __be16 len;
> + void *data;
> + bool ret = false;
> + int n = 1;
> +
> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
> + return false;
> +
> + if (!txq)
> + return false;
> +
> + txqi = to_txq_info(txq);
> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
> + return false;
> +
> + /*
> + * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
> + * sessions are started/stopped without txq flush, use the limit here
> + * to avoid having to de-aggregate later.
> + */
> + max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);

So you can't get 10K A-MSDUs? I don't see where you check that you
have an A-MPDU session here. You seem to be applying the 4095 limit
also for streams that are not an A-MPDU?
I guess you could check if the sta is a VHT peer, in that case, no
limit applies.

> +
> + spin_lock_bh(&txqi->queue.lock);
> +
> + head = skb_peek_tail(&txqi->queue);
> + if (!head)
> + goto out;
> +
> + if (skb->len + head->len > max_amsdu_len)
> + goto out;
> +
> + if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
> + goto out;
> +
> + frag_tail = &skb_shinfo(head)->frag_list;
> + while (*frag_tail) {
> + frag_tail = &(*frag_tail)->next;
> + n++;
> + }
> +
> + if (local->hw.max_tx_amsdu_subframes &&
> + n > local->hw.max_tx_amsdu_subframes)
> + goto out;
> +
> + if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 3) {
> + I802_DEBUG_INC(local->tx_expand_skb_head);
> +
> + if (pskb_expand_head(skb, 8, 3, GFP_ATOMIC)) {
> + wiphy_debug(local->hw.wiphy,
> + "failed to reallocate TX buffer\n");
> + goto out;
> + }
> + }
> +
> + subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
> +
> + ret = true;
> + data = skb_push(skb, ETH_ALEN + 2);
> + memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
> +
> + data += 2 * ETH_ALEN;
> + len = cpu_to_be16(subframe_len);
> + memcpy(data, &len, 2);
> + memcpy(data + 2, rfc1042_header, ETH_ALEN);
> +
> + head->len += skb->len;
> + head->data_len += skb->len;
> + *frag_tail = skb;
> +
> +out:
> + spin_unlock_bh(&txqi->queue.lock);
> +
> + return ret;
> +}
> +
> static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
> struct net_device *dev, struct sta_info *sta,
> struct ieee80211_fast_tx *fast_tx,
> @@ -2817,6 +2964,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
>
> ieee80211_tx_stats(dev, skb->len + extra_head);
>
> + if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
> + ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
> + return true;
> +
> /* will not be crypto-handled beyond what we do here, so use false
> * as the may-encrypt argument for the resize to not account for
> * more room than we already have in 'extra_head'
> --
> 2.2.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2016-02-07 10:06:52

by Emmanuel Grumbach

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On Sun, Feb 7, 2016 at 11:08 AM, Felix Fietkau <[email protected]> wrote:
> On 2016-02-07 08:25, Emmanuel Grumbach wrote:
>> On Fri, Feb 5, 2016 at 12:41 PM, Felix Fietkau <[email protected]> wrote:
>>> Requires software tx queueing support. frag_list support (for zero-copy)
>>> is optional.
>>>
>>> Signed-off-by: Felix Fietkau <[email protected]>
>>
>> Looks nice!
>> This would allow us to create aggregates of TCP Acks, the problem is
>> that when you are mostly receiving data, the hardware queues are
>> pretty much empty (nothing besides the TCP Acks which should go out
>> quickly) so that packets don't pile up in the software queues and
>> hence you don't have enough material to build an A-MSDU.
>> I guess that for AP oriented devices, this is ideal solution since you
>> can't rely on TSO (packets are not locally generated) and this allows
>> to build an A-MSDU without adding more latency since you build an
>> A-MSDU with packets that are already in the queue waiting instead of
>> delaying transmission of the first packet.
>> IIRC, the latter was the approach chose by the new Marvell driver
>> posted a few weeks ago. This approach is better in my eyes.
>> For iwlwifi which is much more station oriented (of GO which is
>> basically an AP with locally generated traffic), I took the TSO
>> approach. I guess we could try to change iwlwifi to use your tx
>> queues, and check how that works. This would allow us to have A-MSDU
>> on bridged traffic as well, although this use case is much less common
>> for Intel devices.


> Can the iwlwifi firmware maintain per-sta per-tid queues? Because that
> way you would get the most benefits from using that tx queueing
> infrastructure.

Well... iwlwifi and athXk are very different. iwlwifi really has the
concept of queues and not flat descriptors.
Any Tx descriptor lives in the context of a Tx queue which is
per-sta-per tid when A-MPDUs is enabled.
We are now moving to a scheme in which we will have per-sta-per-tid
queue regardless of the A-MPDU state which will make much sense to tie
to the tx queueing infrastructure.
Thing is that in that case I am afraid we will not have enough packets
in the software tx queue to get A-MSDUs from your code. with TSO, it
is easier :) Still worth trying to work with this instead of TSO and
see how it goes. That won't happen anytime soon though.

>
>>> +
>>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>>> + struct sta_info *sta,
>>> + struct ieee80211_fast_tx *fast_tx,
>>> + struct sk_buff *skb)
>>> +{
>>> + struct ieee80211_local *local = sdata->local;
>>> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>>> + struct ieee80211_txq *txq = sta->sta.txq[tid];
>>> + struct txq_info *txqi;
>>> + struct sk_buff **frag_tail, *head;
>>> + int subframe_len = skb->len - ETH_ALEN;
>>> + int max_amsdu_len;
>>> + __be16 len;
>>> + void *data;
>>> + bool ret = false;
>>> + int n = 1;
>>> +
>>> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>>> + return false;
>>> +
>>> + if (!txq)
>>> + return false;
>>> +
>>> + txqi = to_txq_info(txq);
>>> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>>> + return false;
>>> +
>>> + /*
>>> + * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>>> + * sessions are started/stopped without txq flush, use the limit here
>>> + * to avoid having to de-aggregate later.
>>> + */
>>> + max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);
>>
>> So you can't get 10K A-MSDUs? I don't see where you check that you
>> have an A-MPDU session here. You seem to be applying the 4095 limit
>> also for streams that are not an A-MPDU?
>> I guess you could check if the sta is a VHT peer, in that case, no
>> limit applies.
> The explanation for the missing A-MPDU change is in that comment -
> checking for an active A-MPDU session would make it unnecessarily complex.
> Good point about checking for VHT capabilities to remove this limit, I
> will add that.
>
> - Felix

2016-02-07 11:56:32

by Emmanuel Grumbach

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On Sun, Feb 7, 2016 at 1:49 PM, Felix Fietkau <[email protected]> wrote:
> On 2016-02-07 12:32, Emmanuel Grumbach wrote:
>>>>>>>> +
>>>>>>>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>>>>>>>> + struct sta_info *sta,
>>>>>>>> + struct ieee80211_fast_tx *fast_tx,
>>>>>>>> + struct sk_buff *skb)
>>>>>>>> +{
>>>>>>>> + struct ieee80211_local *local = sdata->local;
>>>>>>>> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>>>>>>>> + struct ieee80211_txq *txq = sta->sta.txq[tid];
>>>>>>>> + struct txq_info *txqi;
>>>>>>>> + struct sk_buff **frag_tail, *head;
>>>>>>>> + int subframe_len = skb->len - ETH_ALEN;
>>>>>>>> + int max_amsdu_len;
>>>>>>>> + __be16 len;
>>>>>>>> + void *data;
>>>>>>>> + bool ret = false;
>>>>>>>> + int n = 1;
>>>>>>>> +
>>>>>>>> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + if (!txq)
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + txqi = to_txq_info(txq);
>>>>>>>> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + /*
>>>>>>>> + * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>>>>>>>> + * sessions are started/stopped without txq flush, use the limit here
>>>>>>>> + * to avoid having to de-aggregate later.
>>>>>>>> + */
>>>>>>>> + max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);
>>>>>>>
>>>>>>> So you can't get 10K A-MSDUs? I don't see where you check that you
>>>>>>> have an A-MPDU session here. You seem to be applying the 4095 limit
>>>>>>> also for streams that are not an A-MPDU?
>>>>>>> I guess you could check if the sta is a VHT peer, in that case, no
>>>>>>> limit applies.
>>>>>> The explanation for the missing A-MPDU change is in that comment -
>>>>>> checking for an active A-MPDU session would make it unnecessarily complex.
>>>>>> Good point about checking for VHT capabilities to remove this limit, I
>>>>>> will add that.
>>>>
>>>> Yes - I read the comment, but it seemed very sub-optimal to limit all
>>>> the A-MSDUs to 4K. With TSO I can get up to 10K and it really helps
>>>> TPT.
>>> This was built with the assumption that most scenarios use A-MPDU anyway
>>> and thus don't need really large A-MSDUs.
>>
>> Yes - so that's interesting. We can chose to have long A-MSDUs inside
>> a short (in terms of number of MPDUs) A-MPDU, of with shorter A-MSDU
>> and squeeze more of these into a single A-MDPU.
>> The first intuition says that we'd better have more MPDUs because of
>> the CRC check for each MPDU which doesn't exist in A-MSDU. OTOH, I
>> remember I could clearly see that I get a higher TPT with longer
>> A-MSDUs. Maybe I wasn't looking right at the size of the A-MPDU? I
>> guess I'd need to go back to the table with all the values we had, but
>> since we pretty much got what we wanted, I am not sure I will able to
>> find time for this :)
> I think it also depends on the environment. I'd guess that under very
> ideal conditions with very few retransmissions, really long A-MSDU might
> have some performance benefits, but I don't think that'll hold if the
> conditions are less than ideal and you have rate fluctuation and
> retransmissions.
>
>>>> One more point. In VHT, there may be a limit on the numbers of
>>>> subframes in the A-MSDU. I don't see you handle that. Maybe I missed
>>>> it?
>>> I haven't looked at that much yet. Right now the driver can only specify
>>> a limit for the number of subframes.
>>
>> I am talking about a limitation the peer can advertise. Check this out:
>> https://git.kernel.org/cgit/linux/kernel/git/jberg/mac80211-next.git/tree/net/mac80211/cfg.c#n1134
>>
>> I couldn't see the limit the driver can specify in your code. I may
>> very well have missed it.
> I missed that one. Will add it in the next patch.
>
>>>> This is an order 4 allocation, for each A-MSDU. Note
>>>> that iwlwifi (and probably other drivers) can handle gather DMA in Tx,
>>>> but they have a limited number of frags they can handle. iwlwifi e.g.
>>>> can handle up to 20 frags, but 3 are taken for "paperwork". You'll
>>>> have 2 frags per subframe at least (assuming that each subframe's
>>>> payload is nicely contiguous and not on a page boundary). I think that
>>>> it may be worthwhile to ask the driver how many frags it is supposed
>>>> to handle. I can't promise iwlwifi will use it, but I guess it will be
>>>> useful for someone.
>>> You mean an extra frag limit in addition to the driver subframe limit,
>>> in case individual subframes are fragmented as well?
>>>
>>
>> well.. Yes, you can't assume that you'll have one descriptor for one
>> MSDU payload (unless the driver doesn't advertise SG to the
>> netstack).
> Okay, please make a suggestion describing the exact kinds of limits you
> would need for iwlwifi.

Are athX devices able to handle MPDUs with any number of frags? Say if
you have 30 different physically contiguous fragments, the DMA would
be able to load all these into one single packet and send it to the
air?
iwlwifi currently has the limitation of 20 Transmit Buffers (BTs)
which I mentioned earlier. I guess it'd be nice if the driver would be
able to advertise how many fragments it can handle. Then, you'd need
to stop the A-MSDU building if you'd cross this boundary?

You can look at skb_shinfo(skb)->nr_frags to know how many frags you
have for each skb. On top of that, you need 1 frag for each subframe
(subframe header).

2016-02-07 11:32:46

by Emmanuel Grumbach

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On Sun, Feb 7, 2016 at 12:48 PM, Felix Fietkau <[email protected]> wrote:
> On 2016-02-07 11:22, Emmanuel Grumbach wrote:
>> On Sun, Feb 7, 2016 at 12:06 PM, Emmanuel Grumbach <[email protected]> wrote:
>>> On Sun, Feb 7, 2016 at 11:08 AM, Felix Fietkau <[email protected]> wrote:
>>>> On 2016-02-07 08:25, Emmanuel Grumbach wrote:
>>>>> On Fri, Feb 5, 2016 at 12:41 PM, Felix Fietkau <[email protected]> wrote:
>>>>>> Requires software tx queueing support. frag_list support (for zero-copy)
>>>>>> is optional.
>>>>>>
>>>>>> Signed-off-by: Felix Fietkau <[email protected]>
>>>>>
>>>>> Looks nice!
>>>>> This would allow us to create aggregates of TCP Acks, the problem is
>>>>> that when you are mostly receiving data, the hardware queues are
>>>>> pretty much empty (nothing besides the TCP Acks which should go out
>>>>> quickly) so that packets don't pile up in the software queues and
>>>>> hence you don't have enough material to build an A-MSDU.
>>>>> I guess that for AP oriented devices, this is ideal solution since you
>>>>> can't rely on TSO (packets are not locally generated) and this allows
>>>>> to build an A-MSDU without adding more latency since you build an
>>>>> A-MSDU with packets that are already in the queue waiting instead of
>>>>> delaying transmission of the first packet.
>>>>> IIRC, the latter was the approach chose by the new Marvell driver
>>>>> posted a few weeks ago. This approach is better in my eyes.
>>>>> For iwlwifi which is much more station oriented (of GO which is
>>>>> basically an AP with locally generated traffic), I took the TSO
>>>>> approach. I guess we could try to change iwlwifi to use your tx
>>>>> queues, and check how that works. This would allow us to have A-MSDU
>>>>> on bridged traffic as well, although this use case is much less common
>>>>> for Intel devices.
>>>
>>>
>>>> Can the iwlwifi firmware maintain per-sta per-tid queues? Because that
>>>> way you would get the most benefits from using that tx queueing
>>>> infrastructure.
>>>
>>> Well... iwlwifi and athXk are very different. iwlwifi really has the
>>> concept of queues and not flat descriptors.
>>> Any Tx descriptor lives in the context of a Tx queue which is
>>> per-sta-per tid when A-MPDUs is enabled.
>>> We are now moving to a scheme in which we will have per-sta-per-tid
>>> queue regardless of the A-MPDU state which will make much sense to tie
>>> to the tx queueing infrastructure.
>>> Thing is that in that case I am afraid we will not have enough packets
>>> in the software tx queue to get A-MSDUs from your code. with TSO, it
>>> is easier :) Still worth trying to work with this instead of TSO and
>>> see how it goes. That won't happen anytime soon though.
>>>
>>>>
>>>>>> +
>>>>>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>>>>>> + struct sta_info *sta,
>>>>>> + struct ieee80211_fast_tx *fast_tx,
>>>>>> + struct sk_buff *skb)
>>>>>> +{
>>>>>> + struct ieee80211_local *local = sdata->local;
>>>>>> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>>>>>> + struct ieee80211_txq *txq = sta->sta.txq[tid];
>>>>>> + struct txq_info *txqi;
>>>>>> + struct sk_buff **frag_tail, *head;
>>>>>> + int subframe_len = skb->len - ETH_ALEN;
>>>>>> + int max_amsdu_len;
>>>>>> + __be16 len;
>>>>>> + void *data;
>>>>>> + bool ret = false;
>>>>>> + int n = 1;
>>>>>> +
>>>>>> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>>>>>> + return false;
>>>>>> +
>>>>>> + if (!txq)
>>>>>> + return false;
>>>>>> +
>>>>>> + txqi = to_txq_info(txq);
>>>>>> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>>>>>> + return false;
>>>>>> +
>>>>>> + /*
>>>>>> + * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>>>>>> + * sessions are started/stopped without txq flush, use the limit here
>>>>>> + * to avoid having to de-aggregate later.
>>>>>> + */
>>>>>> + max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);
>>>>>
>>>>> So you can't get 10K A-MSDUs? I don't see where you check that you
>>>>> have an A-MPDU session here. You seem to be applying the 4095 limit
>>>>> also for streams that are not an A-MPDU?
>>>>> I guess you could check if the sta is a VHT peer, in that case, no
>>>>> limit applies.
>>>> The explanation for the missing A-MPDU change is in that comment -
>>>> checking for an active A-MPDU session would make it unnecessarily complex.
>>>> Good point about checking for VHT capabilities to remove this limit, I
>>>> will add that.
>>
>> Yes - I read the comment, but it seemed very sub-optimal to limit all
>> the A-MSDUs to 4K. With TSO I can get up to 10K and it really helps
>> TPT.
> This was built with the assumption that most scenarios use A-MPDU anyway
> and thus don't need really large A-MSDUs.

Yes - so that's interesting. We can chose to have long A-MSDUs inside
a short (in terms of number of MPDUs) A-MPDU, of with shorter A-MSDU
and squeeze more of these into a single A-MDPU.
The first intuition says that we'd better have more MPDUs because of
the CRC check for each MPDU which doesn't exist in A-MSDU. OTOH, I
remember I could clearly see that I get a higher TPT with longer
A-MSDUs. Maybe I wasn't looking right at the size of the A-MPDU? I
guess I'd need to go back to the table with all the values we had, but
since we pretty much got what we wanted, I am not sure I will able to
find time for this :)

>
>> One more point. In VHT, there may be a limit on the numbers of
>> subframes in the A-MSDU. I don't see you handle that. Maybe I missed
>> it?
> I haven't looked at that much yet. Right now the driver can only specify
> a limit for the number of subframes.

I am talking about a limitation the peer can advertise. Check this out:
https://git.kernel.org/cgit/linux/kernel/git/jberg/mac80211-next.git/tree/net/mac80211/cfg.c#n1134

I couldn't see the limit the driver can specify in your code. I may
very well have missed it.

>
>> And... in case the driver doesn't handle frag_list, you linearize the
>> skb which is pretty much the only thing you can do at this stage. But,
>> when you'll lift the 4095 bytes limit, you'll get 11K A-MSDU,
>> linarizing such a long packet is really putting the memory manager
>> under pressure.
> I added no-frag_list support primarily for debugging purposes, it's not
> supposed to perform well.

ok.

>
>> This is an order 4 allocation, for each A-MSDU. Note
>> that iwlwifi (and probably other drivers) can handle gather DMA in Tx,
>> but they have a limited number of frags they can handle. iwlwifi e.g.
>> can handle up to 20 frags, but 3 are taken for "paperwork". You'll
>> have 2 frags per subframe at least (assuming that each subframe's
>> payload is nicely contiguous and not on a page boundary). I think that
>> it may be worthwhile to ask the driver how many frags it is supposed
>> to handle. I can't promise iwlwifi will use it, but I guess it will be
>> useful for someone.
> You mean an extra frag limit in addition to the driver subframe limit,
> in case individual subframes are fragmented as well?
>

well.. Yes, you can't assume that you'll have one descriptor for one
MSDU payload (unless the driver doesn't advertise SG to the
netstack).

> - Felix

2016-02-07 10:49:01

by Felix Fietkau

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On 2016-02-07 11:22, Emmanuel Grumbach wrote:
> On Sun, Feb 7, 2016 at 12:06 PM, Emmanuel Grumbach <[email protected]> wrote:
>> On Sun, Feb 7, 2016 at 11:08 AM, Felix Fietkau <[email protected]> wrote:
>>> On 2016-02-07 08:25, Emmanuel Grumbach wrote:
>>>> On Fri, Feb 5, 2016 at 12:41 PM, Felix Fietkau <[email protected]> wrote:
>>>>> Requires software tx queueing support. frag_list support (for zero-copy)
>>>>> is optional.
>>>>>
>>>>> Signed-off-by: Felix Fietkau <[email protected]>
>>>>
>>>> Looks nice!
>>>> This would allow us to create aggregates of TCP Acks, the problem is
>>>> that when you are mostly receiving data, the hardware queues are
>>>> pretty much empty (nothing besides the TCP Acks which should go out
>>>> quickly) so that packets don't pile up in the software queues and
>>>> hence you don't have enough material to build an A-MSDU.
>>>> I guess that for AP oriented devices, this is ideal solution since you
>>>> can't rely on TSO (packets are not locally generated) and this allows
>>>> to build an A-MSDU without adding more latency since you build an
>>>> A-MSDU with packets that are already in the queue waiting instead of
>>>> delaying transmission of the first packet.
>>>> IIRC, the latter was the approach chose by the new Marvell driver
>>>> posted a few weeks ago. This approach is better in my eyes.
>>>> For iwlwifi which is much more station oriented (of GO which is
>>>> basically an AP with locally generated traffic), I took the TSO
>>>> approach. I guess we could try to change iwlwifi to use your tx
>>>> queues, and check how that works. This would allow us to have A-MSDU
>>>> on bridged traffic as well, although this use case is much less common
>>>> for Intel devices.
>>
>>
>>> Can the iwlwifi firmware maintain per-sta per-tid queues? Because that
>>> way you would get the most benefits from using that tx queueing
>>> infrastructure.
>>
>> Well... iwlwifi and athXk are very different. iwlwifi really has the
>> concept of queues and not flat descriptors.
>> Any Tx descriptor lives in the context of a Tx queue which is
>> per-sta-per tid when A-MPDUs is enabled.
>> We are now moving to a scheme in which we will have per-sta-per-tid
>> queue regardless of the A-MPDU state which will make much sense to tie
>> to the tx queueing infrastructure.
>> Thing is that in that case I am afraid we will not have enough packets
>> in the software tx queue to get A-MSDUs from your code. with TSO, it
>> is easier :) Still worth trying to work with this instead of TSO and
>> see how it goes. That won't happen anytime soon though.
>>
>>>
>>>>> +
>>>>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>>>>> + struct sta_info *sta,
>>>>> + struct ieee80211_fast_tx *fast_tx,
>>>>> + struct sk_buff *skb)
>>>>> +{
>>>>> + struct ieee80211_local *local = sdata->local;
>>>>> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>>>>> + struct ieee80211_txq *txq = sta->sta.txq[tid];
>>>>> + struct txq_info *txqi;
>>>>> + struct sk_buff **frag_tail, *head;
>>>>> + int subframe_len = skb->len - ETH_ALEN;
>>>>> + int max_amsdu_len;
>>>>> + __be16 len;
>>>>> + void *data;
>>>>> + bool ret = false;
>>>>> + int n = 1;
>>>>> +
>>>>> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>>>>> + return false;
>>>>> +
>>>>> + if (!txq)
>>>>> + return false;
>>>>> +
>>>>> + txqi = to_txq_info(txq);
>>>>> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>>>>> + return false;
>>>>> +
>>>>> + /*
>>>>> + * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>>>>> + * sessions are started/stopped without txq flush, use the limit here
>>>>> + * to avoid having to de-aggregate later.
>>>>> + */
>>>>> + max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);
>>>>
>>>> So you can't get 10K A-MSDUs? I don't see where you check that you
>>>> have an A-MPDU session here. You seem to be applying the 4095 limit
>>>> also for streams that are not an A-MPDU?
>>>> I guess you could check if the sta is a VHT peer, in that case, no
>>>> limit applies.
>>> The explanation for the missing A-MPDU change is in that comment -
>>> checking for an active A-MPDU session would make it unnecessarily complex.
>>> Good point about checking for VHT capabilities to remove this limit, I
>>> will add that.
>
> Yes - I read the comment, but it seemed very sub-optimal to limit all
> the A-MSDUs to 4K. With TSO I can get up to 10K and it really helps
> TPT.
This was built with the assumption that most scenarios use A-MPDU anyway
and thus don't need really large A-MSDUs.

> One more point. In VHT, there may be a limit on the numbers of
> subframes in the A-MSDU. I don't see you handle that. Maybe I missed
> it?
I haven't looked at that much yet. Right now the driver can only specify
a limit for the number of subframes.

> And... in case the driver doesn't handle frag_list, you linearize the
> skb which is pretty much the only thing you can do at this stage. But,
> when you'll lift the 4095 bytes limit, you'll get 11K A-MSDU,
> linarizing such a long packet is really putting the memory manager
> under pressure.
I added no-frag_list support primarily for debugging purposes, it's not
supposed to perform well.

> This is an order 4 allocation, for each A-MSDU. Note
> that iwlwifi (and probably other drivers) can handle gather DMA in Tx,
> but they have a limited number of frags they can handle. iwlwifi e.g.
> can handle up to 20 frags, but 3 are taken for "paperwork". You'll
> have 2 frags per subframe at least (assuming that each subframe's
> payload is nicely contiguous and not on a page boundary). I think that
> it may be worthwhile to ask the driver how many frags it is supposed
> to handle. I can't promise iwlwifi will use it, but I guess it will be
> useful for someone.
You mean an extra frag limit in addition to the driver subframe limit,
in case individual subframes are fragmented as well?

- Felix

2016-02-07 11:49:26

by Felix Fietkau

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On 2016-02-07 12:32, Emmanuel Grumbach wrote:
>>>>>>> +
>>>>>>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>>>>>>> + struct sta_info *sta,
>>>>>>> + struct ieee80211_fast_tx *fast_tx,
>>>>>>> + struct sk_buff *skb)
>>>>>>> +{
>>>>>>> + struct ieee80211_local *local = sdata->local;
>>>>>>> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>>>>>>> + struct ieee80211_txq *txq = sta->sta.txq[tid];
>>>>>>> + struct txq_info *txqi;
>>>>>>> + struct sk_buff **frag_tail, *head;
>>>>>>> + int subframe_len = skb->len - ETH_ALEN;
>>>>>>> + int max_amsdu_len;
>>>>>>> + __be16 len;
>>>>>>> + void *data;
>>>>>>> + bool ret = false;
>>>>>>> + int n = 1;
>>>>>>> +
>>>>>>> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>>>>>>> + return false;
>>>>>>> +
>>>>>>> + if (!txq)
>>>>>>> + return false;
>>>>>>> +
>>>>>>> + txqi = to_txq_info(txq);
>>>>>>> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>>>>>>> + return false;
>>>>>>> +
>>>>>>> + /*
>>>>>>> + * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>>>>>>> + * sessions are started/stopped without txq flush, use the limit here
>>>>>>> + * to avoid having to de-aggregate later.
>>>>>>> + */
>>>>>>> + max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);
>>>>>>
>>>>>> So you can't get 10K A-MSDUs? I don't see where you check that you
>>>>>> have an A-MPDU session here. You seem to be applying the 4095 limit
>>>>>> also for streams that are not an A-MPDU?
>>>>>> I guess you could check if the sta is a VHT peer, in that case, no
>>>>>> limit applies.
>>>>> The explanation for the missing A-MPDU change is in that comment -
>>>>> checking for an active A-MPDU session would make it unnecessarily complex.
>>>>> Good point about checking for VHT capabilities to remove this limit, I
>>>>> will add that.
>>>
>>> Yes - I read the comment, but it seemed very sub-optimal to limit all
>>> the A-MSDUs to 4K. With TSO I can get up to 10K and it really helps
>>> TPT.
>> This was built with the assumption that most scenarios use A-MPDU anyway
>> and thus don't need really large A-MSDUs.
>
> Yes - so that's interesting. We can chose to have long A-MSDUs inside
> a short (in terms of number of MPDUs) A-MPDU, of with shorter A-MSDU
> and squeeze more of these into a single A-MDPU.
> The first intuition says that we'd better have more MPDUs because of
> the CRC check for each MPDU which doesn't exist in A-MSDU. OTOH, I
> remember I could clearly see that I get a higher TPT with longer
> A-MSDUs. Maybe I wasn't looking right at the size of the A-MPDU? I
> guess I'd need to go back to the table with all the values we had, but
> since we pretty much got what we wanted, I am not sure I will able to
> find time for this :)
I think it also depends on the environment. I'd guess that under very
ideal conditions with very few retransmissions, really long A-MSDU might
have some performance benefits, but I don't think that'll hold if the
conditions are less than ideal and you have rate fluctuation and
retransmissions.

>>> One more point. In VHT, there may be a limit on the numbers of
>>> subframes in the A-MSDU. I don't see you handle that. Maybe I missed
>>> it?
>> I haven't looked at that much yet. Right now the driver can only specify
>> a limit for the number of subframes.
>
> I am talking about a limitation the peer can advertise. Check this out:
> https://git.kernel.org/cgit/linux/kernel/git/jberg/mac80211-next.git/tree/net/mac80211/cfg.c#n1134
>
> I couldn't see the limit the driver can specify in your code. I may
> very well have missed it.
I missed that one. Will add it in the next patch.

>>> This is an order 4 allocation, for each A-MSDU. Note
>>> that iwlwifi (and probably other drivers) can handle gather DMA in Tx,
>>> but they have a limited number of frags they can handle. iwlwifi e.g.
>>> can handle up to 20 frags, but 3 are taken for "paperwork". You'll
>>> have 2 frags per subframe at least (assuming that each subframe's
>>> payload is nicely contiguous and not on a page boundary). I think that
>>> it may be worthwhile to ask the driver how many frags it is supposed
>>> to handle. I can't promise iwlwifi will use it, but I guess it will be
>>> useful for someone.
>> You mean an extra frag limit in addition to the driver subframe limit,
>> in case individual subframes are fragmented as well?
>>
>
> well.. Yes, you can't assume that you'll have one descriptor for one
> MSDU payload (unless the driver doesn't advertise SG to the
> netstack).
Okay, please make a suggestion describing the exact kinds of limits you
would need for iwlwifi.

- Felix

2016-02-07 10:22:50

by Emmanuel Grumbach

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On Sun, Feb 7, 2016 at 12:06 PM, Emmanuel Grumbach <[email protected]> wrote:
> On Sun, Feb 7, 2016 at 11:08 AM, Felix Fietkau <[email protected]> wrote:
>> On 2016-02-07 08:25, Emmanuel Grumbach wrote:
>>> On Fri, Feb 5, 2016 at 12:41 PM, Felix Fietkau <[email protected]> wrote:
>>>> Requires software tx queueing support. frag_list support (for zero-copy)
>>>> is optional.
>>>>
>>>> Signed-off-by: Felix Fietkau <[email protected]>
>>>
>>> Looks nice!
>>> This would allow us to create aggregates of TCP Acks, the problem is
>>> that when you are mostly receiving data, the hardware queues are
>>> pretty much empty (nothing besides the TCP Acks which should go out
>>> quickly) so that packets don't pile up in the software queues and
>>> hence you don't have enough material to build an A-MSDU.
>>> I guess that for AP oriented devices, this is ideal solution since you
>>> can't rely on TSO (packets are not locally generated) and this allows
>>> to build an A-MSDU without adding more latency since you build an
>>> A-MSDU with packets that are already in the queue waiting instead of
>>> delaying transmission of the first packet.
>>> IIRC, the latter was the approach chose by the new Marvell driver
>>> posted a few weeks ago. This approach is better in my eyes.
>>> For iwlwifi which is much more station oriented (of GO which is
>>> basically an AP with locally generated traffic), I took the TSO
>>> approach. I guess we could try to change iwlwifi to use your tx
>>> queues, and check how that works. This would allow us to have A-MSDU
>>> on bridged traffic as well, although this use case is much less common
>>> for Intel devices.
>
>
>> Can the iwlwifi firmware maintain per-sta per-tid queues? Because that
>> way you would get the most benefits from using that tx queueing
>> infrastructure.
>
> Well... iwlwifi and athXk are very different. iwlwifi really has the
> concept of queues and not flat descriptors.
> Any Tx descriptor lives in the context of a Tx queue which is
> per-sta-per tid when A-MPDUs is enabled.
> We are now moving to a scheme in which we will have per-sta-per-tid
> queue regardless of the A-MPDU state which will make much sense to tie
> to the tx queueing infrastructure.
> Thing is that in that case I am afraid we will not have enough packets
> in the software tx queue to get A-MSDUs from your code. with TSO, it
> is easier :) Still worth trying to work with this instead of TSO and
> see how it goes. That won't happen anytime soon though.
>
>>
>>>> +
>>>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>>>> + struct sta_info *sta,
>>>> + struct ieee80211_fast_tx *fast_tx,
>>>> + struct sk_buff *skb)
>>>> +{
>>>> + struct ieee80211_local *local = sdata->local;
>>>> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>>>> + struct ieee80211_txq *txq = sta->sta.txq[tid];
>>>> + struct txq_info *txqi;
>>>> + struct sk_buff **frag_tail, *head;
>>>> + int subframe_len = skb->len - ETH_ALEN;
>>>> + int max_amsdu_len;
>>>> + __be16 len;
>>>> + void *data;
>>>> + bool ret = false;
>>>> + int n = 1;
>>>> +
>>>> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>>>> + return false;
>>>> +
>>>> + if (!txq)
>>>> + return false;
>>>> +
>>>> + txqi = to_txq_info(txq);
>>>> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>>>> + return false;
>>>> +
>>>> + /*
>>>> + * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>>>> + * sessions are started/stopped without txq flush, use the limit here
>>>> + * to avoid having to de-aggregate later.
>>>> + */
>>>> + max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);
>>>
>>> So you can't get 10K A-MSDUs? I don't see where you check that you
>>> have an A-MPDU session here. You seem to be applying the 4095 limit
>>> also for streams that are not an A-MPDU?
>>> I guess you could check if the sta is a VHT peer, in that case, no
>>> limit applies.
>> The explanation for the missing A-MPDU change is in that comment -
>> checking for an active A-MPDU session would make it unnecessarily complex.
>> Good point about checking for VHT capabilities to remove this limit, I
>> will add that.

Yes - I read the comment, but it seemed very sub-optimal to limit all
the A-MSDUs to 4K. With TSO I can get up to 10K and it really helps
TPT.
One more point. In VHT, there may be a limit on the numbers of
subframes in the A-MSDU. I don't see you handle that. Maybe I missed
it?

And... in case the driver doesn't handle frag_list, you linearize the
skb which is pretty much the only thing you can do at this stage. But,
when you'll lift the 4095 bytes limit, you'll get 11K A-MSDU,
linarizing such a long packet is really putting the memory manager
under pressure. This is an order 4 allocation, for each A-MSDU. Note
that iwlwifi (and probably other drivers) can handle gather DMA in Tx,
but they have a limited number of frags they can handle. iwlwifi e.g.
can handle up to 20 frags, but 3 are taken for "paperwork". You'll
have 2 frags per subframe at least (assuming that each subframe's
payload is nicely contiguous and not on a page boundary). I think that
it may be worthwhile to ask the driver how many frags it is supposed
to handle. I can't promise iwlwifi will use it, but I guess it will be
useful for someone.


>>
>> - Felix

2016-02-07 09:08:22

by Felix Fietkau

[permalink] [raw]
Subject: Re: [RFC v2] mac80211: add A-MSDU tx support

On 2016-02-07 08:25, Emmanuel Grumbach wrote:
> On Fri, Feb 5, 2016 at 12:41 PM, Felix Fietkau <[email protected]> wrote:
>> Requires software tx queueing support. frag_list support (for zero-copy)
>> is optional.
>>
>> Signed-off-by: Felix Fietkau <[email protected]>
>
> Looks nice!
> This would allow us to create aggregates of TCP Acks, the problem is
> that when you are mostly receiving data, the hardware queues are
> pretty much empty (nothing besides the TCP Acks which should go out
> quickly) so that packets don't pile up in the software queues and
> hence you don't have enough material to build an A-MSDU.
> I guess that for AP oriented devices, this is ideal solution since you
> can't rely on TSO (packets are not locally generated) and this allows
> to build an A-MSDU without adding more latency since you build an
> A-MSDU with packets that are already in the queue waiting instead of
> delaying transmission of the first packet.
> IIRC, the latter was the approach chose by the new Marvell driver
> posted a few weeks ago. This approach is better in my eyes.
> For iwlwifi which is much more station oriented (of GO which is
> basically an AP with locally generated traffic), I took the TSO
> approach. I guess we could try to change iwlwifi to use your tx
> queues, and check how that works. This would allow us to have A-MSDU
> on bridged traffic as well, although this use case is much less common
> for Intel devices.
Can the iwlwifi firmware maintain per-sta per-tid queues? Because that
way you would get the most benefits from using that tx queueing
infrastructure.

>> +
>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>> + struct sta_info *sta,
>> + struct ieee80211_fast_tx *fast_tx,
>> + struct sk_buff *skb)
>> +{
>> + struct ieee80211_local *local = sdata->local;
>> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>> + struct ieee80211_txq *txq = sta->sta.txq[tid];
>> + struct txq_info *txqi;
>> + struct sk_buff **frag_tail, *head;
>> + int subframe_len = skb->len - ETH_ALEN;
>> + int max_amsdu_len;
>> + __be16 len;
>> + void *data;
>> + bool ret = false;
>> + int n = 1;
>> +
>> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>> + return false;
>> +
>> + if (!txq)
>> + return false;
>> +
>> + txqi = to_txq_info(txq);
>> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>> + return false;
>> +
>> + /*
>> + * A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>> + * sessions are started/stopped without txq flush, use the limit here
>> + * to avoid having to de-aggregate later.
>> + */
>> + max_amsdu_len = min_t(int, sta->sta.max_amsdu_len, 4095);
>
> So you can't get 10K A-MSDUs? I don't see where you check that you
> have an A-MPDU session here. You seem to be applying the 4095 limit
> also for streams that are not an A-MPDU?
> I guess you could check if the sta is a VHT peer, in that case, no
> limit applies.
The explanation for the missing A-MPDU change is in that comment -
checking for an active A-MPDU session would make it unnecessarily complex.
Good point about checking for VHT capabilities to remove this limit, I
will add that.

- Felix