2016-02-07 13:21:33

by Felix Fietkau

[permalink] [raw]
Subject: [RFC v3] mac80211: add A-MSDU tx support

Requires software tx queueing support. frag_list support (for zero-copy)
is optional.

Signed-off-by: Felix Fietkau <[email protected]>
---
include/net/mac80211.h | 17 +++++
net/mac80211/agg-tx.c | 5 ++
net/mac80211/debugfs.c | 2 +
net/mac80211/ieee80211_i.h | 1 +
net/mac80211/tx.c | 168 +++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 193 insertions(+)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5714774..31dca81 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -709,6 +709,7 @@ enum mac80211_tx_info_flags {
* @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
* frame (PS-Poll or uAPSD).
* @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
+ * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
*
* These flags are used in tx_info->control.flags.
*/
@@ -716,6 +717,7 @@ enum mac80211_tx_control_flags {
IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0),
IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1),
IEEE80211_TX_CTRL_RATE_INJECT = BIT(2),
+ IEEE80211_TX_CTRL_AMSDU = BIT(3),
};

/*
@@ -1964,6 +1966,12 @@ struct ieee80211_txq {
* order and does not need to manage its own reorder buffer or BA session
* timeout.
*
+ * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
+ * A-MSDU frames. Requires software tx queueing support.
+ *
+ * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
+ * skbs, needed for zero-copy software A-MSDU.
+ *
* @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
*/
enum ieee80211_hw_flags {
@@ -2001,6 +2009,8 @@ enum ieee80211_hw_flags {
IEEE80211_HW_BEACON_TX_STATUS,
IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
+ IEEE80211_HW_TX_AMSDU,
+ IEEE80211_HW_TX_FRAG_LIST,

/* keep last, obviously */
NUM_IEEE80211_HW_FLAGS
@@ -2073,6 +2083,11 @@ enum ieee80211_hw_flags {
* size is smaller (an example is LinkSys WRT120N with FW v1.0.07
* build 002 Jun 18 2012).
*
+ * @max_tx_amsdu_subframes: maximum number of subframes used in software
+ * A-MSDU aggregation
+ *
+ * @max_tx_fragments: maximum fragments per (A-)MSDU.
+ *
* @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
* (if %IEEE80211_HW_QUEUE_CONTROL is set)
*
@@ -2127,6 +2142,8 @@ struct ieee80211_hw {
u8 max_rate_tries;
u8 max_rx_aggregation_subframes;
u8 max_tx_aggregation_subframes;
+ u8 max_tx_amsdu_subframes;
+ u8 max_tx_fragments;
u8 offchannel_tx_hw_queue;
u8 radiotap_mcs_details;
u16 radiotap_vht_details;
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 4932e9f..42fa810 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
size_t len)
{
struct tid_ampdu_tx *tid_tx;
+ struct ieee80211_txq *txq;
u16 capab, tid;
u8 buf_size;
bool amsdu;
@@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);

+ txq = sta->sta.txq[tid];
+ if (!amsdu && txq)
+ set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
+
mutex_lock(&sta->ampdu_mlme.mtx);

tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index e433d0c..847779d 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -127,6 +127,8 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
FLAG(BEACON_TX_STATUS),
FLAG(NEEDS_UNIQUE_STA_ADDR),
FLAG(SUPPORTS_REORDERING_BUFFER),
+ FLAG(TX_AMSDU),
+ FLAG(TX_FRAG_LIST),

/* keep last for the build bug below */
(void *)0x1
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a49c103..e68d8db 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -799,6 +799,7 @@ struct mac80211_qos_map {
enum txq_info_flags {
IEEE80211_TXQ_STOP,
IEEE80211_TXQ_AMPDU,
+ IEEE80211_TXQ_NO_AMSDU,
};

struct txq_info {
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a5aa275..f37f729 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
out:
spin_unlock_bh(&txqi->queue.lock);

+ if (skb && skb_has_frag_list(skb) &&
+ !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+ skb_linearize(skb);
+
return skb;
}
EXPORT_SYMBOL(ieee80211_tx_dequeue);
@@ -2763,6 +2767,166 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
kfree_rcu(fast_tx, rcu_head);
}

+static int ieee80211_amsdu_pad(struct sk_buff *skb, int subframe_len)
+{
+ int amsdu_len = subframe_len + sizeof(struct ethhdr);
+ int padding = (4 - amsdu_len) & 3;
+
+ if (padding)
+ memset(skb_put(skb, padding), 0, padding);
+
+ return padding;
+}
+
+static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_fast_tx *fast_tx,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr;
+ struct ethhdr amsdu_hdr;
+ int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
+ int subframe_len = skb->len - hdr_len;
+ void *data;
+ u8 *qc;
+
+ if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
+ return true;
+
+ if (skb_headroom(skb) < sizeof(amsdu_hdr) || skb_tailroom(skb) < 3) {
+ I802_DEBUG_INC(local->tx_expand_skb_head);
+
+ if (pskb_expand_head(skb, sizeof(amsdu_hdr), 3, GFP_ATOMIC)) {
+ wiphy_debug(local->hw.wiphy,
+ "failed to reallocate TX buffer\n");
+ return false;
+ }
+ }
+
+ subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
+
+ amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
+ memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
+ memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+
+ data = skb_push(skb, sizeof(amsdu_hdr));
+ memmove(data, data + sizeof(amsdu_hdr), hdr_len);
+ memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+
+ hdr = data;
+ qc = ieee80211_get_qos_ctl(hdr);
+ *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+
+ info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
+
+ return true;
+}
+
+static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_fast_tx *fast_tx,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ struct ieee80211_txq *txq = sta->sta.txq[tid];
+ struct txq_info *txqi;
+ struct sk_buff **frag_tail, *head;
+ int subframe_len = skb->len - ETH_ALEN;
+ u8 max_subframes = 0xff;
+ int max_frags = local->hw.max_tx_fragments;
+ int max_amsdu_len;
+ __be16 len;
+ void *data;
+ bool ret = false;
+ int n = 1, nfrags;
+
+ if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
+ return false;
+
+ if (!txq)
+ return false;
+
+ txqi = to_txq_info(txq);
+ if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
+ return false;
+
+ if (sta->sta.max_amsdu_subframes)
+ max_subframes = min(max_subframes,
+ sta->sta.max_amsdu_subframes);
+
+ if (local->hw.max_tx_amsdu_subframes)
+ max_subframes = min(max_subframes,
+ local->hw.max_tx_amsdu_subframes);
+
+ spin_lock_bh(&txqi->queue.lock);
+
+ head = skb_peek_tail(&txqi->queue);
+ if (!head)
+ goto out;
+
+ if (skb->len + head->len > max_amsdu_len)
+ goto out;
+
+ /*
+ * HT A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
+ * sessions are started/stopped without txq flush, use the limit here
+ * to avoid having to de-aggregate later.
+ */
+ if (skb->len + head->len > 4095 &&
+ !sta->sta.vht_cap.vht_supported)
+ goto out;
+
+ if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
+ goto out;
+
+ nfrags = 1 + skb_shinfo(skb)->nr_frags;
+ nfrags += 1 + skb_shinfo(head)->nr_frags;
+ frag_tail = &skb_shinfo(head)->frag_list;
+ while (*frag_tail) {
+ nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
+ frag_tail = &(*frag_tail)->next;
+ n++;
+ }
+
+ if (max_subframes && n > max_subframes)
+ goto out;
+
+ if (max_frags && nfrags > max_frags)
+ goto out;
+
+ if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 3) {
+ I802_DEBUG_INC(local->tx_expand_skb_head);
+
+ if (pskb_expand_head(skb, 8, 3, GFP_ATOMIC)) {
+ wiphy_debug(local->hw.wiphy,
+ "failed to reallocate TX buffer\n");
+ goto out;
+ }
+ }
+
+ subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
+
+ ret = true;
+ data = skb_push(skb, ETH_ALEN + 2);
+ memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
+
+ data += 2 * ETH_ALEN;
+ len = cpu_to_be16(subframe_len);
+ memcpy(data, &len, 2);
+ memcpy(data + 2, rfc1042_header, ETH_ALEN);
+
+ head->len += skb->len;
+ head->data_len += skb->len;
+ *frag_tail = skb;
+
+out:
+ spin_unlock_bh(&txqi->queue.lock);
+
+ return ret;
+}
+
static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
struct net_device *dev, struct sta_info *sta,
struct ieee80211_fast_tx *fast_tx,
@@ -2817,6 +2981,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,

ieee80211_tx_stats(dev, skb->len + extra_head);

+ if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
+ ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
+ return true;
+
/* will not be crypto-handled beyond what we do here, so use false
* as the may-encrypt argument for the resize to not account for
* more room than we already have in 'extra_head'
--
2.2.2



2016-02-07 21:03:38

by Felix Fietkau

[permalink] [raw]
Subject: Re: [RFC v3] mac80211: add A-MSDU tx support

On 2016-02-07 21:16, Emmanuel Grumbach wrote:
> On Sun, Feb 7, 2016 at 3:21 PM, Felix Fietkau <[email protected]> wrote:
>> Requires software tx queueing support. frag_list support (for zero-copy)
>> is optional.
>>
>> Signed-off-by: Felix Fietkau <[email protected]>
>> ---
>> include/net/mac80211.h | 17 +++++
>> net/mac80211/agg-tx.c | 5 ++
>> net/mac80211/debugfs.c | 2 +
>> net/mac80211/ieee80211_i.h | 1 +
>> net/mac80211/tx.c | 168 +++++++++++++++++++++++++++++++++++++++++++++
>> 5 files changed, 193 insertions(+)
>>
>> diff --git a/include/net/mac80211.h b/include/net/mac80211.h
>> index 5714774..31dca81 100644
>> --- a/include/net/mac80211.h
>> +++ b/include/net/mac80211.h
>> @@ -2127,6 +2142,8 @@ struct ieee80211_hw {
>> u8 max_rate_tries;
>> u8 max_rx_aggregation_subframes;
>> u8 max_tx_aggregation_subframes;
>> + u8 max_tx_amsdu_subframes;
>> + u8 max_tx_fragments;
>
> I have to say I still don't understand why drivers would want to limit
> the number of subframes. Limiting the number of frags should be
> enough, don't you think?
Makes sense, I'll remove it.

>> u8 offchannel_tx_hw_queue;
>> u8 radiotap_mcs_details;
>> u16 radiotap_vht_details;

>> diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
>> index a5aa275..f37f729 100644
>> --- a/net/mac80211/tx.c
>> +++ b/net/mac80211/tx.c
>> @@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
>> out:
>> spin_unlock_bh(&txqi->queue.lock);
>>
>> + if (skb && skb_has_frag_list(skb) &&
>> + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
>> + skb_linearize(skb);
>> +
>> return skb;
>> }
>> EXPORT_SYMBOL(ieee80211_tx_dequeue);
>> @@ -2763,6 +2767,166 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
>> kfree_rcu(fast_tx, rcu_head);
>> }
>>
>> +static int ieee80211_amsdu_pad(struct sk_buff *skb, int subframe_len)
>> +{
>> + int amsdu_len = subframe_len + sizeof(struct ethhdr);
>> + int padding = (4 - amsdu_len) & 3;
>> +
>> + if (padding)
>> + memset(skb_put(skb, padding), 0, padding);
>> +
>> + return padding;
>> +}
>> +
>> +static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
>> + struct ieee80211_fast_tx *fast_tx,
>> + struct sk_buff *skb)
>> +{
>> + struct ieee80211_local *local = sdata->local;
>> + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
>> + struct ieee80211_hdr *hdr;
>> + struct ethhdr amsdu_hdr;
>> + int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
>> + int subframe_len = skb->len - hdr_len;
>> + void *data;
>> + u8 *qc;
>> +
>> + if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
>> + return true;
>> +
>> + if (skb_headroom(skb) < sizeof(amsdu_hdr) || skb_tailroom(skb) < 3) {
>> + I802_DEBUG_INC(local->tx_expand_skb_head);
>> +
>> + if (pskb_expand_head(skb, sizeof(amsdu_hdr), 3, GFP_ATOMIC)) {
>> + wiphy_debug(local->hw.wiphy,
>> + "failed to reallocate TX buffer\n");
>> + return false;
>> + }
>> + }
>> +
>> + subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
>> +
>> + amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
>> + memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
>> + memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
>> +
>> + data = skb_push(skb, sizeof(amsdu_hdr));
>> + memmove(data, data + sizeof(amsdu_hdr), hdr_len);
>> + memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
>> +
>> + hdr = data;
>> + qc = ieee80211_get_qos_ctl(hdr);
>> + *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
>> +
>> + info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
>> +
>> + return true;
>> +}
>> +
>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>> + struct sta_info *sta,
>> + struct ieee80211_fast_tx *fast_tx,
>> + struct sk_buff *skb)
>> +{
>> + struct ieee80211_local *local = sdata->local;
>> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>> + struct ieee80211_txq *txq = sta->sta.txq[tid];
>> + struct txq_info *txqi;
>> + struct sk_buff **frag_tail, *head;
>> + int subframe_len = skb->len - ETH_ALEN;
>> + u8 max_subframes = 0xff;
>> + int max_frags = local->hw.max_tx_fragments;
>> + int max_amsdu_len;
>> + __be16 len;
>> + void *data;
>> + bool ret = false;
>> + int n = 1, nfrags;
>> +
>> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>> + return false;
>> +
>> + if (!txq)
>> + return false;
>> +
>> + txqi = to_txq_info(txq);
>> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>> + return false;
>> +
>> + if (sta->sta.max_amsdu_subframes)
>> + max_subframes = min(max_subframes,
>> + sta->sta.max_amsdu_subframes);
>> +
>> + if (local->hw.max_tx_amsdu_subframes)
>> + max_subframes = min(max_subframes,
>> + local->hw.max_tx_amsdu_subframes);
>> +
>> + spin_lock_bh(&txqi->queue.lock);
>> +
>> + head = skb_peek_tail(&txqi->queue);
>> + if (!head)
>> + goto out;
>> +
>> + if (skb->len + head->len > max_amsdu_len)
>> + goto out;
>> +
>> + /*
>> + * HT A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>> + * sessions are started/stopped without txq flush, use the limit here
>> + * to avoid having to de-aggregate later.
>> + */
>> + if (skb->len + head->len > 4095 &&
>> + !sta->sta.vht_cap.vht_supported)
>> + goto out;
>> +
>> + if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
>> + goto out;
>> +
>> + nfrags = 1 + skb_shinfo(skb)->nr_frags;
>> + nfrags += 1 + skb_shinfo(head)->nr_frags;
>> + frag_tail = &skb_shinfo(head)->frag_list;
>> + while (*frag_tail) {
>> + nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
>> + frag_tail = &(*frag_tail)->next;
>> + n++;
>> + }
>> +
>> + if (max_subframes && n > max_subframes)
>
> max_subframes can't be 0. You set it to 0xff at the beginning.
Right, thanks.

- Felix

2016-02-07 20:16:30

by Emmanuel Grumbach

[permalink] [raw]
Subject: Re: [RFC v3] mac80211: add A-MSDU tx support

On Sun, Feb 7, 2016 at 3:21 PM, Felix Fietkau <[email protected]> wrote:
> Requires software tx queueing support. frag_list support (for zero-copy)
> is optional.
>
> Signed-off-by: Felix Fietkau <[email protected]>
> ---
> include/net/mac80211.h | 17 +++++
> net/mac80211/agg-tx.c | 5 ++
> net/mac80211/debugfs.c | 2 +
> net/mac80211/ieee80211_i.h | 1 +
> net/mac80211/tx.c | 168 +++++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 193 insertions(+)
>
> diff --git a/include/net/mac80211.h b/include/net/mac80211.h
> index 5714774..31dca81 100644
> --- a/include/net/mac80211.h
> +++ b/include/net/mac80211.h
> @@ -709,6 +709,7 @@ enum mac80211_tx_info_flags {
> * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
> * frame (PS-Poll or uAPSD).
> * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
> + * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
> *
> * These flags are used in tx_info->control.flags.
> */
> @@ -716,6 +717,7 @@ enum mac80211_tx_control_flags {
> IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0),
> IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1),
> IEEE80211_TX_CTRL_RATE_INJECT = BIT(2),
> + IEEE80211_TX_CTRL_AMSDU = BIT(3),
> };
>
> /*
> @@ -1964,6 +1966,12 @@ struct ieee80211_txq {
> * order and does not need to manage its own reorder buffer or BA session
> * timeout.
> *
> + * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
> + * A-MSDU frames. Requires software tx queueing support.
> + *
> + * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
> + * skbs, needed for zero-copy software A-MSDU.
> + *
> * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
> */
> enum ieee80211_hw_flags {
> @@ -2001,6 +2009,8 @@ enum ieee80211_hw_flags {
> IEEE80211_HW_BEACON_TX_STATUS,
> IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
> IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
> + IEEE80211_HW_TX_AMSDU,
> + IEEE80211_HW_TX_FRAG_LIST,
>
> /* keep last, obviously */
> NUM_IEEE80211_HW_FLAGS
> @@ -2073,6 +2083,11 @@ enum ieee80211_hw_flags {
> * size is smaller (an example is LinkSys WRT120N with FW v1.0.07
> * build 002 Jun 18 2012).
> *
> + * @max_tx_amsdu_subframes: maximum number of subframes used in software
> + * A-MSDU aggregation
> + *
> + * @max_tx_fragments: maximum fragments per (A-)MSDU.
> + *
> * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
> * (if %IEEE80211_HW_QUEUE_CONTROL is set)
> *
> @@ -2127,6 +2142,8 @@ struct ieee80211_hw {
> u8 max_rate_tries;
> u8 max_rx_aggregation_subframes;
> u8 max_tx_aggregation_subframes;
> + u8 max_tx_amsdu_subframes;
> + u8 max_tx_fragments;

I have to say I still don't understand why drivers would want to limit
the number of subframes. Limiting the number of frags should be
enough, don't you think?

> u8 offchannel_tx_hw_queue;
> u8 radiotap_mcs_details;
> u16 radiotap_vht_details;
> diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
> index 4932e9f..42fa810 100644
> --- a/net/mac80211/agg-tx.c
> +++ b/net/mac80211/agg-tx.c
> @@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
> size_t len)
> {
> struct tid_ampdu_tx *tid_tx;
> + struct ieee80211_txq *txq;
> u16 capab, tid;
> u8 buf_size;
> bool amsdu;
> @@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
> buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
> buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
>
> + txq = sta->sta.txq[tid];
> + if (!amsdu && txq)
> + set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
> +
> mutex_lock(&sta->ampdu_mlme.mtx);
>
> tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
> diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
> index e433d0c..847779d 100644
> --- a/net/mac80211/debugfs.c
> +++ b/net/mac80211/debugfs.c
> @@ -127,6 +127,8 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
> FLAG(BEACON_TX_STATUS),
> FLAG(NEEDS_UNIQUE_STA_ADDR),
> FLAG(SUPPORTS_REORDERING_BUFFER),
> + FLAG(TX_AMSDU),
> + FLAG(TX_FRAG_LIST),
>
> /* keep last for the build bug below */
> (void *)0x1
> diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
> index a49c103..e68d8db 100644
> --- a/net/mac80211/ieee80211_i.h
> +++ b/net/mac80211/ieee80211_i.h
> @@ -799,6 +799,7 @@ struct mac80211_qos_map {
> enum txq_info_flags {
> IEEE80211_TXQ_STOP,
> IEEE80211_TXQ_AMPDU,
> + IEEE80211_TXQ_NO_AMSDU,
> };
>
> struct txq_info {
> diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
> index a5aa275..f37f729 100644
> --- a/net/mac80211/tx.c
> +++ b/net/mac80211/tx.c
> @@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
> out:
> spin_unlock_bh(&txqi->queue.lock);
>
> + if (skb && skb_has_frag_list(skb) &&
> + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
> + skb_linearize(skb);
> +
> return skb;
> }
> EXPORT_SYMBOL(ieee80211_tx_dequeue);
> @@ -2763,6 +2767,166 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
> kfree_rcu(fast_tx, rcu_head);
> }
>
> +static int ieee80211_amsdu_pad(struct sk_buff *skb, int subframe_len)
> +{
> + int amsdu_len = subframe_len + sizeof(struct ethhdr);
> + int padding = (4 - amsdu_len) & 3;
> +
> + if (padding)
> + memset(skb_put(skb, padding), 0, padding);
> +
> + return padding;
> +}
> +
> +static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
> + struct ieee80211_fast_tx *fast_tx,
> + struct sk_buff *skb)
> +{
> + struct ieee80211_local *local = sdata->local;
> + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> + struct ieee80211_hdr *hdr;
> + struct ethhdr amsdu_hdr;
> + int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
> + int subframe_len = skb->len - hdr_len;
> + void *data;
> + u8 *qc;
> +
> + if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
> + return true;
> +
> + if (skb_headroom(skb) < sizeof(amsdu_hdr) || skb_tailroom(skb) < 3) {
> + I802_DEBUG_INC(local->tx_expand_skb_head);
> +
> + if (pskb_expand_head(skb, sizeof(amsdu_hdr), 3, GFP_ATOMIC)) {
> + wiphy_debug(local->hw.wiphy,
> + "failed to reallocate TX buffer\n");
> + return false;
> + }
> + }
> +
> + subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
> +
> + amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
> + memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
> + memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
> +
> + data = skb_push(skb, sizeof(amsdu_hdr));
> + memmove(data, data + sizeof(amsdu_hdr), hdr_len);
> + memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
> +
> + hdr = data;
> + qc = ieee80211_get_qos_ctl(hdr);
> + *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
> +
> + info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
> +
> + return true;
> +}
> +
> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
> + struct sta_info *sta,
> + struct ieee80211_fast_tx *fast_tx,
> + struct sk_buff *skb)
> +{
> + struct ieee80211_local *local = sdata->local;
> + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
> + struct ieee80211_txq *txq = sta->sta.txq[tid];
> + struct txq_info *txqi;
> + struct sk_buff **frag_tail, *head;
> + int subframe_len = skb->len - ETH_ALEN;
> + u8 max_subframes = 0xff;
> + int max_frags = local->hw.max_tx_fragments;
> + int max_amsdu_len;
> + __be16 len;
> + void *data;
> + bool ret = false;
> + int n = 1, nfrags;
> +
> + if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
> + return false;
> +
> + if (!txq)
> + return false;
> +
> + txqi = to_txq_info(txq);
> + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
> + return false;
> +
> + if (sta->sta.max_amsdu_subframes)
> + max_subframes = min(max_subframes,
> + sta->sta.max_amsdu_subframes);
> +
> + if (local->hw.max_tx_amsdu_subframes)
> + max_subframes = min(max_subframes,
> + local->hw.max_tx_amsdu_subframes);
> +
> + spin_lock_bh(&txqi->queue.lock);
> +
> + head = skb_peek_tail(&txqi->queue);
> + if (!head)
> + goto out;
> +
> + if (skb->len + head->len > max_amsdu_len)
> + goto out;
> +
> + /*
> + * HT A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
> + * sessions are started/stopped without txq flush, use the limit here
> + * to avoid having to de-aggregate later.
> + */
> + if (skb->len + head->len > 4095 &&
> + !sta->sta.vht_cap.vht_supported)
> + goto out;
> +
> + if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
> + goto out;
> +
> + nfrags = 1 + skb_shinfo(skb)->nr_frags;
> + nfrags += 1 + skb_shinfo(head)->nr_frags;
> + frag_tail = &skb_shinfo(head)->frag_list;
> + while (*frag_tail) {
> + nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
> + frag_tail = &(*frag_tail)->next;
> + n++;
> + }
> +
> + if (max_subframes && n > max_subframes)

max_subframes can't be 0. You set it to 0xff at the beginning.

> + goto out;
> +
> + if (max_frags && nfrags > max_frags)
> + goto out;
> +
> + if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 3) {
> + I802_DEBUG_INC(local->tx_expand_skb_head);
> +
> + if (pskb_expand_head(skb, 8, 3, GFP_ATOMIC)) {
> + wiphy_debug(local->hw.wiphy,
> + "failed to reallocate TX buffer\n");
> + goto out;
> + }
> + }
> +
> + subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
> +
> + ret = true;
> + data = skb_push(skb, ETH_ALEN + 2);
> + memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
> +
> + data += 2 * ETH_ALEN;
> + len = cpu_to_be16(subframe_len);
> + memcpy(data, &len, 2);
> + memcpy(data + 2, rfc1042_header, ETH_ALEN);
> +
> + head->len += skb->len;
> + head->data_len += skb->len;
> + *frag_tail = skb;
> +
> +out:
> + spin_unlock_bh(&txqi->queue.lock);
> +
> + return ret;
> +}
> +
> static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
> struct net_device *dev, struct sta_info *sta,
> struct ieee80211_fast_tx *fast_tx,
> @@ -2817,6 +2981,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
>
> ieee80211_tx_stats(dev, skb->len + extra_head);
>
> + if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
> + ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
> + return true;
> +
> /* will not be crypto-handled beyond what we do here, so use false
> * as the may-encrypt argument for the resize to not account for
> * more room than we already have in 'extra_head'
> --
> 2.2.2
>