2024-02-09 18:47:50

by Felix Fietkau

[permalink] [raw]
Subject: [RFC] mac80211: add AQL support for broadcast packets

Excessive broadcast traffic with little competing unicast traffic can easily
flood hardware queues, leading to throughput issues. Additionally, filling
the hardware queues with too many packets breaks FQ for broadcast data.
Fix this by enabling AQL for broadcast packets.

Signed-off-by: Felix Fietkau <[email protected]>
---
include/net/cfg80211.h | 1 +
net/mac80211/debugfs.c | 19 +++++++++++++++----
net/mac80211/ieee80211_i.h | 2 ++
net/mac80211/main.c | 1 +
net/mac80211/sta_info.c | 21 ++++++++++-----------
net/mac80211/tx.c | 14 +++++++-------
6 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a499bd7e1def..2f280c629aea 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3385,6 +3385,7 @@ enum wiphy_params_flags {
/* The per TXQ device queue limit in airtime */
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000
+#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_BC 50000

/* The per interface airtime threshold to switch to lower queue limit */
#define IEEE80211_AQL_THRESHOLD 24000
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 74be49191e70..7ed671d16c05 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -215,11 +215,13 @@ static ssize_t aql_pending_read(struct file *file,
"VI %u us\n"
"BE %u us\n"
"BK %u us\n"
+ "BC/MC %u us\n"
"total %u us\n",
atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VO]),
atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VI]),
atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BE]),
atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BK]),
+ atomic_read(&local->aql_bc_pending_airtime),
atomic_read(&local->aql_total_pending_airtime));
return simple_read_from_buffer(user_buf, count, ppos,
buf, len);
@@ -245,7 +247,8 @@ static ssize_t aql_txq_limit_read(struct file *file,
"VO %u %u\n"
"VI %u %u\n"
"BE %u %u\n"
- "BK %u %u\n",
+ "BK %u %u\n"
+ "BC/MC %u\n",
local->aql_txq_limit_low[IEEE80211_AC_VO],
local->aql_txq_limit_high[IEEE80211_AC_VO],
local->aql_txq_limit_low[IEEE80211_AC_VI],
@@ -253,7 +256,8 @@ static ssize_t aql_txq_limit_read(struct file *file,
local->aql_txq_limit_low[IEEE80211_AC_BE],
local->aql_txq_limit_high[IEEE80211_AC_BE],
local->aql_txq_limit_low[IEEE80211_AC_BK],
- local->aql_txq_limit_high[IEEE80211_AC_BK]);
+ local->aql_txq_limit_high[IEEE80211_AC_BK],
+ local->aql_txq_limit_bc);
return simple_read_from_buffer(user_buf, count, ppos,
buf, len);
}
@@ -267,6 +271,7 @@ static ssize_t aql_txq_limit_write(struct file *file,
char buf[100];
u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old;
struct sta_info *sta;
+ int n;

if (count >= sizeof(buf))
return -EINVAL;
@@ -279,10 +284,16 @@ static ssize_t aql_txq_limit_write(struct file *file,
else
buf[count] = '\0';

- if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3)
+ n = sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high);
+ if (n < 2)
return -EINVAL;

- if (ac >= IEEE80211_NUM_ACS)
+ if (ac == IEEE80211_NUM_ACS) {
+ local->aql_txq_limit_bc = q_limit_low;
+ return count;
+ }
+
+ if (n != 3 || ac >= IEEE80211_NUM_ACS)
return -EINVAL;

q_limit_low_old = local->aql_txq_limit_low[ac];
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index eb32174984c3..f28fe17b1c21 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1305,10 +1305,12 @@ struct ieee80211_local {
spinlock_t handle_wake_tx_queue_lock;

u16 airtime_flags;
+ u32 aql_txq_limit_bc;
u32 aql_txq_limit_low[IEEE80211_NUM_ACS];
u32 aql_txq_limit_high[IEEE80211_NUM_ACS];
u32 aql_threshold;
atomic_t aql_total_pending_airtime;
+ atomic_t aql_bc_pending_airtime;
atomic_t aql_ac_pending_airtime[IEEE80211_NUM_ACS];

const struct ieee80211_ops *ops;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index e05bcc35bc1e..4d87ca898e35 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -808,6 +808,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
spin_lock_init(&local->rx_path_lock);
spin_lock_init(&local->queue_stop_reason_lock);

+ local->aql_txq_limit_bc = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_BC;
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
INIT_LIST_HEAD(&local->active_txqs[i]);
spin_lock_init(&local->active_txq_lock[i]);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 4391d8dd634b..412468f608de 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2347,28 +2347,27 @@ void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
struct sta_info *sta, u8 ac,
u16 tx_airtime, bool tx_completed)
{
+ atomic_t *counter;
int tx_pending;

if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
return;

- if (!tx_completed) {
- if (sta)
- atomic_add(tx_airtime,
- &sta->airtime[ac].aql_tx_pending);
+ if (sta)
+ counter = &sta->airtime[ac].aql_tx_pending;
+ else
+ counter = &local->aql_bc_pending_airtime;

+ if (!tx_completed) {
+ atomic_add(tx_airtime, counter);
atomic_add(tx_airtime, &local->aql_total_pending_airtime);
atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]);
return;
}

- if (sta) {
- tx_pending = atomic_sub_return(tx_airtime,
- &sta->airtime[ac].aql_tx_pending);
- if (tx_pending < 0)
- atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending,
- tx_pending, 0);
- }
+ tx_pending = atomic_sub_return(tx_airtime, counter);
+ if (tx_pending < 0)
+ atomic_cmpxchg(counter, tx_pending, 0);

atomic_sub(tx_airtime, &local->aql_total_pending_airtime);
tx_pending = atomic_sub_return(tx_airtime,
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 68a48abc7287..b6293a5e4bbf 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3972,9 +3972,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
encap_out:
IEEE80211_SKB_CB(skb)->control.vif = vif;

- if (tx.sta &&
- wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
- bool ampdu = txq->ac != IEEE80211_AC_VO;
+ if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
+ bool ampdu = txq->sta && txq->ac != IEEE80211_AC_VO;
u32 airtime;

airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta,
@@ -4144,7 +4143,8 @@ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw,
return true;

if (!txq->sta)
- return true;
+ return atomic_read(&local->aql_bc_pending_airtime) <
+ local->aql_txq_limit_bc;

if (unlikely(txq->tid == IEEE80211_NUM_TIDS))
return true;
@@ -4193,15 +4193,15 @@ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,

spin_lock_bh(&local->active_txq_lock[ac]);

- if (!txqi->txq.sta)
- goto out;
-
if (list_empty(&txqi->schedule_order))
goto out;

if (!ieee80211_txq_schedule_airtime_check(local, ac))
goto out;

+ if (!txqi->txq.sta)
+ goto out;
+
list_for_each_entry_safe(iter, tmp, &local->active_txqs[ac],
schedule_order) {
if (iter == txqi)
--
2.43.0



2024-02-09 19:32:59

by Johannes Berg

[permalink] [raw]
Subject: Re: [RFC] mac80211: add AQL support for broadcast packets


> - if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3)
> + n = sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high);
> + if (n < 2)
> return -EINVAL;
>
> - if (ac >= IEEE80211_NUM_ACS)
> + if (ac == IEEE80211_NUM_ACS) {
> + local->aql_txq_limit_bc = q_limit_low;
> + return count;
> + }

If we keep this, it should probably check n == 2?

But not sure I like it - in other places ac == NUM_ACS means management
rather than not multicast; could we just check something like

strncmp(buf, "mcast ", 6)

instead? Or "BC/MC" matching the output? Though we don't match VO on
input either.

johannes

2024-02-10 14:55:13

by Toke Høiland-Jørgensen

[permalink] [raw]
Subject: Re: [RFC] mac80211: add AQL support for broadcast packets

Felix Fietkau <[email protected]> writes:

> Excessive broadcast traffic with little competing unicast traffic can easily
> flood hardware queues, leading to throughput issues. Additionally, filling
> the hardware queues with too many packets breaks FQ for broadcast data.
> Fix this by enabling AQL for broadcast packets.
>
> Signed-off-by: Felix Fietkau <[email protected]>
> ---
> include/net/cfg80211.h | 1 +
> net/mac80211/debugfs.c | 19 +++++++++++++++----
> net/mac80211/ieee80211_i.h | 2 ++
> net/mac80211/main.c | 1 +
> net/mac80211/sta_info.c | 21 ++++++++++-----------
> net/mac80211/tx.c | 14 +++++++-------
> 6 files changed, 36 insertions(+), 22 deletions(-)
>
> diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
> index a499bd7e1def..2f280c629aea 100644
> --- a/include/net/cfg80211.h
> +++ b/include/net/cfg80211.h
> @@ -3385,6 +3385,7 @@ enum wiphy_params_flags {
> /* The per TXQ device queue limit in airtime */
> #define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000
> #define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000
> +#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_BC 50000

How did you arrive at the 50 ms figure for the limit on broadcast
traffic? Seems like quite a lot? Did you experiment with different
values?

-Toke

2024-02-12 10:29:04

by Johannes Berg

[permalink] [raw]
Subject: Re: [RFC] mac80211: add AQL support for broadcast packets

On Sat, 2024-02-10 at 17:18 +0100, Felix Fietkau wrote:
>
> > > +++ b/include/net/cfg80211.h
> > > @@ -3385,6 +3385,7 @@ enum wiphy_params_flags {
> > > /* The per TXQ device queue limit in airtime */
> > > #define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000
> > > #define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000
> > > +#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_BC 50000
> >
> > How did you arrive at the 50 ms figure for the limit on broadcast
> > traffic? Seems like quite a lot? Did you experiment with different
> > values?
>
> Whenever a client is connected and in powersave mode, all multicast
> packets are buffered and sent after the beacon. Because of that I
> decided to use half of a default beacon interval.

That makes some sense, I guess.

It does have me wondering though if we should also consider multicast
for airtime fairness in some way?

johannes

2024-02-12 10:56:59

by Toke Høiland-Jørgensen

[permalink] [raw]
Subject: Re: [RFC] mac80211: add AQL support for broadcast packets

Johannes Berg <[email protected]> writes:

> On Sat, 2024-02-10 at 17:18 +0100, Felix Fietkau wrote:
>>
>> > > +++ b/include/net/cfg80211.h
>> > > @@ -3385,6 +3385,7 @@ enum wiphy_params_flags {
>> > > /* The per TXQ device queue limit in airtime */
>> > > #define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000
>> > > #define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000
>> > > +#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_BC 50000
>> >
>> > How did you arrive at the 50 ms figure for the limit on broadcast
>> > traffic? Seems like quite a lot? Did you experiment with different
>> > values?
>>
>> Whenever a client is connected and in powersave mode, all multicast
>> packets are buffered and sent after the beacon. Because of that I
>> decided to use half of a default beacon interval.
>
> That makes some sense, I guess.

This implies that we will allow enough data to be queued up in the
hardware to spend half the next beacon interval just sending that
broadcast data? Isn't that a bit much if the goal is to prevent
broadcast from killing the network? What effect did you measure of this
patch? :)

Also, as soon as something is actually transmitted, the kernel will
start pushing more data into the HW from the queue in the host. So the
HW queue limit shouldn't be set as "this is the maximum that should be
transmitted in one go", but rather "this is the minimum time we need for
the software stack to catch up and refill the queue before it runs
empty". So from that perspective 50ms also seems a bit high?

> It does have me wondering though if we should also consider multicast
> for airtime fairness in some way?

Yeah, that would make sense. The virtual time-based scheduler that we
ended up reverting actually included airtime accounting for the
multicast queue as well. I don't recall if there was any problem with
that particular part of the change, or if it's just incidental that we
got rid of it as part of the revert. But it may be worth revisiting and
adding a similar mechanism to the round-robin scheduler...

-Toke

2024-02-12 13:51:56

by Toke Høiland-Jørgensen

[permalink] [raw]
Subject: Re: [RFC] mac80211: add AQL support for broadcast packets

Felix Fietkau <[email protected]> writes:

> On 12.02.24 11:56, Toke Høiland-Jørgensen wrote:
>> Johannes Berg <[email protected]> writes:
>>
>>> On Sat, 2024-02-10 at 17:18 +0100, Felix Fietkau wrote:
>>>>
>>>> > > +++ b/include/net/cfg80211.h
>>>> > > @@ -3385,6 +3385,7 @@ enum wiphy_params_flags {
>>>> > > /* The per TXQ device queue limit in airtime */
>>>> > > #define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000
>>>> > > #define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000
>>>> > > +#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_BC 50000
>>>> >
>>>> > How did you arrive at the 50 ms figure for the limit on broadcast
>>>> > traffic? Seems like quite a lot? Did you experiment with different
>>>> > values?
>>>>
>>>> Whenever a client is connected and in powersave mode, all multicast
>>>> packets are buffered and sent after the beacon. Because of that I
>>>> decided to use half of a default beacon interval.
>>>
>>> That makes some sense, I guess.
>>
>> This implies that we will allow enough data to be queued up in the
>> hardware to spend half the next beacon interval just sending that
>> broadcast data? Isn't that a bit much if the goal is to prevent
>> broadcast from killing the network? What effect did you measure of this
>> patch? :)
>
> I didn't do any real measurements with this patch yet. How much
> broadcast data is actually sent after the beacon is still up to the
> driver/hardware, so depending on that, the limit might even be less than
> 50ms. I also wanted to be conservative in limiting buffering in order to
> avoid potential regressions. While 50ms may seem like much, I believe it
> is still a significant improvement over the current state, which is
> unlimited.
>
>> Also, as soon as something is actually transmitted, the kernel will
>> start pushing more data into the HW from the queue in the host. So the
>> HW queue limit shouldn't be set as "this is the maximum that should be
>> transmitted in one go", but rather "this is the minimum time we need for
>> the software stack to catch up and refill the queue before it runs
>> empty". So from that perspective 50ms also seems a bit high?
>
> When broadcast buffering is enabled, the driver/hardware typically
> prepares the set of packets to be transmitted before the beacon is sent.
> Any packet not ready by then will be sent in the next round.
> I added the 50ms limit based on that assumption.

Ah, so even if this is being done in software it's happening in the
driver, so post-TXQ dequeue? OK, in that case I guess it makes sense;
would love to see some numbers, of course, but I guess the debugfs
additions in this patch will make it possible to actually monitor the
queue lengths seen in the wild :)

>>> It does have me wondering though if we should also consider multicast
>>> for airtime fairness in some way?
>>
>> Yeah, that would make sense. The virtual time-based scheduler that we
>> ended up reverting actually included airtime accounting for the
>> multicast queue as well. I don't recall if there was any problem with
>> that particular part of the change, or if it's just incidental that we
>> got rid of it as part of the revert. But it may be worth revisiting and
>> adding a similar mechanism to the round-robin scheduler...
>
> The round-robin scheduler already has some consideration of multicast -
> it always puts the multicast queues last in the active_txqs list.

Ah, right. Hmm, not quite clear to me how that works out in terms of
fairness, but it should at least prevent the MC queue from blocking
everything else...

-Toke