2020-07-25 18:56:35

by Felix Fietkau

[permalink] [raw]
Subject: [RFC] mac80211: add a function for running rx without passing skbs to the stack

This can be used to run mac80211 rx processing on a batch of frames in NAPI
poll before passing them to the network stack in a large batch.
This can improve icache footprint, or it can be used to pass frames via
netif_receive_skb_list.

Signed-off-by: Felix Fietkau <[email protected]>
---
include/net/mac80211.h | 25 +++++++++++++++++
net/mac80211/ieee80211_i.h | 2 +-
net/mac80211/rx.c | 56 +++++++++++++++++++++++---------------
3 files changed, 60 insertions(+), 23 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 11d5610d2ad5..cc9c477184a4 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4357,6 +4357,31 @@ void ieee80211_free_hw(struct ieee80211_hw *hw);
*/
void ieee80211_restart_hw(struct ieee80211_hw *hw);

+/**
+ * ieee80211_rx_list - receive frame and store processed skbs in a list
+ *
+ * Use this function to hand received frames to mac80211. The receive
+ * buffer in @skb must start with an IEEE 802.11 header. In case of a
+ * paged @skb is used, the driver is recommended to put the ieee80211
+ * header of the frame on the linear part of the @skb to avoid memory
+ * allocation and/or memcpy by the stack.
+ *
+ * This function may not be called in IRQ context. Calls to this function
+ * for a single hardware must be synchronized against each other. Calls to
+ * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be
+ * mixed for a single hardware. Must not run concurrently with
+ * ieee80211_tx_status() or ieee80211_tx_status_ni().
+ *
+ * This function must be called with BHs disabled and RCU read lock
+ *
+ * @hw: the hardware this frame came in on
+ * @sta: the station the frame was received from, or %NULL
+ * @skb: the buffer to receive, owned by mac80211 after this call
+ * @list: the destination list
+ */
+void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *sta,
+ struct sk_buff *skb, struct sk_buff_head *list);
+
/**
* ieee80211_rx_napi - receive frame from NAPI context
*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 28b154c6e72d..13aad8c3a16f 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -218,7 +218,7 @@ enum ieee80211_rx_flags {
};

struct ieee80211_rx_data {
- struct napi_struct *napi;
+ struct sk_buff_head *list;
struct sk_buff *skb;
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 5c5af4b5fc08..0026b58783bd 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2578,8 +2578,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
memset(skb->cb, 0, sizeof(skb->cb));

/* deliver to local stack */
- if (rx->napi)
- napi_gro_receive(rx->napi, skb);
+ if (rx->list)
+ __skb_queue_tail(rx->list, skb);
else
netif_receive_skb(skb);
}
@@ -3869,7 +3869,6 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
/* This is OK -- must be QoS data frame */
.security_idx = tid,
.seqno_idx = tid,
- .napi = NULL, /* must be NULL to not have races */
};
struct tid_ampdu_rx *tid_agg_rx;

@@ -4479,8 +4478,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
/* deliver to local stack */
skb->protocol = eth_type_trans(skb, fast_rx->dev);
memset(skb->cb, 0, sizeof(skb->cb));
- if (rx->napi)
- napi_gro_receive(rx->napi, skb);
+ if (rx->list)
+ __skb_queue_tail(rx->list, skb);
else
netif_receive_skb(skb);

@@ -4547,7 +4546,7 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
struct ieee80211_sta *pubsta,
struct sk_buff *skb,
- struct napi_struct *napi)
+ struct sk_buff_head *list)
{
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_sub_if_data *sdata;
@@ -4562,7 +4561,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
memset(&rx, 0, sizeof(rx));
rx.skb = skb;
rx.local = local;
- rx.napi = napi;
+ rx.list = list;

if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc))
I802_DEBUG_INC(local->dot11ReceivedFragmentCount);
@@ -4670,8 +4669,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
* This is the receive path handler. It is called by a low level driver when an
* 802.11 MPDU is received from the hardware.
*/
-void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
- struct sk_buff *skb, struct napi_struct *napi)
+void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
+ struct sk_buff *skb, struct sk_buff_head *list)
{
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_rate *rate = NULL;
@@ -4762,13 +4761,6 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,

status->rx_flags = 0;

- /*
- * key references and virtual interfaces are protected using RCU
- * and this requires that we are in a read-side RCU section during
- * receive processing
- */
- rcu_read_lock();
-
/*
* Frames with failed FCS/PLCP checksum are not returned,
* all other frames are returned without radiotap header
@@ -4776,23 +4768,43 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
* Also, frames with less than 16 bytes are dropped.
*/
skb = ieee80211_rx_monitor(local, skb, rate);
- if (!skb) {
- rcu_read_unlock();
+ if (!skb)
return;
- }

ieee80211_tpt_led_trig_rx(local,
((struct ieee80211_hdr *)skb->data)->frame_control,
skb->len);

- __ieee80211_rx_handle_packet(hw, pubsta, skb, napi);
-
- rcu_read_unlock();
+ __ieee80211_rx_handle_packet(hw, pubsta, skb, list);

return;
drop:
kfree_skb(skb);
}
+EXPORT_SYMBOL(ieee80211_rx_list);
+
+void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
+ struct sk_buff *skb, struct napi_struct *napi)
+{
+ struct sk_buff_head list;
+
+ __skb_queue_head_init(&list);
+
+ /*
+ * key references and virtual interfaces are protected using RCU
+ * and this requires that we are in a read-side RCU section during
+ * receive processing
+ */
+ rcu_read_lock();
+ ieee80211_rx_list(hw, pubsta, skb, &list);
+ rcu_read_unlock();
+
+ while ((skb = __skb_dequeue(&list)) != NULL)
+ if (napi)
+ napi_gro_receive(napi, skb);
+ else
+ netif_receive_skb(skb);
+}
EXPORT_SYMBOL(ieee80211_rx_napi);

/* This is a version of the rx handler that can be called from hard irq
--
2.24.0


2020-07-25 19:15:09

by Johannes Berg

[permalink] [raw]
Subject: Re: [RFC] mac80211: add a function for running rx without passing skbs to the stack


> +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
> + struct sk_buff *skb, struct napi_struct *napi)
> +{
> + struct sk_buff_head list;
> +
> + __skb_queue_head_init(&list);
> +
> + /*
> + * key references and virtual interfaces are protected using RCU
> + * and this requires that we are in a read-side RCU section during
> + * receive processing
> + */
> + rcu_read_lock();
> + ieee80211_rx_list(hw, pubsta, skb, &list);
> + rcu_read_unlock();
> +
> + while ((skb = __skb_dequeue(&list)) != NULL)

I'd drop the != NULL, but no strong feelings :)

> + if (napi)
> + napi_gro_receive(napi, skb);
> + else
> + netif_receive_skb(skb);

Nit: I'd prefer braces on the loop, just makes it nicer to read IMHO.

OTOH, the !napi case should use netif_receive_skb_list(), no?

Given the discussion, it also seems a bit odd to add more work for NAPI
poll where we process one by one ... But I see why you did that, and I
guess it's not actually that much more work.

johannes

2020-07-29 01:49:20

by Rajkumar Manoharan

[permalink] [raw]
Subject: Re: [RFC] mac80211: add a function for running rx without passing skbs to the stack

On 2020-07-25 12:14, Johannes Berg wrote:
>> +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta
>> *pubsta,
>> + struct sk_buff *skb, struct napi_struct *napi)
>> +{
>> + struct sk_buff_head list;
>> +
>> + __skb_queue_head_init(&list);
>> +
>> + /*
>> + * key references and virtual interfaces are protected using RCU
>> + * and this requires that we are in a read-side RCU section during
>> + * receive processing
>> + */
>> + rcu_read_lock();
>> + ieee80211_rx_list(hw, pubsta, skb, &list);
>> + rcu_read_unlock();
>> +
>> + while ((skb = __skb_dequeue(&list)) != NULL)
>
> I'd drop the != NULL, but no strong feelings :)
>
>> + if (napi)
>> + napi_gro_receive(napi, skb);
>> + else
>> + netif_receive_skb(skb);
>
> Nit: I'd prefer braces on the loop, just makes it nicer to read IMHO.
>
> OTOH, the !napi case should use netif_receive_skb_list(), no?
>
> Given the discussion, it also seems a bit odd to add more work for NAPI
> poll where we process one by one ... But I see why you did that, and I
> guess it's not actually that much more work.
>
IIUC Sebastian mentioned that threadable NAPI approach is helping in
load balancing
with minor check for dummy netdev for wireless drivers. Does this change
improve
the latency in both threaded and non-threaded modes?

-Rajkumar

2020-07-29 01:57:17

by Rajkumar Manoharan

[permalink] [raw]
Subject: Re: [RFC] mac80211: add a function for running rx without passing skbs to the stack

On 2020-07-28 18:47, Rajkumar Manoharan wrote:
> On 2020-07-25 12:14, Johannes Berg wrote:
>>> +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta
>>> *pubsta,
>>> + struct sk_buff *skb, struct napi_struct *napi)
>>> +{
>>> + struct sk_buff_head list;
>>> +
>>> + __skb_queue_head_init(&list);
>>> +
>>> + /*
>>> + * key references and virtual interfaces are protected using RCU
>>> + * and this requires that we are in a read-side RCU section during
>>> + * receive processing
>>> + */
>>> + rcu_read_lock();
>>> + ieee80211_rx_list(hw, pubsta, skb, &list);
>>> + rcu_read_unlock();
>>> +
>>> + while ((skb = __skb_dequeue(&list)) != NULL)
>>
>> I'd drop the != NULL, but no strong feelings :)
>>
>>> + if (napi)
>>> + napi_gro_receive(napi, skb);
>>> + else
>>> + netif_receive_skb(skb);
>>
>> Nit: I'd prefer braces on the loop, just makes it nicer to read IMHO.
>>
>> OTOH, the !napi case should use netif_receive_skb_list(), no?
>>
>> Given the discussion, it also seems a bit odd to add more work for
>> NAPI
>> poll where we process one by one ... But I see why you did that, and I
>> guess it's not actually that much more work.
>>
> IIUC Sebastian mentioned that threadable NAPI approach is helping in
> load balancing
> with minor check for dummy netdev for wireless drivers. Does this
> change improve
> the latency in both threaded and non-threaded modes?
>
Also I was thinking of introducing threaded irq in athX driver. I was
concerned about
more threaded levels and intermediate lists might increase sojourn which
will impact latency. no?

-Rajkumar

2020-07-30 14:03:11

by Sebastian Gottschall

[permalink] [raw]
Subject: Re: [RFC] mac80211: add a function for running rx without passing skbs to the stack


Am 29.07.2020 um 03:47 schrieb Rajkumar Manoharan:
> On 2020-07-25 12:14, Johannes Berg wrote:
>>> +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct
>>> ieee80211_sta *pubsta,
>>> +               struct sk_buff *skb, struct napi_struct *napi)
>>> +{
>>> +    struct sk_buff_head list;
>>> +
>>> +    __skb_queue_head_init(&list);
>>> +
>>> +    /*
>>> +     * key references and virtual interfaces are protected using RCU
>>> +     * and this requires that we are in a read-side RCU section during
>>> +     * receive processing
>>> +     */
>>> +    rcu_read_lock();
>>> +    ieee80211_rx_list(hw, pubsta, skb, &list);
>>> +    rcu_read_unlock();
>>> +
>>> +    while ((skb = __skb_dequeue(&list)) != NULL)
>>
>> I'd drop the != NULL, but no strong feelings :)
>>
>>> +        if (napi)
>>> +            napi_gro_receive(napi, skb);
>>> +        else
>>> +            netif_receive_skb(skb);
>>
>> Nit: I'd prefer braces on the loop, just makes it nicer to read IMHO.
>>
>> OTOH, the !napi case should use netif_receive_skb_list(), no?
>>
>> Given the discussion, it also seems a bit odd to add more work for NAPI
>> poll where we process one by one ... But I see why you did that, and I
>> guess it's not actually that much more work.
>>
> IIUC Sebastian mentioned that threadable NAPI approach is helping in
> load balancing
> with minor check for dummy netdev for wireless drivers. Does this
> change improve
> the latency in both threaded and non-threaded modes?
check also for felix's last variant. he made a different patch based on
workq's
i'm testing this variant now on various platforms including a patched
ath10k driver
next step would be just to change other wifi drivers for the same
concept. just a few are using napi at all (mt76, ath10k and iwlwifi)
>
> -Rajkumar
>