Because of the constant size and guaranteed 16-bit alignment, the inline
compare_ether_addr function is much cheaper than calling memcmp.
Signed-off-by: Felix Fietkau <[email protected]>
---
net/wireless/mlme.c | 32 +++++++++++++++++---------------
net/wireless/scan.c | 2 +-
2 files changed, 18 insertions(+), 16 deletions(-)
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index f5a7ac3..e14fdcc 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/nl80211.h>
#include <linux/slab.h>
@@ -100,7 +101,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
ASSERT_WDEV_LOCK(wdev);
if (wdev->current_bss &&
- memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) {
+ compare_ether_addr(wdev->current_bss->pub.bssid, bssid) == 0) {
cfg80211_unhold_bss(wdev->current_bss);
cfg80211_put_bss(&wdev->current_bss->pub);
wdev->current_bss = NULL;
@@ -115,7 +116,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
- from_ap = memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0;
+ from_ap = compare_ether_addr(mgmt->sa, dev->dev_addr) != 0;
__cfg80211_disconnected(dev, NULL, 0, reason_code, from_ap);
} else if (wdev->sme_state == CFG80211_SME_CONNECTING) {
__cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, NULL, 0,
@@ -154,7 +155,7 @@ void __cfg80211_send_disassoc(struct net_device *dev,
return;
if (wdev->current_bss &&
- memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) {
+ compare_ether_addr(wdev->current_bss->pub.bssid, bssid) == 0) {
cfg80211_sme_disassoc(dev, wdev->current_bss);
cfg80211_unhold_bss(wdev->current_bss);
cfg80211_put_bss(&wdev->current_bss->pub);
@@ -165,7 +166,7 @@ void __cfg80211_send_disassoc(struct net_device *dev,
reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
- from_ap = memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0;
+ from_ap = compare_ether_addr(mgmt->sa, dev->dev_addr) != 0;
__cfg80211_disconnected(dev, NULL, 0, reason_code, from_ap);
}
EXPORT_SYMBOL(__cfg80211_send_disassoc);
@@ -285,7 +286,7 @@ int __cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
return -EINVAL;
if (wdev->current_bss &&
- memcmp(bssid, wdev->current_bss->pub.bssid, ETH_ALEN) == 0)
+ compare_ether_addr(bssid, wdev->current_bss->pub.bssid) == 0)
return -EALREADY;
memset(&req, 0, sizeof(req));
@@ -362,7 +363,7 @@ int __cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
memset(&req, 0, sizeof(req));
if (wdev->current_bss && prev_bssid &&
- memcmp(wdev->current_bss->pub.bssid, prev_bssid, ETH_ALEN) == 0) {
+ compare_ether_addr(wdev->current_bss->pub.bssid, prev_bssid) == 0) {
/*
* Trying to reassociate: Allow this to proceed and let the old
* association to be dropped when the new one is completed.
@@ -446,7 +447,8 @@ int __cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
if (local_state_change) {
if (wdev->current_bss &&
- memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) {
+ compare_ether_addr(wdev->current_bss->pub.bssid, bssid)
+ == 0) {
cfg80211_unhold_bss(wdev->current_bss);
cfg80211_put_bss(&wdev->current_bss->pub);
wdev->current_bss = NULL;
@@ -495,7 +497,7 @@ static int __cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
req.local_state_change = local_state_change;
req.ie = ie;
req.ie_len = ie_len;
- if (memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0)
+ if (compare_ether_addr(wdev->current_bss->pub.bssid, bssid) == 0)
req.bss = &wdev->current_bss->pub;
else
return -ENOTCONN;
@@ -758,8 +760,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
break;
}
- if (memcmp(wdev->current_bss->pub.bssid,
- mgmt->bssid, ETH_ALEN)) {
+ if (compare_ether_addr(wdev->current_bss->pub.bssid,
+ mgmt->bssid)) {
err = -ENOTCONN;
break;
}
@@ -772,8 +774,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
break;
/* for station, check that DA is the AP */
- if (memcmp(wdev->current_bss->pub.bssid,
- mgmt->da, ETH_ALEN)) {
+ if (compare_ether_addr(wdev->current_bss->pub.bssid,
+ mgmt->da)) {
err = -ENOTCONN;
break;
}
@@ -781,11 +783,11 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_P2P_GO:
case NL80211_IFTYPE_AP_VLAN:
- if (memcmp(mgmt->bssid, dev->dev_addr, ETH_ALEN))
+ if (compare_ether_addr(mgmt->bssid, dev->dev_addr))
err = -EINVAL;
break;
case NL80211_IFTYPE_MESH_POINT:
- if (memcmp(mgmt->sa, mgmt->bssid, ETH_ALEN)) {
+ if (compare_ether_addr(mgmt->sa, mgmt->bssid)) {
err = -EINVAL;
break;
}
@@ -804,7 +806,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
return err;
}
- if (memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0)
+ if (compare_ether_addr(mgmt->sa, dev->dev_addr) != 0)
return -EINVAL;
/* Transmit the Action frame as requested by user space */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 70faadf..fdbcfe6 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -378,7 +378,7 @@ static int cmp_bss_core(struct cfg80211_bss *a,
b->len_information_elements);
}
- return memcmp(a->bssid, b->bssid, ETH_ALEN);
+ return compare_ether_addr(a->bssid, b->bssid);
}
static int cmp_bss(struct cfg80211_bss *a,
--
1.7.3.2
Signed-off-by: Felix Fietkau <[email protected]>
---
net/mac80211/debugfs_netdev.c | 71 +++++++---------------------------------
1 files changed, 13 insertions(+), 58 deletions(-)
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index a32eeda..6ed0455 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -499,26 +499,23 @@ IEEE80211_IF_FILE(dot11MeshForwarding, u.mesh.mshcfg.dot11MeshForwarding, DEC);
IEEE80211_IF_FILE(rssi_threshold, u.mesh.mshcfg.rssi_threshold, DEC);
#endif
-
-#define DEBUGFS_ADD(name) \
- debugfs_create_file(#name, 0400, sdata->debugfs.dir, \
- sdata, &name##_ops);
-
#define DEBUGFS_ADD_MODE(name, mode) \
debugfs_create_file(#name, mode, sdata->debugfs.dir, \
sdata, &name##_ops);
-static void add_sta_files(struct ieee80211_sub_if_data *sdata)
+#define DEBUGFS_ADD(name) DEBUGFS_ADD_MODE(name, 0400)
+
+static void add_common_files(struct ieee80211_sub_if_data *sdata)
{
DEBUGFS_ADD(drop_unencrypted);
- DEBUGFS_ADD(flags);
- DEBUGFS_ADD(state);
- DEBUGFS_ADD(channel_type);
DEBUGFS_ADD(rc_rateidx_mask_2ghz);
DEBUGFS_ADD(rc_rateidx_mask_5ghz);
DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz);
DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz);
+}
+static void add_sta_files(struct ieee80211_sub_if_data *sdata)
+{
DEBUGFS_ADD(bssid);
DEBUGFS_ADD(aid);
DEBUGFS_ADD(last_beacon);
@@ -531,15 +528,6 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata)
static void add_ap_files(struct ieee80211_sub_if_data *sdata)
{
- DEBUGFS_ADD(drop_unencrypted);
- DEBUGFS_ADD(flags);
- DEBUGFS_ADD(state);
- DEBUGFS_ADD(channel_type);
- DEBUGFS_ADD(rc_rateidx_mask_2ghz);
- DEBUGFS_ADD(rc_rateidx_mask_5ghz);
- DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz);
- DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz);
-
DEBUGFS_ADD(num_sta_authorized);
DEBUGFS_ADD(num_sta_ps);
DEBUGFS_ADD(dtim_count);
@@ -549,48 +537,14 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata)
static void add_ibss_files(struct ieee80211_sub_if_data *sdata)
{
- DEBUGFS_ADD(channel_type);
- DEBUGFS_ADD(rc_rateidx_mask_2ghz);
- DEBUGFS_ADD(rc_rateidx_mask_5ghz);
- DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz);
- DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz);
-
DEBUGFS_ADD_MODE(tsf, 0600);
}
static void add_wds_files(struct ieee80211_sub_if_data *sdata)
{
- DEBUGFS_ADD(drop_unencrypted);
- DEBUGFS_ADD(flags);
- DEBUGFS_ADD(state);
- DEBUGFS_ADD(channel_type);
- DEBUGFS_ADD(rc_rateidx_mask_2ghz);
- DEBUGFS_ADD(rc_rateidx_mask_5ghz);
- DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz);
- DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz);
-
DEBUGFS_ADD(peer);
}
-static void add_vlan_files(struct ieee80211_sub_if_data *sdata)
-{
- DEBUGFS_ADD(drop_unencrypted);
- DEBUGFS_ADD(flags);
- DEBUGFS_ADD(state);
- DEBUGFS_ADD(channel_type);
- DEBUGFS_ADD(rc_rateidx_mask_2ghz);
- DEBUGFS_ADD(rc_rateidx_mask_5ghz);
- DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz);
- DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz);
-}
-
-static void add_monitor_files(struct ieee80211_sub_if_data *sdata)
-{
- DEBUGFS_ADD(flags);
- DEBUGFS_ADD(state);
- DEBUGFS_ADD(channel_type);
-}
-
#ifdef CONFIG_MAC80211_MESH
static void add_mesh_files(struct ieee80211_sub_if_data *sdata)
@@ -651,6 +605,13 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
if (!sdata->debugfs.dir)
return;
+ DEBUGFS_ADD(flags);
+ DEBUGFS_ADD(state);
+ DEBUGFS_ADD(channel_type);
+
+ if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ add_common_files(sdata);
+
switch (sdata->vif.type) {
case NL80211_IFTYPE_MESH_POINT:
#ifdef CONFIG_MAC80211_MESH
@@ -671,12 +632,6 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
case NL80211_IFTYPE_WDS:
add_wds_files(sdata);
break;
- case NL80211_IFTYPE_MONITOR:
- add_monitor_files(sdata);
- break;
- case NL80211_IFTYPE_AP_VLAN:
- add_vlan_files(sdata);
- break;
default:
break;
}
--
1.7.3.2
On Mon, Mar 19, 2012 at 11:52 AM, Felix Fietkau <[email protected]> wrote:
> On 2012-03-19 11:50 AM, Helmut Schaa wrote:
>> On Mon, Mar 19, 2012 at 11:36 AM, Felix Fietkau <[email protected]> wrote:
>>> On 2012-03-19 10:29 AM, Helmut Schaa wrote:
>>>> Hi,
>>>>
>>>> On Mon, Mar 19, 2012 at 9:39 AM, Johannes Berg
>>>> <[email protected]> wrote:
>>>>> On Sun, 2012-03-18 at 12:13 +0100, Felix Fietkau wrote:
>>>>>> On 2012-03-18 11:17 AM, Johannes Berg wrote:
>>>>>> > On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
>>>>>> >> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
>>>>>> >> timeout doesn't need to be so precise.
>>>>>> >>
>>>>>> >> Switch to a different strategy: Schedule the timer initially, store jiffies
>>>>>> >> of all last rx/tx activity which would previously modify the timer, and
>>>>>> >> let the timer re-arm itself after checking the last rx/tx timestamp.
>>>>>> >
>>>>>> > I don't like this. It's not the optimisation you think it is on other
>>>>>> > ("embedded") systems where firing a timer is more expensive.
>>>>>> >
>>>>>> > You're trading power consumption against CPU utilisation by causing the
>>>>>> > timer to wake up.
>>>>>> I considered that was well, but didn't think one wakeup every 5 seconds
>>>>>> or so would be significant. Would you take the patch if I change the
>>>>>> timer to be deferrable, so that it doesn't cause wakeups by itself?
>>>>>
>>>>> I'm not really convinced, for making them deferrable we should analyse
>>>>> the consequences of that more carefully, for example it seems possible
>>>>> that the system wakes up to send a packet, and then the first thing that
>>>>> happens is a few aggregation handshakes ... that wastes a lot of time
>>>>> and power.
>>>>
>>>> I like the idea of getting rid of the mod_timer overhead. Looking at the timer
>>>> code, if the timer value is unchanged mod_timer is not that expensive.
>>>>
>>>> So, instead of calling mod_timer for every successive frame with a slightly
>>>> different timeout we could just use round_jiffies to round the timeout to the
>>>> next full second. This would in most cases take the quick path through
>>>> mod_timer and only update the timer once every second.
>>>>
>>>> See code (untested, not even compile tested) below.
>>> I would still like to avoid the overhead of apply_slack(), which is
>>> called early by mod_timer(). It was visible in both CPU cycles and
>>> icache misses when I did some profiling under high tx load.
>>
>> Indeed, however, I don't know the timer code at all. Seems like the default
>> slack for a timer is 0.4%. Setting the slack to 0 with set_timer_slack
>> should allow a shorter path through apply_slack. Not sure if that's sufficient
>> already.
> Looking at the code, it appears that this would not be sufficient.
What about just using mod_timer_pinned? That doesn't apply any slack.
However, it is mainly intended for not moving the timer to a different CPU.
Helmut
On 2012-03-19 11:55 AM, Helmut Schaa wrote:
> On Mon, Mar 19, 2012 at 11:52 AM, Felix Fietkau <[email protected]> wrote:
>> On 2012-03-19 11:50 AM, Helmut Schaa wrote:
>>> On Mon, Mar 19, 2012 at 11:36 AM, Felix Fietkau <[email protected]> wrote:
>>>> On 2012-03-19 10:29 AM, Helmut Schaa wrote:
>>>>> Hi,
>>>>>
>>>>> On Mon, Mar 19, 2012 at 9:39 AM, Johannes Berg
>>>>> <[email protected]> wrote:
>>>>>> On Sun, 2012-03-18 at 12:13 +0100, Felix Fietkau wrote:
>>>>>>> On 2012-03-18 11:17 AM, Johannes Berg wrote:
>>>>>>> > On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
>>>>>>> >> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
>>>>>>> >> timeout doesn't need to be so precise.
>>>>>>> >>
>>>>>>> >> Switch to a different strategy: Schedule the timer initially, store jiffies
>>>>>>> >> of all last rx/tx activity which would previously modify the timer, and
>>>>>>> >> let the timer re-arm itself after checking the last rx/tx timestamp.
>>>>>>> >
>>>>>>> > I don't like this. It's not the optimisation you think it is on other
>>>>>>> > ("embedded") systems where firing a timer is more expensive.
>>>>>>> >
>>>>>>> > You're trading power consumption against CPU utilisation by causing the
>>>>>>> > timer to wake up.
>>>>>>> I considered that was well, but didn't think one wakeup every 5 seconds
>>>>>>> or so would be significant. Would you take the patch if I change the
>>>>>>> timer to be deferrable, so that it doesn't cause wakeups by itself?
>>>>>>
>>>>>> I'm not really convinced, for making them deferrable we should analyse
>>>>>> the consequences of that more carefully, for example it seems possible
>>>>>> that the system wakes up to send a packet, and then the first thing that
>>>>>> happens is a few aggregation handshakes ... that wastes a lot of time
>>>>>> and power.
>>>>>
>>>>> I like the idea of getting rid of the mod_timer overhead. Looking at the timer
>>>>> code, if the timer value is unchanged mod_timer is not that expensive.
>>>>>
>>>>> So, instead of calling mod_timer for every successive frame with a slightly
>>>>> different timeout we could just use round_jiffies to round the timeout to the
>>>>> next full second. This would in most cases take the quick path through
>>>>> mod_timer and only update the timer once every second.
>>>>>
>>>>> See code (untested, not even compile tested) below.
>>>> I would still like to avoid the overhead of apply_slack(), which is
>>>> called early by mod_timer(). It was visible in both CPU cycles and
>>>> icache misses when I did some profiling under high tx load.
>>>
>>> Indeed, however, I don't know the timer code at all. Seems like the default
>>> slack for a timer is 0.4%. Setting the slack to 0 with set_timer_slack
>>> should allow a shorter path through apply_slack. Not sure if that's sufficient
>>> already.
>> Looking at the code, it appears that this would not be sufficient.
>
> What about just using mod_timer_pinned, that doesn't apply any slack.
> However, this is mainly intended for not moving the timer to a different CPU.
That seems like API abuse to me. I looked at mod_timer again, and it
might actually take out the bulk of the code, but I'd still like to
avoid the cost of this thing completely. The icache on most of these
MIPS routers is so small and the memory bandwidth so limited, that it's
worth properly optimizing the hotpath.
- Felix
On Mon, 2012-03-19 at 11:01 +0100, Felix Fietkau wrote:
> > I'm not really convinced, for making them deferrable we should analyse
> > the consequences of that more carefully, for example it seems possible
> > that the system wakes up to send a packet, and then the first thing that
> > happens is a few aggregation handshakes ... that wastes a lot of time
> > and power.
> How is that any more expensive than triggering a wakeup before that time
> caused by the session timer expiry?
It might not be more expensive, but the timing would be odd? You'd tear
down the session just to set it up again?
> > Also, at least for TX aggregation, you don't even give them a timeout in
> > ath9k so that wouldn't really be an issue?
> minstrel_ht does give it a timeout. OpenWrt is not using the ath9k rate
> control module.
Good point. Still, though, I suspect that this should be made
configurable: where aggregation sessions don't consume hardware
resources (like in our case) and you set them up with the first packet,
it doesn't really make sense to time them out, etc.
johannes
On Mon, Mar 19, 2012 at 11:36 AM, Felix Fietkau <[email protected]> wrote:
> On 2012-03-19 10:29 AM, Helmut Schaa wrote:
>> Hi,
>>
>> On Mon, Mar 19, 2012 at 9:39 AM, Johannes Berg
>> <[email protected]> wrote:
>>> On Sun, 2012-03-18 at 12:13 +0100, Felix Fietkau wrote:
>>>> On 2012-03-18 11:17 AM, Johannes Berg wrote:
>>>> > On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
>>>> >> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
>>>> >> timeout doesn't need to be so precise.
>>>> >>
>>>> >> Switch to a different strategy: Schedule the timer initially, store jiffies
>>>> >> of all last rx/tx activity which would previously modify the timer, and
>>>> >> let the timer re-arm itself after checking the last rx/tx timestamp.
>>>> >
>>>> > I don't like this. It's not the optimisation you think it is on other
>>>> > ("embedded") systems where firing a timer is more expensive.
>>>> >
>>>> > You're trading power consumption against CPU utilisation by causing the
>>>> > timer to wake up.
>>>> I considered that was well, but didn't think one wakeup every 5 seconds
>>>> or so would be significant. Would you take the patch if I change the
>>>> timer to be deferrable, so that it doesn't cause wakeups by itself?
>>>
>>> I'm not really convinced, for making them deferrable we should analyse
>>> the consequences of that more carefully, for example it seems possible
>>> that the system wakes up to send a packet, and then the first thing that
>>> happens is a few aggregation handshakes ... that wastes a lot of time
>>> and power.
>>
>> I like the idea of getting rid of the mod_timer overhead. Looking at the timer
>> code, if the timer value is unchanged mod_timer is not that expensive.
>>
>> So, instead of calling mod_timer for every successive frame with a slightly
>> different timeout we could just use round_jiffies to round the timeout to the
>> next full second. This would in most cases take the quick path through
>> mod_timer and only update the timer once every second.
>>
>> See code (untested, not even compile tested) below.
> I would still like to avoid the overhead of apply_slack(), which is
> called early by mod_timer(). It was visible in both CPU cycles and
> icache misses when I did some profiling under high tx load.
Indeed, however, I don't know the timer code at all. Seems like the default
slack for a timer is 0.4%. Setting the slack to 0 with set_timer_slack
should allow a shorter path through apply_slack. Not sure if that's sufficient
already.
Helmut
On 2012-03-19 10:29 AM, Helmut Schaa wrote:
> Hi,
>
> On Mon, Mar 19, 2012 at 9:39 AM, Johannes Berg
> <[email protected]> wrote:
>> On Sun, 2012-03-18 at 12:13 +0100, Felix Fietkau wrote:
>>> On 2012-03-18 11:17 AM, Johannes Berg wrote:
>>> > On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
>>> >> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
>>> >> timeout doesn't need to be so precise.
>>> >>
>>> >> Switch to a different strategy: Schedule the timer initially, store jiffies
>>> >> of all last rx/tx activity which would previously modify the timer, and
>>> >> let the timer re-arm itself after checking the last rx/tx timestamp.
>>> >
>>> > I don't like this. It's not the optimisation you think it is on other
>>> > ("embedded") systems where firing a timer is more expensive.
>>> >
>>> > You're trading power consumption against CPU utilisation by causing the
>>> > timer to wake up.
>>> I considered that was well, but didn't think one wakeup every 5 seconds
>>> or so would be significant. Would you take the patch if I change the
>>> timer to be deferrable, so that it doesn't cause wakeups by itself?
>>
>> I'm not really convinced, for making them deferrable we should analyse
>> the consequences of that more carefully, for example it seems possible
>> that the system wakes up to send a packet, and then the first thing that
>> happens is a few aggregation handshakes ... that wastes a lot of time
>> and power.
>
> I like the idea of getting rid of the mod_timer overhead. Looking at the timer
> code, if the timer value is unchanged mod_timer is not that expensive.
>
> So, instead of calling mod_timer for every successive frame with a slightly
> different timeout we could just use round_jiffies to round the timeout to the
> next full second. This would in most cases take the quick path through
> mod_timer and only update the timer once every second.
>
> See code (untested, not even compile tested) below.
I would still like to avoid the overhead of apply_slack(), which is
called early by mod_timer(). It was visible in both CPU cycles and
icache misses when I did some profiling under high tx load.
- Felix
On 2012-03-19 11:50 AM, Helmut Schaa wrote:
> On Mon, Mar 19, 2012 at 11:36 AM, Felix Fietkau <[email protected]> wrote:
>> On 2012-03-19 10:29 AM, Helmut Schaa wrote:
>>> Hi,
>>>
>>> On Mon, Mar 19, 2012 at 9:39 AM, Johannes Berg
>>> <[email protected]> wrote:
>>>> On Sun, 2012-03-18 at 12:13 +0100, Felix Fietkau wrote:
>>>>> On 2012-03-18 11:17 AM, Johannes Berg wrote:
>>>>> > On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
>>>>> >> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
>>>>> >> timeout doesn't need to be so precise.
>>>>> >>
>>>>> >> Switch to a different strategy: Schedule the timer initially, store jiffies
>>>>> >> of all last rx/tx activity which would previously modify the timer, and
>>>>> >> let the timer re-arm itself after checking the last rx/tx timestamp.
>>>>> >
>>>>> > I don't like this. It's not the optimisation you think it is on other
>>>>> > ("embedded") systems where firing a timer is more expensive.
>>>>> >
>>>>> > You're trading power consumption against CPU utilisation by causing the
>>>>> > timer to wake up.
>>>>> I considered that was well, but didn't think one wakeup every 5 seconds
>>>>> or so would be significant. Would you take the patch if I change the
>>>>> timer to be deferrable, so that it doesn't cause wakeups by itself?
>>>>
>>>> I'm not really convinced, for making them deferrable we should analyse
>>>> the consequences of that more carefully, for example it seems possible
>>>> that the system wakes up to send a packet, and then the first thing that
>>>> happens is a few aggregation handshakes ... that wastes a lot of time
>>>> and power.
>>>
>>> I like the idea of getting rid of the mod_timer overhead. Looking at the timer
>>> code, if the timer value is unchanged mod_timer is not that expensive.
>>>
>>> So, instead of calling mod_timer for every successive frame with a slightly
>>> different timeout we could just use round_jiffies to round the timeout to the
>>> next full second. This would in most cases take the quick path through
>>> mod_timer and only update the timer once every second.
>>>
>>> See code (untested, not even compile tested) below.
>> I would still like to avoid the overhead of apply_slack(), which is
>> called early by mod_timer(). It was visible in both CPU cycles and
>> icache misses when I did some profiling under high tx load.
>
> Indeed, however, I don't know the timer code at all. Seems like the default
> slack for a timer is 0.4%. Setting the slack to 0 with set_timer_slack
> should allow a shorter path through apply_slack. Not sure if that's sufficient
> already.
Looking at the code, it appears that this would not be sufficient.
- Felix
On Sun, 2012-03-18 at 12:13 +0100, Felix Fietkau wrote:
> On 2012-03-18 11:17 AM, Johannes Berg wrote:
> > On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
> >> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
> >> timeout doesn't need to be so precise.
> >>
> >> Switch to a different strategy: Schedule the timer initially, store jiffies
> >> of all last rx/tx activity which would previously modify the timer, and
> >> let the timer re-arm itself after checking the last rx/tx timestamp.
> >
> > I don't like this. It's not the optimisation you think it is on other
> > ("embedded") systems where firing a timer is more expensive.
> >
> > You're trading power consumption against CPU utilisation by causing the
> > timer to wake up.
> I considered that was well, but didn't think one wakeup every 5 seconds
> or so would be significant. Would you take the patch if I change the
> timer to be deferrable, so that it doesn't cause wakeups by itself?
I'm not really convinced. For making them deferrable we should analyse
the consequences of that more carefully; for example, it seems possible
that the system wakes up to send a packet, and then the first thing that
happens is a few aggregation handshakes ... that wastes a lot of time
and power.
Also, at least for TX aggregation, you don't even give them a timeout in
ath9k so that wouldn't really be an issue?
johannes
On Mon, 2012-03-19 at 10:29 +0100, Helmut Schaa wrote:
> > I'm not really convinced, for making them deferrable we should analyse
> > the consequences of that more carefully, for example it seems possible
> > that the system wakes up to send a packet, and then the first thing that
> > happens is a few aggregation handshakes ... that wastes a lot of time
> > and power.
>
> I like the idea of getting rid of the mod_timer overhead. Looking at the timer
> code, if the timer value is unchanged mod_timer is not that expensive.
>
> So, instead of calling mod_timer for every successive frame with a slightly
> different timeout we could just use round_jiffies to round the timeout to the
> next full second. This would in most cases take the quick path through
> mod_timer and only update the timer once every second.
That seems like a good plan.
johannes
On 2012-03-19 11:05 AM, Johannes Berg wrote:
> On Mon, 2012-03-19 at 11:01 +0100, Felix Fietkau wrote:
>
>> > I'm not really convinced, for making them deferrable we should analyse
>> > the consequences of that more carefully, for example it seems possible
>> > that the system wakes up to send a packet, and then the first thing that
>> > happens is a few aggregation handshakes ... that wastes a lot of time
>> > and power.
>> How is that any more expensive than triggering a wakeup before that time
>> caused by the session timer expiry?
>
> It might not be more expensive, but the timing would be odd? You'd tear
> down the session just to set it up again?
I don't think it matters, since it's an extremely rare case anyway, and
without my change it would have to re-establish the aggregation session
anyway. It's much more likely for it to run into a wakeup from something
else on the system before that happens.
>> > Also, at least for TX aggregation, you don't even give them a timeout in
>> > ath9k so that wouldn't really be an issue?
>> minstrel_ht does give it a timeout. OpenWrt is not using the ath9k rate
>> control module.
>
> Good point. Still though I suspect that this should be made
> configurable, where aggregation sessions don't consume hardware
> resources (like in our case) and you set them up with the first packet
> it doesn't really make sense to time them out etc.?
Yes, makes sense.
- Felix
On 2012-03-18 11:17 AM, Johannes Berg wrote:
> On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
>> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
>> timeout doesn't need to be so precise.
>>
>> Switch to a different strategy: Schedule the timer initially, store jiffies
>> of all last rx/tx activity which would previously modify the timer, and
>> let the timer re-arm itself after checking the last rx/tx timestamp.
>
> I don't like this. It's not the optimisation you think it is on other
> ("embedded") systems where firing a timer is more expensive.
>
> You're trading power consumption against CPU utilisation by causing the
> timer to wake up.
I considered that as well, but didn't think one wakeup every 5 seconds
or so would be significant. Would you take the patch if I change the
timer to be deferrable, so that it doesn't cause wakeups by itself?
- Felix
On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
> timeout doesn't need to be so precise.
>
> Switch to a different strategy: Schedule the timer initially, store jiffies
> of all last rx/tx activity which would previously modify the timer, and
> let the timer re-arm itself after checking the last rx/tx timestamp.
I don't like this. It's not the optimisation you think it is on other
("embedded") systems where firing a timer is more expensive.
You're trading power consumption against CPU utilisation by causing the
timer to wake up.
johannes
Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
timeout doesn't need to be so precise.
Switch to a different strategy: Schedule the timer initially, store jiffies
of all last rx/tx activity which would previously modify the timer, and
let the timer re-arm itself after checking the last rx/tx timestamp.
This visibly reduces CPU load under high network load on small embedded
systems.
Signed-off-by: Felix Fietkau <[email protected]>
---
net/mac80211/agg-rx.c | 16 +++++++++++++++-
net/mac80211/agg-tx.c | 16 +++++++++++++++-
net/mac80211/ieee80211_i.h | 3 ++-
net/mac80211/rx.c | 3 +--
net/mac80211/sta_info.h | 4 ++++
net/mac80211/tx.c | 3 +--
6 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 1068f66..d92b345 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -141,6 +141,18 @@ static void sta_rx_agg_session_timer_expired(unsigned long data)
u8 *timer_to_id = ptid - *ptid;
struct sta_info *sta = container_of(timer_to_id, struct sta_info,
timer_to_tid[0]);
+ struct tid_ampdu_rx *tid_rx;
+ unsigned long timeout;
+
+ tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[*ptid]);
+ if (!tid_rx)
+ return;
+
+ timeout = tid_rx->last_rx + TU_TO_JIFFIES(tid_rx->timeout);
+ if (time_is_after_jiffies(timeout)) {
+ mod_timer(&tid_rx->session_timer, timeout);
+ return;
+ }
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
@@ -334,8 +346,10 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
/* activate it for RX */
rcu_assign_pointer(sta->ampdu_mlme.tid_rx[tid], tid_agg_rx);
- if (timeout)
+ if (timeout) {
mod_timer(&tid_agg_rx->session_timer, TU_TO_EXP_TIME(timeout));
+ tid_agg_rx->last_rx = jiffies;
+ }
end:
mutex_unlock(&sta->ampdu_mlme.mtx);
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 76be617..16632fa 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -417,6 +417,18 @@ static void sta_tx_agg_session_timer_expired(unsigned long data)
u8 *timer_to_id = ptid - *ptid;
struct sta_info *sta = container_of(timer_to_id, struct sta_info,
timer_to_tid[0]);
+ struct tid_ampdu_tx *tid_tx;
+ unsigned long timeout;
+
+ tid_tx = rcu_dereference_protected_tid_tx(sta, *ptid);
+ if (!tid_tx)
+ return;
+
+ timeout = tid_tx->last_tx + TU_TO_JIFFIES(tid_tx->timeout);
+ if (time_is_after_jiffies(timeout)) {
+ mod_timer(&tid_tx->session_timer, timeout);
+ return;
+ }
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "tx session timer expired on tid %d\n", (u16)*ptid);
@@ -884,9 +896,11 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
sta->ampdu_mlme.addba_req_num[tid] = 0;
- if (tid_tx->timeout)
+ if (tid_tx->timeout) {
mod_timer(&tid_tx->session_timer,
TU_TO_EXP_TIME(tid_tx->timeout));
+ tid_tx->last_tx = jiffies;
+ }
} else {
___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d9798a3..2785976 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -52,7 +52,8 @@ struct ieee80211_local;
* increased memory use (about 2 kB of RAM per entry). */
#define IEEE80211_FRAGMENT_MAX 4
-#define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024))
+#define TU_TO_JIFFIES(x) (usecs_to_jiffies((x) * 1024))
+#define TU_TO_EXP_TIME(x) (jiffies + TU_TO_JIFFIES(x))
#define IEEE80211_DEFAULT_UAPSD_QUEUES \
(IEEE80211_WMM_IE_STA_QOSINFO_AC_BK | \
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index bcfe8c7..8da3b36 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -793,8 +793,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx)
/* reset session timer */
if (tid_agg_rx->timeout)
- mod_timer(&tid_agg_rx->session_timer,
- TU_TO_EXP_TIME(tid_agg_rx->timeout));
+ tid_agg_rx->last_rx = jiffies;
/* if this mpdu is fragmented - terminate rx aggregation session */
sc = le16_to_cpu(hdr->seq_ctrl);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index ab05768..e21652b 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -101,6 +101,7 @@ enum ieee80211_sta_info_flags {
* @dialog_token: dialog token for aggregation session
* @timeout: session timeout value to be filled in ADDBA requests
* @state: session state (see above)
+ * @last_tx: jiffies of last tx activity
* @stop_initiator: initiator of a session stop
* @tx_stop: TX DelBA frame when stopping
* @buf_size: reorder buffer size at receiver
@@ -122,6 +123,7 @@ struct tid_ampdu_tx {
struct timer_list addba_resp_timer;
struct sk_buff_head pending;
unsigned long state;
+ unsigned long last_tx;
u16 timeout;
u8 dialog_token;
u8 stop_initiator;
@@ -139,6 +141,7 @@ struct tid_ampdu_tx {
* @reorder_time: jiffies when skb was added
* @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
* @reorder_timer: releases expired frames from the reorder buffer.
+ * @last_rx: jiffies of last rx activity
* @head_seq_num: head sequence number in reordering buffer.
* @stored_mpdu_num: number of MPDUs in reordering buffer
* @ssn: Starting Sequence Number expected to be aggregated.
@@ -163,6 +166,7 @@ struct tid_ampdu_rx {
unsigned long *reorder_time;
struct timer_list session_timer;
struct timer_list reorder_timer;
+ unsigned long last_rx;
u16 head_seq_num;
u16 stored_mpdu_num;
u16 ssn;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 782a601..5ad2c40 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1118,8 +1118,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
/* reset session timer */
if (reset_agg_timer && tid_tx->timeout)
- mod_timer(&tid_tx->session_timer,
- TU_TO_EXP_TIME(tid_tx->timeout));
+ tid_tx->last_tx = jiffies;
return queued;
}
--
1.7.3.2
On 2012-03-19 9:39 AM, Johannes Berg wrote:
> On Sun, 2012-03-18 at 12:13 +0100, Felix Fietkau wrote:
>> On 2012-03-18 11:17 AM, Johannes Berg wrote:
>> > On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
>> >> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
>> >> timeout doesn't need to be so precise.
>> >>
>> >> Switch to a different strategy: Schedule the timer initially, store jiffies
>> >> of all last rx/tx activity which would previously modify the timer, and
>> >> let the timer re-arm itself after checking the last rx/tx timestamp.
>> >
>> > I don't like this. It's not the optimisation you think it is on other
>> > ("embedded") systems where firing a timer is more expensive.
>> >
>> > You're trading power consumption against CPU utilisation by causing the
>> > timer to wake up.
>> I considered that as well, but didn't think one wakeup every 5 seconds
>> or so would be significant. Would you take the patch if I change the
>> timer to be deferrable, so that it doesn't cause wakeups by itself?
>
> I'm not really convinced, for making them deferrable we should analyse
> the consequences of that more carefully, for example it seems possible
> that the system wakes up to send a packet, and then the first thing that
> happens is a few aggregation handshakes ... that wastes a lot of time
> and power.
How is that any more expensive than triggering a wakeup before that time
caused by the session timer expiry?
> Also, at least for TX aggregation, you don't even give them a timeout in
> ath9k so that wouldn't really be an issue?
minstrel_ht does give it a timeout. OpenWrt is not using the ath9k rate
control module.
- Felix
Hi,
On Mon, Mar 19, 2012 at 9:39 AM, Johannes Berg
<[email protected]> wrote:
> On Sun, 2012-03-18 at 12:13 +0100, Felix Fietkau wrote:
>> On 2012-03-18 11:17 AM, Johannes Berg wrote:
>> > On Sun, 2012-03-18 at 00:00 +0100, Felix Fietkau wrote:
>> >> Calling mod_timer from the rx/tx hotpath is somewhat expensive, and the
>> >> timeout doesn't need to be so precise.
>> >>
>> >> Switch to a different strategy: Schedule the timer initially, store jiffies
>> >> of all last rx/tx activity which would previously modify the timer, and
>> >> let the timer re-arm itself after checking the last rx/tx timestamp.
>> >
>> > I don't like this. It's not the optimisation you think it is on other
>> > ("embedded") systems where firing a timer is more expensive.
>> >
>> > You're trading power consumption against CPU utilisation by causing the
>> > timer to wake up.
>> I considered that as well, but didn't think one wakeup every 5 seconds
>> or so would be significant. Would you take the patch if I change the
>> timer to be deferrable, so that it doesn't cause wakeups by itself?
>
> I'm not really convinced, for making them deferrable we should analyse
> the consequences of that more carefully, for example it seems possible
> that the system wakes up to send a packet, and then the first thing that
> happens is a few aggregation handshakes ... that wastes a lot of time
> and power.
I like the idea of getting rid of the mod_timer overhead. Looking at the timer
code, if the timer value is unchanged, mod_timer is not that expensive.
So, instead of calling mod_timer for every successive frame with a slightly
different timeout we could just use round_jiffies to round the timeout to the
next full second. This would in most cases take the quick path through
mod_timer and only update the timer once every second.
See code (untested, not even compile tested) below.
Helmut
---
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 44627c8..25c1621 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -53,6 +53,7 @@ struct ieee80211_local;
#define IEEE80211_FRAGMENT_MAX 4
#define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024))
+#define TU_TO_EXP_TIME_ROUNDED(x) round_jiffies(TU_TO_EXP_TIME(x))
#define IEEE80211_DEFAULT_UAPSD_QUEUES \
(IEEE80211_WMM_IE_STA_QOSINFO_AC_BK | \
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 491c96f..f1b111d 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -794,7 +794,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx)
/* reset session timer */
if (tid_agg_rx->timeout)
mod_timer(&tid_agg_rx->session_timer,
- TU_TO_EXP_TIME(tid_agg_rx->timeout));
+ TU_TO_EXP_TIME_ROUNDED(tid_agg_rx->timeout));
/* if this mpdu is fragmented - terminate rx aggregation session */
sc = le16_to_cpu(hdr->seq_ctrl);