From: Johannes Berg <[email protected]>
While we're connected, the AP shouldn't change the primary channel
in the HT information. We checked this, and dropped the connection
if it did change it.
Unfortunately, this is causing problems on some APs, e.g. on the
Netgear WRT610NL: the beacons seem to always contain a bad channel
and if we made a connection using a probe response (correct data)
we drop the connection immediately and can basically not connect
properly at all.
Work around this by ignoring the HT primary channel information in
beacons if we're already connected.
Also print out more verbose messages in the other situations to
help diagnose similar bugs quicker in the future.
Cc: [email protected] [3.10]
Signed-off-by: Johannes Berg <[email protected]>
---
net/mac80211/mlme.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 383a961..77e7796 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -211,8 +211,9 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *channel,
const struct ieee80211_ht_operation *ht_oper,
const struct ieee80211_vht_operation *vht_oper,
- struct cfg80211_chan_def *chandef, bool verbose)
+ struct cfg80211_chan_def *chandef, bool tracking)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct cfg80211_chan_def vht_chandef;
u32 ht_cfreq, ret;
@@ -231,7 +232,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan,
channel->band);
/* check that channel matches the right operating channel */
- if (channel->center_freq != ht_cfreq) {
+ if (!tracking && channel->center_freq != ht_cfreq) {
/*
* It's possible that some APs are confused here;
* Netgear WNDR3700 sometimes reports 4 higher than
@@ -239,11 +240,10 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
* since we look at probe response/beacon data here
* it should be OK.
*/
- if (verbose)
- sdata_info(sdata,
- "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n",
- channel->center_freq, ht_cfreq,
- ht_oper->primary_chan, channel->band);
+ sdata_info(sdata,
+ "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n",
+ channel->center_freq, ht_cfreq,
+ ht_oper->primary_chan, channel->band);
ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
goto out;
}
@@ -297,7 +297,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
channel->band);
break;
default:
- if (verbose)
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
sdata_info(sdata,
"AP VHT operation IE has invalid channel width (%d), disable VHT\n",
vht_oper->chan_width);
@@ -306,7 +306,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
}
if (!cfg80211_chandef_valid(&vht_chandef)) {
- if (verbose)
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
sdata_info(sdata,
"AP VHT information is invalid, disable VHT\n");
ret = IEEE80211_STA_DISABLE_VHT;
@@ -319,7 +319,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
}
if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) {
- if (verbose)
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
sdata_info(sdata,
"AP VHT information doesn't match HT, disable VHT\n");
ret = IEEE80211_STA_DISABLE_VHT;
@@ -346,7 +346,7 @@ out:
ret |= chandef_downgrade(chandef);
}
- if (chandef->width != vht_chandef.width && verbose)
+ if (chandef->width != vht_chandef.width && !tracking)
sdata_info(sdata,
"capabilities/regulatory prevented using AP HT/VHT configuration, downgraded\n");
@@ -386,7 +386,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
/* calculate new channel (type) based on HT/VHT operation IEs */
flags = ieee80211_determine_chantype(sdata, sband, chan, ht_oper,
- vht_oper, &chandef, false);
+ vht_oper, &chandef, true);
/*
* Downgrade the new channel if we associated with restricted
@@ -3838,7 +3838,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
cbss->channel,
ht_oper, vht_oper,
- &chandef, true);
+ &chandef, false);
sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss),
local->rx_chains);
--
1.8.0
* Chris Wright ([email protected]) wrote:
> while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
> IEEE80211_CHAN_DISABLED)) {
> if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) {
> ret = IEEE80211_STA_DISABLE_HT |
> IEEE80211_STA_DISABLE_VHT;
> goto out;
> }
Actually, it looks like the loop above is simply broken. The code flow is:
chandef->width = NL80211_CHAN_WIDTH_20;
...
if (!vht_oper || !sband->vht_cap.vht_supported) {
ret = IEEE80211_STA_DISABLE_VHT;
goto out;
}
...
out:
/* don't print the message below for VHT mismatch if VHT is disabled */
if (ret & IEEE80211_STA_DISABLE_VHT)
vht_chandef = *chandef;
while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
IEEE80211_CHAN_DISABLED)) {
if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) {
ret = IEEE80211_STA_DISABLE_HT |
IEEE80211_STA_DISABLE_VHT;
goto out;
}
ret |= chandef_downgrade(chandef);
}
We enter the while loop with width NL80211_CHAN_WIDTH_20
(i.e. ht_cap.ht_supported is true), do one downgrade to
NL80211_CHAN_WIDTH_20_NOHT, and then we are stuck in a permanent loop:
the WARN_ON fires and its "goto out" jumps back above the while loop.
I did not see any way that cfg80211_chandef_usable() will update chandef->width,
so once width is NL80211_CHAN_WIDTH_20_NOHT and ht_cap.ht_supported is
true, there is no end to the goto loop.
So here is a hack that at least gets wireless working (I believe only
because there's another AP that does not have 11n enabled).
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0e5aab1..b68ca05 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -336,7 +336,7 @@ out:
if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) {
ret = IEEE80211_STA_DISABLE_HT |
IEEE80211_STA_DISABLE_VHT;
- goto out;
+ break;
}
ret |= chandef_downgrade(chandef);
* Johannes Berg ([email protected]) wrote:
> While we're connected, the AP shouldn't change the primary channel
> in the HT information. We checked this, and dropped the connection
> if it did change it.
>
> Unfortunately, this is causing problems on some APs, e.g. on the
> Netgear WRT610NL: the beacons seem to always contain a bad channel
> and if we made a connection using a probe response (correct data)
> we drop the connection immediately and can basically not connect
> properly at all.
>
> Work around this by ignoring the HT primary channel information in
> beacons if we're already connected.
>
> Also print out more verbose messages in the other situations to
> help diagnose similar bugs quicker in the future.
I just gave this a try, and my laptop is still hanging with the same WARN_ON
being triggered:
...
while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
IEEE80211_CHAN_DISABLED)) {
if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) {
ret = IEEE80211_STA_DISABLE_HT |
IEEE80211_STA_DISABLE_VHT;
goto out;
}
...
Jul 31 05:45:10 x220 kernel: [ 40.047670] wlp3s0: authenticate with 2c:36:f8:fa:46:21
Jul 31 05:45:10 x220 kernel: [ 40.053635] wlp3s0: send auth to 2c:36:f8:fa:46:21 (try 1/3)
Jul 31 05:45:10 x220 kernel: [ 40.146302] wlp3s0: authenticated
Jul 31 05:45:10 x220 kernel: [ 40.146675] wlp3s0: waiting for beacon from 2c:36:f8:fa:46:21
Jul 31 05:45:11 x220 kernel: [ 40.250878] wlp3s0: associate with 2c:36:f8:fa:46:21 (try 1/3)
Jul 31 05:45:11 x220 kernel: [ 40.254241] wlp3s0: RX AssocResp from 2c:36:f8:fa:46:21 (capab=0x101 status=0 aid=3)
Jul 31 05:45:11 x220 kernel: [ 40.275168] wlp3s0: associated
Jul 31 05:45:11 x220 kernel: [ 40.275252] IPv6: ADDRCONF(NETDEV_CHANGE): wlp3s0: link becomes ready
Jul 31 05:45:11 x220 kernel: [ 40.275385] cfg80211: Calling CRDA for country: DE
Jul 31 05:45:11 x220 kernel: [ 40.296753] cfg80211: Regulatory domain changed to country: DE
Jul 31 05:45:11 x220 kernel: [ 40.296763] cfg80211: (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp)
Jul 31 05:45:11 x220 kernel: [ 40.296769] cfg80211: (2400000 KHz - 2483500 KHz @ 40000 KHz), (N/A, 2000 mBm)
Jul 31 05:45:11 x220 kernel: [ 40.296774] cfg80211: (5150000 KHz - 5250000 KHz @ 40000 KHz), (N/A, 2000 mBm)
Jul 31 05:45:11 x220 kernel: [ 40.296778] cfg80211: (5250000 KHz - 5350000 KHz @ 40000 KHz), (N/A, 2000 mBm)
Jul 31 05:45:11 x220 kernel: [ 40.296782] cfg80211: (5470000 KHz - 5725000 KHz @ 40000 KHz), (N/A, 2698 mBm)
Jul 31 05:45:11 x220 kernel: [ 40.296787] cfg80211: (57240000 KHz - 65880000 KHz @ 2160000 KHz), (N/A, 4000 mBm)
Jul 31 05:45:11 x220 NetworkManager[574]: <info> (wlp3s0): supplicant interface state: scanning -> authenticating
Jul 31 05:45:11 x220 NetworkManager[574]: <info> (wlp3s0): supplicant interface state: authenticating -> associating
Jul 31 05:45:11 x220 NetworkManager[574]: <info> (wlp3s0): supplicant interface state: associating -> completed
Jul 31 05:45:11 x220 NetworkManager[574]: <info> Activation (wlp3s0/wireless) Stage 2 of 5 (Device Configure) successful. Connected to wireless network 'ietf'.
Jul 31 05:45:11 x220 NetworkManager[574]: <info> Activation (wlp3s0) Stage 3 of 5 (IP Configure Start) scheduled.
Jul 31 05:45:11 x220 NetworkManager[574]: <info> Activation (wlp3s0) Stage 3 of 5 (IP Configure Start) started...
Jul 31 05:45:11 x220 NetworkManager[574]: <info> (wlp3s0): device state change: config -> ip-config (reason 'none') [50 70 0]
Jul 31 05:45:11 x220 NetworkManager[574]: <info> Activation (wlp3s0) Beginning DHCPv4 transaction (timeout in 45 seconds)
Jul 31 05:45:11 x220 NetworkManager[574]: <info> dhclient started with pid 1786
Jul 31 05:45:11 x220 NetworkManager[574]: <info> Activation (wlp3s0) Beginning IP6 addrconf.
Jul 31 05:45:11 x220 NetworkManager[574]: <info> Activation (wlp3s0) Stage 3 of 5 (IP Configure Start) complete.
Jul 31 05:45:11 x220 kernel: [ 40.354582] ------------[ cut here ]------------
Jul 31 05:45:11 x220 kernel: [ 40.354628] WARNING: CPU: 2 PID: 85 at net/mac80211/mlme.c:338 ieee80211_determine_chantype+0x26f/0x380 [mac80211]()
Jul 31 05:45:11 x220 kernel: [ 40.354631] Modules linked in: ebtable_nat ebtables ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat xt_CHECKSUM iptable_mangle tun bridge openvswitch stp llc bnep bluetooth arc4 iwldvm ip6t_REJECT nf_conntrack_ipv6 nf_conntrack_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 ip6table_filter ip6_tables xt_conntrack nf_conntrack mac80211 iwlwifi snd_hda_codec_hdmi uvcvideo snd_hda_codec_conexant cfg80211 snd_hda_intel videobuf2_vmalloc snd_hda_codec videobuf2_memops videobuf2_core snd_hwdep iTCO_wdt videodev iTCO_vendor_support snd_seq media snd_seq_device sdhci_pci sdhci snd_pcm e1000e i2c_i801 joydev mmc_core lpc_ich mfd_core thinkpad_acpi tpm_tis wmi tpm rfkill snd_page_alloc tpm_bios snd_timer snd ptp soundcore pps_core mperf uinput binfmt_misc dm_crypt i915 i2c_algo_bit drm_kms_helper drm i2c_core video
Jul 31 05:45:11 x220 kernel: [ 40.354715] CPU: 2 PID: 85 Comm: kworker/u16:4 Not tainted 3.11.0-rc3+ #7
Jul 31 05:45:11 x220 kernel: [ 40.354720] Hardware name: LENOVO 4291CL9/4291CL9, BIOS 8DET56WW (1.26 ) 12/01/2011
Jul 31 05:45:11 x220 kernel: [ 40.354744] Workqueue: phy0 ieee80211_iface_work [mac80211]
Jul 31 05:45:11 x220 kernel: [ 40.354748] 0000000000000009 ffff880403cff9d8 ffffffff8160bd12 0000000000000000
Jul 31 05:45:11 x220 kernel: [ 40.354754] ffff880403cffa10 ffffffff8105393d ffff880403cffae0 0000000000000810
Jul 31 05:45:11 x220 kernel: [ 40.354759] ffff8803f6638800 0000000000000001 ffff8803f3e37070 ffff880403cffa20
Jul 31 05:45:11 x220 kernel: [ 40.354765] Call Trace:
Jul 31 05:45:11 x220 kernel: [ 40.354776] [<ffffffff8160bd12>] dump_stack+0x45/0x56
Jul 31 05:45:11 x220 kernel: [ 40.354784] [<ffffffff8105393d>] warn_slowpath_common+0x7d/0xa0
Jul 31 05:45:11 x220 kernel: [ 40.354791] [<ffffffff81053a1a>] warn_slowpath_null+0x1a/0x20
Jul 31 05:45:11 x220 kernel: [ 40.354819] [<ffffffffa040be2f>] ieee80211_determine_chantype+0x26f/0x380 [mac80211]
Jul 31 05:45:11 x220 kernel: [ 40.354852] [<ffffffffa040f209>] ieee80211_rx_mgmt_beacon+0x589/0x10f0 [mac80211]
Jul 31 05:45:11 x220 kernel: [ 40.354885] [<ffffffffa04111d3>] ieee80211_sta_rx_queued_mgmt+0x2b3/0x530 [mac80211]
Jul 31 05:45:11 x220 kernel: [ 40.354931] [<ffffffff81092d74>] ? load_balance+0x144/0x790
Jul 31 05:45:11 x220 kernel: [ 40.354962] [<ffffffffa03e31cc>] ieee80211_iface_work+0x25c/0x360 [mac80211]
Jul 31 05:45:11 x220 kernel: [ 40.354971] [<ffffffff8107131f>] process_one_work+0x16f/0x420
Jul 31 05:45:11 x220 kernel: [ 40.354979] [<ffffffff81071f3b>] worker_thread+0x11b/0x390
Jul 31 05:45:11 x220 kernel: [ 40.354987] [<ffffffff81071e20>] ? manage_workers.isra.25+0x2a0/0x2a0
Jul 31 05:45:11 x220 kernel: [ 40.354994] [<ffffffff810784a0>] kthread+0xc0/0xd0
Jul 31 05:45:11 x220 kernel: [ 40.355002] [<ffffffff810783e0>] ? insert_kthread_work+0x40/0x40
Jul 31 05:45:11 x220 kernel: [ 40.355010] [<ffffffff8161af1c>] ret_from_fork+0x7c/0xb0
Jul 31 05:45:11 x220 kernel: [ 40.355017] [<ffffffff810783e0>] ? insert_kthread_work+0x40/0x40
Jul 31 05:45:11 x220 kernel: [ 40.355022] ---[ end trace a4924d352b62426c ]---
And this continues until I hit the power switch. Is there any specific
debugging that would be helpful?
thanks,
-chris
Commit "3d9646d mac80211: fix channel selection bug" introduced a possible
infinite loop by moving the "out" label above the chandef_downgrade
while loop. Once we have downgraded to NL80211_CHAN_WIDTH_20_NOHT, we jump
back up and re-run the while loop indefinitely. Replace the goto with a
break and carry on. This may not be sufficient to connect to the AP,
but it will at least keep the CPU from livelocking. Thanks to Derek Atkins
for being an extra pair of debugging eyes.
Cc: [email protected]
Signed-off-by: Chris Wright <[email protected]>
---
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0e5aab1..b68ca05 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -336,7 +336,7 @@ out:
if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) {
ret = IEEE80211_STA_DISABLE_HT |
IEEE80211_STA_DISABLE_VHT;
- goto out;
+ break;
}
ret |= chandef_downgrade(chandef);
On Wed, 2013-07-31 at 12:12 -0700, Chris Wright wrote:
> Commit "3d9646d mac80211: fix channel selection bug" introduced a possible
> infinite loop by moving the out target above the chandef_downgrade
> while loop. When we downgrade to NL80211_CHAN_WIDTH_20_NOHT, we jump
> back up to re-run the while loop...indefinitely. Replace goto with
> break and carry on. This may not be sufficient to connect to the AP,
> but will at least keep the cpu from livelocking. Thanks to Derek Atkins
> as an extra pair of debugging eyes.
Applied, thanks.
johannes
On Wed, Jul 31, 2013 at 11:50:34AM +0200, Johannes Berg wrote:
> From: Johannes Berg <[email protected]>
>
> While we're connected, the AP shouldn't change the primary channel
> in the HT information. We checked this, and dropped the connection
> if it did change it.
>
> Unfortunately, this is causing problems on some APs, e.g. on the
> Netgear WRT610NL: the beacons seem to always contain a bad channel
> and if we made a connection using a probe response (correct data)
> we drop the connection immediately and can basically not connect
> properly at all.
>
> Work around this by ignoring the HT primary channel information in
> beacons if we're already connected.
>
> Also print out more verbose messages in the other situations to
> help diagnose similar bugs quicker in the future.
>
> Cc: [email protected] [3.10]
Acked-by: Andy Isaacson <[email protected]>
LGTM, though I wasn't able to test it. I recommend this for -stable:
regressing wifi connectivity makes it hard to report bugs.
-andy
On Wed, 2013-07-31 at 11:50 +0200, Johannes Berg wrote:
> From: Johannes Berg <[email protected]>
>
> While we're connected, the AP shouldn't change the primary channel
> in the HT information. We checked this, and dropped the connection
> if it did change it.
>
> Unfortunately, this is causing problems on some APs, e.g. on the
> Netgear WRT610NL: the beacons seem to always contain a bad channel
> and if we made a connection using a probe response (correct data)
> we drop the connection immediately and can basically not connect
> properly at all.
>
> Work around this by ignoring the HT primary channel information in
> beacons if we're already connected.
>
> Also print out more verbose messages in the other situations to
> help diagnose similar bugs quicker in the future.
Applied.
johannes