Fix the following 'scheduling while atomic' bug in mt76x02_reset_state,
which occurs because synchronize_rcu() is called inside an RCU read-side critical section:
[44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
[44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
[44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
[44036.944287] INFO: lockdep is turned off.
[44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
[44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
[44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
[44036.944308] Call Trace:
[44036.944317] dump_stack+0x67/0x90
[44036.944322] ___might_sleep.cold.88+0x9f/0xaf
[44036.944327] rcu_blocking_is_gp+0x13/0x50
[44036.944330] synchronize_rcu+0x17/0x80
[44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
[44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
[44036.944355] process_one_work+0x2a5/0x620
[44036.944361] worker_thread+0x35/0x3e0
[44036.944368] kthread+0x11c/0x140
[44036.944376] ret_from_fork+0x3a/0x50
[44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
[44036.944387] INFO: lockdep is turned off.
[44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
Introduce __mt76_sta_remove, which performs station removal without taking dev->mutex itself (the caller must already hold it).
Call __mt76_sta_remove outside of the RCU read-side section in mt76x02_reset_state.
Fixes: e4ebb8b403d1 ("mt76: mt76x2: implement full device restart on watchdog reset")
Signed-off-by: Lorenzo Bianconi <[email protected]>
---
Changes since v1:
- fix checkpatch warnings
- add lockdep_assert_held in mt76x02_reset_state
- access wcid pointer through rcu_dereference_protected()
---
drivers/net/wireless/mediatek/mt76/mac80211.c | 18 +++++++++++-------
drivers/net/wireless/mediatek/mt76/mt76.h | 2 ++
.../net/wireless/mediatek/mt76/mt76x02_mmio.c | 19 ++++++++++---------
3 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index a033745adb2f..316167404729 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -679,19 +679,15 @@ mt76_sta_add(struct mt76_dev *dev, struct ieee80211_vif *vif,
return ret;
}
-static void
-mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif,
- struct ieee80211_sta *sta)
+void __mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta)
{
struct mt76_wcid *wcid = (struct mt76_wcid *)sta->drv_priv;
- int idx = wcid->idx;
- int i;
+ int i, idx = wcid->idx;
rcu_assign_pointer(dev->wcid[idx], NULL);
synchronize_rcu();
- mutex_lock(&dev->mutex);
-
if (dev->drv->sta_remove)
dev->drv->sta_remove(dev, vif, sta);
@@ -699,7 +695,15 @@ mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif,
for (i = 0; i < ARRAY_SIZE(sta->txq); i++)
mt76_txq_remove(dev, sta->txq[i]);
mt76_wcid_free(dev->wcid_mask, idx);
+}
+EXPORT_SYMBOL_GPL(__mt76_sta_remove);
+static void
+mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta)
+{
+ mutex_lock(&dev->mutex);
+ __mt76_sta_remove(dev, vif, sta);
mutex_unlock(&dev->mutex);
}
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 477027bb9aaf..eb72e4bf3db6 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -697,6 +697,8 @@ int mt76_sta_state(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
struct ieee80211_sta *sta,
enum ieee80211_sta_state old_state,
enum ieee80211_sta_state new_state);
+void __mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta);
struct ieee80211_sta *mt76_rx_convert(struct sk_buff *skb);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c
index 6a34a6afcfe4..531779d8856e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c
@@ -423,19 +423,23 @@ static void mt76x02_reset_state(struct mt76x02_dev *dev)
{
int i;
+ lockdep_assert_held(&dev->mt76.mutex);
+
clear_bit(MT76_STATE_RUNNING, &dev->mt76.state);
rcu_read_lock();
-
ieee80211_iter_keys_rcu(dev->mt76.hw, NULL, mt76x02_key_sync, NULL);
+ rcu_read_unlock();
for (i = 0; i < ARRAY_SIZE(dev->mt76.wcid); i++) {
- struct mt76_wcid *wcid = rcu_dereference(dev->mt76.wcid[i]);
- struct mt76x02_sta *msta;
struct ieee80211_sta *sta;
struct ieee80211_vif *vif;
+ struct mt76x02_sta *msta;
+ struct mt76_wcid *wcid;
void *priv;
+ wcid = rcu_dereference_protected(dev->mt76.wcid[i],
+ lockdep_is_held(&dev->mt76.mutex));
if (!wcid)
continue;
@@ -445,13 +449,10 @@ static void mt76x02_reset_state(struct mt76x02_dev *dev)
priv = msta->vif;
vif = container_of(priv, struct ieee80211_vif, drv_priv);
- mt76_sta_state(dev->mt76.hw, vif, sta,
- IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST);
+ __mt76_sta_remove(&dev->mt76, vif, sta);
memset(msta, 0, sizeof(*msta));
}
- rcu_read_unlock();
-
dev->vif_mask = 0;
dev->beacon_mask = 0;
}
@@ -471,11 +472,11 @@ static void mt76x02_watchdog_reset(struct mt76x02_dev *dev)
for (i = 0; i < ARRAY_SIZE(dev->mt76.napi); i++)
napi_disable(&dev->mt76.napi[i]);
+ mutex_lock(&dev->mt76.mutex);
+
if (restart)
mt76x02_reset_state(dev);
- mutex_lock(&dev->mt76.mutex);
-
if (dev->beacon_mask)
mt76_clear(dev, MT_BEACON_TIME_CFG,
MT_BEACON_TIME_CFG_BEACON_TX |
--
2.20.1
On Mon, Mar 11, 2019 at 02:24:35PM +0100, Lorenzo Bianconi wrote:
> Fix following schedule while atomic in mt76x02_reset_state
> since synchronize_rcu is run inside a RCU section
>
> [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
> [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
> [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
> [44036.944287] INFO: lockdep is turned off.
> [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
> [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
> [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
> [44036.944308] Call Trace:
> [44036.944317] dump_stack+0x67/0x90
> [44036.944322] ___might_sleep.cold.88+0x9f/0xaf
> [44036.944327] rcu_blocking_is_gp+0x13/0x50
> [44036.944330] synchronize_rcu+0x17/0x80
> [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
> [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
> [44036.944355] process_one_work+0x2a5/0x620
> [44036.944361] worker_thread+0x35/0x3e0
> [44036.944368] kthread+0x11c/0x140
> [44036.944376] ret_from_fork+0x3a/0x50
> [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
> [44036.944387] INFO: lockdep is turned off.
> [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
Does the patch fix the issue for you? For me, on my MT7612E device, it
makes the BUG warning go away, but instead I get a total system hang
without any error message except the information about the hw restart.
[ 174.425507] mt76x2e 0000:04:00.0: mac specific condition occurred
[ 176.590750] mt76x2e 0000:04:00.0: MCU message 31 (seq 13) timed out
[ 176.861345] mt76x2e 0000:04:00.0: Firmware Version: 0.0.00
[ 176.867214] mt76x2e 0000:04:00.0: Build: 1
[ 176.876563] mt76x2e 0000:04:00.0: Build Time: 201507311614____
[ 176.908095] mt76x2e 0000:04:00.0: Firmware running!
[ 176.920030] ieee80211 phy0: Hardware restart was requested
... hung at this point.
This is with this fix and Felix's
[PATCH] mac80211: do not call driver wake_tx_queue op during reconfig
on latest nbd/wireless tree.
Stanislaw
>
> On Mon, Mar 11, 2019 at 02:24:35PM +0100, Lorenzo Bianconi wrote:
> > Fix following schedule while atomic in mt76x02_reset_state
> > since synchronize_rcu is run inside a RCU section
> >
> > [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
> > [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
> > [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
> > [44036.944287] INFO: lockdep is turned off.
> > [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
> > [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
> > [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
> > [44036.944308] Call Trace:
> > [44036.944317] dump_stack+0x67/0x90
> > [44036.944322] ___might_sleep.cold.88+0x9f/0xaf
> > [44036.944327] rcu_blocking_is_gp+0x13/0x50
> > [44036.944330] synchronize_rcu+0x17/0x80
> > [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
> > [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
> > [44036.944355] process_one_work+0x2a5/0x620
> > [44036.944361] worker_thread+0x35/0x3e0
> > [44036.944368] kthread+0x11c/0x140
> > [44036.944376] ret_from_fork+0x3a/0x50
> > [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
> > [44036.944387] INFO: lockdep is turned off.
> > [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
>
> Does the patch fix the issue for you ? For me on my MT7612E device it
> make the BUG warning gone, but instead of that I have total system hung
> without any error message except information about hw restart.
>
Hi Stanislaw,
this patch just fixes the 'schedule while atomic' issue.
> [ 174.425507] mt76x2e 0000:04:00.0: mac specific condition occurred
> [ 176.590750] mt76x2e 0000:04:00.0: MCU message 31 (seq 13) timed out
> [ 176.861345] mt76x2e 0000:04:00.0: Firmware Version: 0.0.00
> [ 176.867214] mt76x2e 0000:04:00.0: Build: 1
> [ 176.876563] mt76x2e 0000:04:00.0: Build Time: 201507311614____
> [ 176.908095] mt76x2e 0000:04:00.0: Firmware running!
> [ 176.920030] ieee80211 phy0: Hardware restart was requested
>
> ... hung at this point.
>
> This is with this fix and Felix's
> [PATCH] mac80211: do not call driver wake_tx_queue op during reconfig
> on latest nbd/wireless tree.
>
> Stanislaw
>
Are you using the U7612E-H1? I am still having issues with this card, but I have not had time
to look at it yet.
Regards,
Lorenzo
--
UNIX is Sexy: who | grep -i blonde | talk; cd ~; wine; talk; touch;
unzip; touch; strip; gasp; finger; gasp; mount; fsck; more; yes; gasp;
umount; make clean; sleep
On Tue, Mar 12, 2019 at 05:48:14PM +0100, Lorenzo Bianconi wrote:
> >
> > On Mon, Mar 11, 2019 at 02:24:35PM +0100, Lorenzo Bianconi wrote:
> > > Fix following schedule while atomic in mt76x02_reset_state
> > > since synchronize_rcu is run inside a RCU section
> > >
> > > [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
> > > [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
> > > [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
> > > [44036.944287] INFO: lockdep is turned off.
> > > [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
> > > [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
> > > [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
> > > [44036.944308] Call Trace:
> > > [44036.944317] dump_stack+0x67/0x90
> > > [44036.944322] ___might_sleep.cold.88+0x9f/0xaf
> > > [44036.944327] rcu_blocking_is_gp+0x13/0x50
> > > [44036.944330] synchronize_rcu+0x17/0x80
> > > [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
> > > [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
> > > [44036.944355] process_one_work+0x2a5/0x620
> > > [44036.944361] worker_thread+0x35/0x3e0
> > > [44036.944368] kthread+0x11c/0x140
> > > [44036.944376] ret_from_fork+0x3a/0x50
> > > [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
> > > [44036.944387] INFO: lockdep is turned off.
> > > [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
> >
> > Does the patch fix the issue for you ? For me on my MT7612E device it
> > make the BUG warning gone, but instead of that I have total system hung
> > without any error message except information about hw restart.
> >
> Hi Stanislaw,
>
> this patch just fixes the 'schedule while atomic' issue.
Well, if it exchanges the 'schedule while atomic' warning for a system hang, it's
not a good fix. Again, does the fix make restart work for you? Have you
tested it?
> > [ 174.425507] mt76x2e 0000:04:00.0: mac specific condition occurred
> > [ 176.590750] mt76x2e 0000:04:00.0: MCU message 31 (seq 13) timed out
> > [ 176.861345] mt76x2e 0000:04:00.0: Firmware Version: 0.0.00
> > [ 176.867214] mt76x2e 0000:04:00.0: Build: 1
> > [ 176.876563] mt76x2e 0000:04:00.0: Build Time: 201507311614____
> > [ 176.908095] mt76x2e 0000:04:00.0: Firmware running!
> > [ 176.920030] ieee80211 phy0: Hardware restart was requested
> >
> > ... hung at this point.
> >
> > This is with this fix and Felix's
> > [PATCH] mac80211: do not call driver wake_tx_queue op during reconfig
> > on latest nbd/wireless tree.
> >
> > Stanislaw
> >
>
> are you using U7612E-H1? I am still having issues on this card but I had no time
> to look at it yet.
Not sure if the number is correct, but yes. I don't have any other MT7612E
cards, only this one card, which the mt76x2e driver does not handle well.
Stanislaw
>
> On Tue, Mar 12, 2019 at 05:48:14PM +0100, Lorenzo Bianconi wrote:
> > >
> > > On Mon, Mar 11, 2019 at 02:24:35PM +0100, Lorenzo Bianconi wrote:
> > > > Fix following schedule while atomic in mt76x02_reset_state
> > > > since synchronize_rcu is run inside a RCU section
> > > >
> > > > [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
> > > > [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
> > > > [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
> > > > [44036.944287] INFO: lockdep is turned off.
> > > > [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
> > > > [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
> > > > [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
> > > > [44036.944308] Call Trace:
> > > > [44036.944317] dump_stack+0x67/0x90
> > > > [44036.944322] ___might_sleep.cold.88+0x9f/0xaf
> > > > [44036.944327] rcu_blocking_is_gp+0x13/0x50
> > > > [44036.944330] synchronize_rcu+0x17/0x80
> > > > [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
> > > > [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
> > > > [44036.944355] process_one_work+0x2a5/0x620
> > > > [44036.944361] worker_thread+0x35/0x3e0
> > > > [44036.944368] kthread+0x11c/0x140
> > > > [44036.944376] ret_from_fork+0x3a/0x50
> > > > [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
> > > > [44036.944387] INFO: lockdep is turned off.
> > > > [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
> > >
> > > Does the patch fix the issue for you ? For me on my MT7612E device it
> > > make the BUG warning gone, but instead of that I have total system hung
> > > without any error message except information about hw restart.
The system hang is not related to the 'schedule while atomic' issue.
If you look at the code, we run synchronize_rcu() inside an RCU read-side section,
and this is not allowed.
I just moved it outside of the RCU section, protecting it with the mutex instead;
the reset is still performed with this patch (just look at the syslog).
The system hang is related to this particular card, since other devices
work properly.
Regards,
Lorenzo
> > >
> > Hi Stanislaw,
> >
> > this patch just fixes the 'schedule while atomic' issue.
>
> Well, if it exchange 'schedule while atomic' warning to system hung it's
> not good fix. Again, does the fix make restart work for you ? Have you
> tested it ?
>
> > > [ 174.425507] mt76x2e 0000:04:00.0: mac specific condition occurred
> > > [ 176.590750] mt76x2e 0000:04:00.0: MCU message 31 (seq 13) timed out
> > > [ 176.861345] mt76x2e 0000:04:00.0: Firmware Version: 0.0.00
> > > [ 176.867214] mt76x2e 0000:04:00.0: Build: 1
> > > [ 176.876563] mt76x2e 0000:04:00.0: Build Time: 201507311614____
> > > [ 176.908095] mt76x2e 0000:04:00.0: Firmware running!
> > > [ 176.920030] ieee80211 phy0: Hardware restart was requested
> > >
> > > ... hung at this point.
> > >
> > > This is with this fix and Felix's
> > > [PATCH] mac80211: do not call driver wake_tx_queue op during reconfig
> > > on latest nbd/wireless tree.
> > >
> > > Stanislaw
> > >
> >
> > are you using U7612E-H1? I am still having issues on this card but I had no time
> > to look at it yet.
>
> Not sure if the number is correct, but yes, I don't have others MT7612E
> cards, only one card which mt76x2e driver does not handle well.
>
> Stanislaw
--
UNIX is Sexy: who | grep -i blonde | talk; cd ~; wine; talk; touch;
unzip; touch; strip; gasp; finger; gasp; mount; fsck; more; yes; gasp;
umount; make clean; sleep
On Wed, Mar 13, 2019 at 10:31:11AM +0100, Lorenzo Bianconi wrote:
> >
> > On Tue, Mar 12, 2019 at 05:48:14PM +0100, Lorenzo Bianconi wrote:
> > > >
> > > > On Mon, Mar 11, 2019 at 02:24:35PM +0100, Lorenzo Bianconi wrote:
> > > > > Fix following schedule while atomic in mt76x02_reset_state
> > > > > since synchronize_rcu is run inside a RCU section
> > > > >
> > > > > [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
> > > > > [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
> > > > > [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
> > > > > [44036.944287] INFO: lockdep is turned off.
> > > > > [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
> > > > > [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
> > > > > [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
> > > > > [44036.944308] Call Trace:
> > > > > [44036.944317] dump_stack+0x67/0x90
> > > > > [44036.944322] ___might_sleep.cold.88+0x9f/0xaf
> > > > > [44036.944327] rcu_blocking_is_gp+0x13/0x50
> > > > > [44036.944330] synchronize_rcu+0x17/0x80
> > > > > [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
> > > > > [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
> > > > > [44036.944355] process_one_work+0x2a5/0x620
> > > > > [44036.944361] worker_thread+0x35/0x3e0
> > > > > [44036.944368] kthread+0x11c/0x140
> > > > > [44036.944376] ret_from_fork+0x3a/0x50
> > > > > [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
> > > > > [44036.944387] INFO: lockdep is turned off.
> > > > > [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
> > > >
> > > > Does the patch fix the issue for you ? For me on my MT7612E device it
> > > > make the BUG warning gone, but instead of that I have total system hung
> > > > without any error message except information about hw restart.
>
> The system hang is not related to the 'schedule while atomic'.
> If you look at the code we run synchronize_rcu() inside a rcu section
> and this is not allowed.
> I just run it outside of the rcu section protecting it with the mutex,
> the reset is performed even with
> this patch (just look at syslog). The system hang is related to this
> particular card since other devices
> work properly.
OK, so it works for you. That is what I wanted to know.
Stanislaw
>
> On Wed, Mar 13, 2019 at 10:31:11AM +0100, Lorenzo Bianconi wrote:
> > >
> > > On Tue, Mar 12, 2019 at 05:48:14PM +0100, Lorenzo Bianconi wrote:
> > > > >
> > > > > On Mon, Mar 11, 2019 at 02:24:35PM +0100, Lorenzo Bianconi wrote:
> > > > > > Fix following schedule while atomic in mt76x02_reset_state
> > > > > > since synchronize_rcu is run inside a RCU section
> > > > > >
> > > > > > [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
> > > > > > [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
> > > > > > [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
> > > > > > [44036.944287] INFO: lockdep is turned off.
> > > > > > [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
> > > > > > [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
> > > > > > [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
> > > > > > [44036.944308] Call Trace:
> > > > > > [44036.944317] dump_stack+0x67/0x90
> > > > > > [44036.944322] ___might_sleep.cold.88+0x9f/0xaf
> > > > > > [44036.944327] rcu_blocking_is_gp+0x13/0x50
> > > > > > [44036.944330] synchronize_rcu+0x17/0x80
> > > > > > [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
> > > > > > [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
> > > > > > [44036.944355] process_one_work+0x2a5/0x620
> > > > > > [44036.944361] worker_thread+0x35/0x3e0
> > > > > > [44036.944368] kthread+0x11c/0x140
> > > > > > [44036.944376] ret_from_fork+0x3a/0x50
> > > > > > [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
> > > > > > [44036.944387] INFO: lockdep is turned off.
> > > > > > [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
> > > > >
> > > > > Does the patch fix the issue for you ? For me on my MT7612E device it
> > > > > make the BUG warning gone, but instead of that I have total system hung
> > > > > without any error message except information about hw restart.
> >
> > The system hang is not related to the 'schedule while atomic'.
> > If you look at the code we run synchronize_rcu() inside a rcu section
> > and this is not allowed.
> > I just run it outside of the rcu section protecting it with the mutex,
> > the reset is performed even with
> > this patch (just look at syslog). The system hang is related to this
> > particular card since other devices
> > work properly.
>
> Ok, so it works for you. That I wanted to know.
If you mean:
- the reset is performed: yes
- the card is working: no
Regards,
Lorenzo
>
> Stanislaw
--
UNIX is Sexy: who | grep -i blonde | talk; cd ~; wine; talk; touch;
unzip; touch; strip; gasp; finger; gasp; mount; fsck; more; yes; gasp;
umount; make clean; sleep
On Wed, Mar 13, 2019 at 10:36:39AM +0100, Lorenzo Bianconi wrote:
> >
> > On Wed, Mar 13, 2019 at 10:31:11AM +0100, Lorenzo Bianconi wrote:
> > > >
> > > > On Tue, Mar 12, 2019 at 05:48:14PM +0100, Lorenzo Bianconi wrote:
> > > > > >
> > > > > > On Mon, Mar 11, 2019 at 02:24:35PM +0100, Lorenzo Bianconi wrote:
> > > > > > > Fix following schedule while atomic in mt76x02_reset_state
> > > > > > > since synchronize_rcu is run inside a RCU section
> > > > > > >
> > > > > > > [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
> > > > > > > [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
> > > > > > > [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
> > > > > > > [44036.944287] INFO: lockdep is turned off.
> > > > > > > [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
> > > > > > > [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
> > > > > > > [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
> > > > > > > [44036.944308] Call Trace:
> > > > > > > [44036.944317] dump_stack+0x67/0x90
> > > > > > > [44036.944322] ___might_sleep.cold.88+0x9f/0xaf
> > > > > > > [44036.944327] rcu_blocking_is_gp+0x13/0x50
> > > > > > > [44036.944330] synchronize_rcu+0x17/0x80
> > > > > > > [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
> > > > > > > [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
> > > > > > > [44036.944355] process_one_work+0x2a5/0x620
> > > > > > > [44036.944361] worker_thread+0x35/0x3e0
> > > > > > > [44036.944368] kthread+0x11c/0x140
> > > > > > > [44036.944376] ret_from_fork+0x3a/0x50
> > > > > > > [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
> > > > > > > [44036.944387] INFO: lockdep is turned off.
> > > > > > > [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
> > > > > >
> > > > > > Does the patch fix the issue for you ? For me on my MT7612E device it
> > > > > > make the BUG warning gone, but instead of that I have total system hung
> > > > > > without any error message except information about hw restart.
> > >
> > > The system hang is not related to the 'schedule while atomic'.
> > > If you look at the code we run synchronize_rcu() inside a rcu section
> > > and this is not allowed.
> > > I just run it outside of the rcu section protecting it with the mutex,
> > > the reset is performed even with
> > > this patch (just look at syslog). The system hang is related to this
> > > particular card since other devices
> > > work properly.
> >
> > Ok, so it works for you. That I wanted to know.
>
> If you mean:
> - the reset is performed: yes
I meant that the reset works on mt76x2e without a hang on SMP machines
(on whatever card).
Stanislaw
Lorenzo Bianconi <[email protected]> wrote:
> Fix following schedule while atomic in mt76x02_reset_state
> since synchronize_rcu is run inside a RCU section
>
> [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out
> [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818
> [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1
> [44036.944287] INFO: lockdep is turned off.
> [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7
> [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009
> [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib]
> [44036.944308] Call Trace:
> [44036.944317] dump_stack+0x67/0x90
> [44036.944322] ___might_sleep.cold.88+0x9f/0xaf
> [44036.944327] rcu_blocking_is_gp+0x13/0x50
> [44036.944330] synchronize_rcu+0x17/0x80
> [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76]
> [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib]
> [44036.944355] process_one_work+0x2a5/0x620
> [44036.944361] worker_thread+0x35/0x3e0
> [44036.944368] kthread+0x11c/0x140
> [44036.944376] ret_from_fork+0x3a/0x50
> [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002
> [44036.944387] INFO: lockdep is turned off.
> [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi
>
> Introduce __mt76_sta_remove in order to run sta_remove without holding dev->mutex.
> Move __mt76_sta_remove outside of RCU section in mt76x02_reset_state
>
> Fixes: e4ebb8b403d1 ("mt76: mt76x2: implement full device restart on watchdog reset")
> Signed-off-by: Lorenzo Bianconi <[email protected]>
Patch applied to wireless-drivers.git, thanks.
13f61dfc5235 mt76: fix schedule while atomic in mt76x02_reset_state
--
https://patchwork.kernel.org/patch/10847437/
https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches