2024-03-07 19:30:27

by Ben Greear

[permalink] [raw]
Subject: [PATCH 1/6] wifi: mt76: mt7996: add debugging for MCU command timeouts.

From: Ben Greear <[email protected]>

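When an MCU command times out, log whether it is the first timeout since
the last successful command ("Initial") or a subsequent one ("Secondary"),
together with the last command that succeeded. Also log the first command
that succeeds again after a timeout, so that recovery is visible.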

Signed-off-by: Ben Greear <[email protected]>
---
drivers/net/wireless/mediatek/mt76/mt76.h | 2 ++
.../net/wireless/mediatek/mt76/mt7996/mcu.c | 31 +++++++++++++++++--
2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index b20c34d5a0f7..c341915e4d62 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -829,6 +829,8 @@ struct mt76_dev {
struct device *dma_dev;

struct mt76_mcu mcu;
+ u32 first_failed_mcu_cmd; /* for debugging */
+ u32 last_successful_mcu_cmd; /* for debugging */

struct net_device napi_dev;
struct net_device tx_napi_dev;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
index 699be57309c2..a858a0914bf0 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
@@ -200,14 +200,39 @@ mt7996_mcu_parse_response(struct mt76_dev *mdev, int cmd,
int ret = 0;

if (!skb) {
- dev_err(mdev->dev, "Message %08x (seq %d) timeout\n",
- cmd, seq);
+ const char *first = "Secondary";
+
+ if (!mdev->first_failed_mcu_cmd)
+ first = "Initial";
+
+ dev_err(mdev->dev,
+ "MCU: %s Failure: Message %08x (cid %lx ext_cid: %lx seq %d) timeout. Last successful cmd: 0x%x\n",
+ first,
+ cmd, FIELD_GET(__MCU_CMD_FIELD_ID, cmd),
+ FIELD_GET(__MCU_CMD_FIELD_EXT_ID, cmd), seq,
+ mdev->last_successful_mcu_cmd);
+
+ if (!mdev->first_failed_mcu_cmd)
+ mdev->first_failed_mcu_cmd = cmd;
return -ETIMEDOUT;
}

+ mdev->last_successful_mcu_cmd = cmd;
+
+ if (mdev->first_failed_mcu_cmd) {
+ dev_err(mdev->dev, "MCU: First success after failure: Message %08x (cid %lx ext_cid: %lx seq %d)\n",
+ cmd, FIELD_GET(__MCU_CMD_FIELD_ID, cmd),
+ FIELD_GET(__MCU_CMD_FIELD_EXT_ID, cmd), seq);
+ mdev->first_failed_mcu_cmd = 0;
+ }
+
rxd = (struct mt7996_mcu_rxd *)skb->data;
- if (seq != rxd->seq)
+ if (seq != rxd->seq) {
+ /* This can happen if the previous request didn't wait (which is normal).
+ * Quietly return -EAGAIN in that case; it is not something to warn about.
+ */
return -EAGAIN;
+ }

if (cmd == MCU_CMD(PATCH_SEM_CONTROL)) {
skb_pull(skb, sizeof(*rxd) - 4);
--
2.42.0



2024-03-07 19:30:31

by Ben Greear

[permalink] [raw]
Subject: [PATCH 4/6] wifi: mt76: mt7915: Mitigate mcu communication loss.

From: Ben Greear <[email protected]>

Many calls that end up sending MCU messages to the firmware hold
RTNL or other important locks. When the radio stops answering, those
locks stay held while each message times out, so the entire system
becomes very sluggish.

Add a counter of consecutive MCU timeouts, reset on every response that
is received. Once the radio has timed out more than MAX_MCU_TIMEOUTS (3)
times in a row, consider it dead and no longer attempt to talk to it.

Signed-off-by: Ben Greear <[email protected]>
---
drivers/net/wireless/mediatek/mt76/dma.c | 14 ++++++++++++++
drivers/net/wireless/mediatek/mt76/mt76.h | 3 +++
drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 2 ++
3 files changed, 19 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 00230f106294..4d1426093e1e 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -510,6 +510,20 @@ mt76_dma_tx_queue_skb_raw(struct mt76_dev *dev, struct mt76_queue *q,
if (test_bit(MT76_MCU_RESET, &dev->phy.state))
goto error;

+ /* Check for non-responsive radios. Better to just stop sending messages to
+ * the radio than to continuously block the OS (since rtnl and similar are
+ * often held while the timeout is happening).
+ */
+ if (dev->mcu_timeouts > MAX_MCU_TIMEOUTS) {
+ static unsigned long last_log;
+
+ if (time_after(jiffies, last_log + 5 * HZ)) {
+ last_log = jiffies;
+ mtk_dbg(dev, WRN, "mt76-dma-tx-queue-skb-raw, too many timeouts, msg is dropped.\n");
+ }
+ goto error;
+ }
+
if (q->queued + 1 >= q->ndesc - 1)
goto error;

diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index dd8a24cda48a..b052a9c24c73 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -832,6 +832,9 @@ struct mt76_dev {
struct mt76_mcu mcu;
u32 first_failed_mcu_cmd; /* for debugging */
u32 last_successful_mcu_cmd; /* for debugging */
+ u32 mcu_timeouts; /* sequential timeout counter */
+ #define MAX_MCU_TIMEOUTS 3
+

struct net_device napi_dev;
struct net_device tx_napi_dev;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index c67c4f6ca2aa..f3e60fba48b2 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -161,11 +161,13 @@ mt7915_mcu_parse_response(struct mt76_dev *mdev, int cmd,
int ret = 0;

if (!skb) {
+ mdev->mcu_timeouts++;
dev_err(mdev->dev, "Message %08x (seq %d) timeout\n",
cmd, seq);
return -ETIMEDOUT;
}

+ mdev->mcu_timeouts = 0;
rxd = (struct mt76_connac2_mcu_rxd *)skb->data;
if (seq != rxd->seq &&
!(rxd->eid == MCU_CMD_EXT_CID &&
--
2.42.0