2017-01-25 16:37:08

by Felix Fietkau

[permalink] [raw]
Subject: [PATCH 1/4] ath9k: rename tx_complete_work to hw_check_work

Also include common MAC alive check. This should make the hang checks
more reliable for modes where beacons are not sent and is used as a
starting point for further hang check improvements

Signed-off-by: Felix Fietkau <[email protected]>
---
drivers/net/wireless/ath/ath9k/ath9k.h | 6 ++---
drivers/net/wireless/ath/ath9k/init.c | 1 +
drivers/net/wireless/ath/ath9k/link.c | 46 ++++++++++++++++++----------------
drivers/net/wireless/ath/ath9k/main.c | 10 +++++---
drivers/net/wireless/ath/ath9k/xmit.c | 2 --
5 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index 331947b6a667..b2fa44adc60e 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -108,7 +108,7 @@ int ath_descdma_setup(struct ath_softc *sc, struct ath_descdma *dd,
#define ATH_AGGR_MIN_QDEPTH 2
/* minimum h/w qdepth for non-aggregated traffic */
#define ATH_NON_AGGR_MIN_QDEPTH 8
-#define ATH_TX_COMPLETE_POLL_INT 1000
+#define ATH_HW_CHECK_POLL_INT 1000
#define ATH_TXFIFO_DEPTH 8
#define ATH_TX_ERROR 0x01

@@ -745,7 +745,7 @@ void ath9k_csa_update(struct ath_softc *sc);
#define ATH_PAPRD_TIMEOUT 100 /* msecs */
#define ATH_PLL_WORK_INTERVAL 100

-void ath_tx_complete_poll_work(struct work_struct *work);
+void ath_hw_check_work(struct work_struct *work);
void ath_reset_work(struct work_struct *work);
bool ath_hw_check(struct ath_softc *sc);
void ath_hw_pll_work(struct work_struct *work);
@@ -1053,7 +1053,7 @@ struct ath_softc {
#ifdef CONFIG_ATH9K_DEBUGFS
struct ath9k_debug debug;
#endif
- struct delayed_work tx_complete_work;
+ struct delayed_work hw_check_work;
struct delayed_work hw_pll_work;
struct timer_list sleep_timer;

diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index 084ad1bd495f..a2f7f48bbb92 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -681,6 +681,7 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc,
INIT_WORK(&sc->hw_reset_work, ath_reset_work);
INIT_WORK(&sc->paprd_work, ath_paprd_calibrate);
INIT_DELAYED_WORK(&sc->hw_pll_work, ath_hw_pll_work);
+ INIT_DELAYED_WORK(&sc->hw_check_work, ath_hw_check_work);

ath9k_init_channel_context(sc);

diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c
index 5ad0feeebc86..27c50562dc47 100644
--- a/drivers/net/wireless/ath/ath9k/link.c
+++ b/drivers/net/wireless/ath/ath9k/link.c
@@ -20,20 +20,13 @@
* TX polling - checks if the TX engine is stuck somewhere
* and issues a chip reset if so.
*/
-void ath_tx_complete_poll_work(struct work_struct *work)
+static bool ath_tx_complete_check(struct ath_softc *sc)
{
- struct ath_softc *sc = container_of(work, struct ath_softc,
- tx_complete_work.work);
struct ath_txq *txq;
int i;
- bool needreset = false;
-

- if (sc->tx99_state) {
- ath_dbg(ath9k_hw_common(sc->sc_ah), RESET,
- "skip tx hung detection on tx99\n");
- return;
- }
+ if (sc->tx99_state)
+ return true;

for (i = 0; i < IEEE80211_NUM_ACS; i++) {
txq = sc->tx.txq_map[i];
@@ -41,25 +34,36 @@ void ath_tx_complete_poll_work(struct work_struct *work)
ath_txq_lock(sc, txq);
if (txq->axq_depth) {
if (txq->axq_tx_inprogress) {
- needreset = true;
ath_txq_unlock(sc, txq);
- break;
- } else {
- txq->axq_tx_inprogress = true;
+ goto reset;
}
+
+ txq->axq_tx_inprogress = true;
}
ath_txq_unlock(sc, txq);
}

- if (needreset) {
- ath_dbg(ath9k_hw_common(sc->sc_ah), RESET,
- "tx hung, resetting the chip\n");
- ath9k_queue_reset(sc, RESET_TYPE_TX_HANG);
+ return true;
+
+reset:
+ ath_dbg(ath9k_hw_common(sc->sc_ah), RESET,
+ "tx hung, resetting the chip\n");
+ ath9k_queue_reset(sc, RESET_TYPE_TX_HANG);
+ return false;
+
+}
+
+void ath_hw_check_work(struct work_struct *work)
+{
+ struct ath_softc *sc = container_of(work, struct ath_softc,
+ hw_check_work.work);
+
+ if (!ath_hw_check(sc) ||
+ !ath_tx_complete_check(sc))
return;
- }

- ieee80211_queue_delayed_work(sc->hw, &sc->tx_complete_work,
- msecs_to_jiffies(ATH_TX_COMPLETE_POLL_INT));
+ ieee80211_queue_delayed_work(sc->hw, &sc->hw_check_work,
+ msecs_to_jiffies(ATH_HW_CHECK_POLL_INT));
}

/*
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 58f06ce9a4cf..0345d6ec564f 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -181,7 +181,7 @@ void ath9k_ps_restore(struct ath_softc *sc)
static void __ath_cancel_work(struct ath_softc *sc)
{
cancel_work_sync(&sc->paprd_work);
- cancel_delayed_work_sync(&sc->tx_complete_work);
+ cancel_delayed_work_sync(&sc->hw_check_work);
cancel_delayed_work_sync(&sc->hw_pll_work);

#ifdef CONFIG_ATH9K_BTCOEX_SUPPORT
@@ -198,7 +198,8 @@ void ath_cancel_work(struct ath_softc *sc)

void ath_restart_work(struct ath_softc *sc)
{
- ieee80211_queue_delayed_work(sc->hw, &sc->tx_complete_work, 0);
+ ieee80211_queue_delayed_work(sc->hw, &sc->hw_check_work,
+ ATH_HW_CHECK_POLL_INT);

if (AR_SREV_9340(sc->sc_ah) || AR_SREV_9330(sc->sc_ah))
ieee80211_queue_delayed_work(sc->hw, &sc->hw_pll_work,
@@ -2091,7 +2092,7 @@ void __ath9k_flush(struct ieee80211_hw *hw, u32 queues, bool drop,
int timeout;
bool drain_txq;

- cancel_delayed_work_sync(&sc->tx_complete_work);
+ cancel_delayed_work_sync(&sc->hw_check_work);

if (ah->ah_flags & AH_UNPLUGGED) {
ath_dbg(common, ANY, "Device has been unplugged!\n");
@@ -2129,7 +2130,8 @@ void __ath9k_flush(struct ieee80211_hw *hw, u32 queues, bool drop,
ath9k_ps_restore(sc);
}

- ieee80211_queue_delayed_work(hw, &sc->tx_complete_work, 0);
+ ieee80211_queue_delayed_work(hw, &sc->hw_check_work,
+ ATH_HW_CHECK_POLL_INT);
}

static bool ath9k_tx_frames_pending(struct ieee80211_hw *hw)
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index 6962a63b869a..8f9a87915ba9 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -2916,8 +2916,6 @@ int ath_tx_init(struct ath_softc *sc, int nbufs)
return error;
}

- INIT_DELAYED_WORK(&sc->tx_complete_work, ath_tx_complete_poll_work);
-
if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA)
error = ath_tx_edma_init(sc);

--
2.11.0


2017-01-27 13:11:05

by Felix Fietkau

[permalink] [raw]
Subject: Re: [PATCH 4/4] ath9k: fix race condition in enabling/disabling IRQs

On 2017-01-25 17:36, Felix Fietkau wrote:
> The code currently relies on refcounting to disable IRQs from within the
> IRQ handler and re-enabling them again after the tasklet has run.
>
> However, due to race conditions sometimes the IRQ handler might be
> called twice, or the tasklet may not run at all (if interrupted in the
> middle of a reset).
>
> This can cause nasty imbalances in the irq-disable refcount which will
> get the driver permanently stuck until the entire radio has been stopped
> and started again (ath_reset will not recover from this).
>
> Instead of using this fragile logic, change the code to ensure that
> running the irq handler during tasklet processing is safe, and leave the
> refcount untouched.
>
> Cc: [email protected]
> Signed-off-by: Felix Fietkau <[email protected]>
Please don't apply this yet, it looks like it might cause some
regressions on other devices. I will investigate.

- Felix

2017-01-26 09:56:00

by Simon Wunderlich

[permalink] [raw]
Subject: Re: [PATCH 3/4] ath9k: check for deaf rx path state

Hey Felix,


On Wednesday, January 25, 2017 5:36:53 PM CET Felix Fietkau wrote:
> Various chips occasionally run into a state where the tx path still
> appears to be working normally, but the rx path is deaf.
>
> There is no known register signature to check for this state explicitly,
> so use the lack of rx interrupts as an indicator.
>
> This detection is prone to false positives, since a device could also
> simply be in an environment where there are no frames on the air.
> However, in this case doing a reset should be harmless since it's
> obviously not interrupting any real activity. To avoid confusion, call
> the reset counters in this case "Rx path inactive" instead of something
> like "Rx path deaf", since it may not be an indication of a real
> hardware failure.
>
> Signed-off-by: Felix Fietkau <[email protected]>

As we observed in the field, it may happen that there are still RX interrupts
triggered, but just a very low number - in which case I believe your version
wouldn't fix the problem. Therefore we had a threshold in our original patch
[1].

We would also appreciate if you can at least briefly mention our work if you
resend/rework our stuff.

Thank you,
Simon

[1] https://patchwork.kernel.org/patch/9433621/


Attachments:
signature.asc (833.00 B)
This is a digitally signed message part.

2017-01-26 10:26:06

by Felix Fietkau

[permalink] [raw]
Subject: Re: [PATCH 3/4] ath9k: check for deaf rx path state

On 2017-01-26 11:15, Simon Wunderlich wrote:
> Hey,
>
> On Thursday, January 26, 2017 11:02:53 AM CET Felix Fietkau wrote:
>> On 2017-01-26 10:50, Simon Wunderlich wrote:
>> > Hey Felix,
>> >
>> > On Wednesday, January 25, 2017 5:36:53 PM CET Felix Fietkau wrote:
>> >> Various chips occasionally run into a state where the tx path still
>> >> appears to be working normally, but the rx path is deaf.
>> >>
>> >> There is no known register signature to check for this state explicitly,
>> >> so use the lack of rx interrupts as an indicator.
>> >>
>> >> This detection is prone to false positives, since a device could also
>> >> simply be in an environment where there are no frames on the air.
>> >> However, in this case doing a reset should be harmless since it's
>> >> obviously not interrupting any real activity. To avoid confusion, call
>> >> the reset counters in this case "Rx path inactive" instead of something
>> >> like "Rx path deaf", since it may not be an indication of a real
>> >> hardware failure.
>> >>
>> >> Signed-off-by: Felix Fietkau <[email protected]>
>> >
>> > As we observed in the field, it may happen that there are still RX
>> > interrupts triggered, but just a very low number - in which case I
>> > believe your version wouldn't fix the problem. Therefore we had a
>> > threshold in our original patch [1].
>>
>> It seems that you were seeing something different than what I was seeing
>> in my tests. Though it could be that my issues were actually caused by
>> something else. I had queued up these changes a while back before I
>> finally found and fixed the IRQ issue.
>
> What we found a good threshold was to check for less than 1 RX interrupt per
> second, and check the mean average (about) every 30 seconds. If there is any
> other AP or a station connected, it will not reset the chip, and also there
> will be no reset on short outages.
But if there's less than 1 Rx interrupt per second, then my patch should
also trigger, right?

- Felix

2017-01-26 10:02:57

by Felix Fietkau

[permalink] [raw]
Subject: Re: [PATCH 3/4] ath9k: check for deaf rx path state

On 2017-01-26 10:50, Simon Wunderlich wrote:
> Hey Felix,
>
>
> On Wednesday, January 25, 2017 5:36:53 PM CET Felix Fietkau wrote:
>> Various chips occasionally run into a state where the tx path still
>> appears to be working normally, but the rx path is deaf.
>>
>> There is no known register signature to check for this state explicitly,
>> so use the lack of rx interrupts as an indicator.
>>
>> This detection is prone to false positives, since a device could also
>> simply be in an environment where there are no frames on the air.
>> However, in this case doing a reset should be harmless since it's
>> obviously not interrupting any real activity. To avoid confusion, call
>> the reset counters in this case "Rx path inactive" instead of something
>> like "Rx path deaf", since it may not be an indication of a real
>> hardware failure.
>>
>> Signed-off-by: Felix Fietkau <[email protected]>
>
> As we observed in the field, it may happen that there are still RX interrupts
> triggered, but just a very low number - in which case I believe your version
> wouldn't fix the problem. Therefore we had a threshold in our original patch
> [1].
It seems that you were seeing something different than what I was seeing
in my tests. Though it could be that my issues were actually caused by
something else. I had queued up these changes a while back before I
finally found and fixed the IRQ issue.

> We would also appreciate if you can at least briefly mention our work if you
> resend/rework our stuff.
Sorry about that. I rebased this from older experimental patches and
forgot to put in all the relevant context.

Kalle, please drop this change for now.

- Felix

2017-01-25 16:36:57

by Felix Fietkau

[permalink] [raw]
Subject: [PATCH 3/4] ath9k: check for deaf rx path state

Various chips occasionally run into a state where the tx path still
appears to be working normally, but the rx path is deaf.

There is no known register signature to check for this state explicitly,
so use the lack of rx interrupts as an indicator.

This detection is prone to false positives, since a device could also
simply be in an environment where there are no frames on the air.
However, in this case doing a reset should be harmless since it's
obviously not interrupting any real activity. To avoid confusion, call
the reset counters in this case "Rx path inactive" instead of something
like "Rx path deaf", since it may not be an indication of a real
hardware failure.

Signed-off-by: Felix Fietkau <[email protected]>
---
drivers/net/wireless/ath/ath9k/ath9k.h | 1 +
drivers/net/wireless/ath/ath9k/debug.c | 1 +
drivers/net/wireless/ath/ath9k/debug.h | 1 +
drivers/net/wireless/ath/ath9k/link.c | 16 +++++++++++++++-
drivers/net/wireless/ath/ath9k/main.c | 2 ++
5 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index b2fa44adc60e..a396c99ee84d 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -1027,6 +1027,7 @@ struct ath_softc {

u8 gtt_cnt;
u32 intrstatus;
+ u32 rx_active;
u16 ps_flags; /* PS_* */
bool ps_enabled;
bool ps_idle;
diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
index 43930c336987..a42402413659 100644
--- a/drivers/net/wireless/ath/ath9k/debug.c
+++ b/drivers/net/wireless/ath/ath9k/debug.c
@@ -763,6 +763,7 @@ static int read_file_reset(struct seq_file *file, void *data)
[RESET_TYPE_BEACON_STUCK] = "Stuck Beacon",
[RESET_TYPE_MCI] = "MCI Reset",
[RESET_TYPE_CALIBRATION] = "Calibration error",
+ [RESET_TYPE_RX_INACTIVE] = "Rx path inactive",
[RESET_TX_DMA_ERROR] = "Tx DMA stop error",
[RESET_RX_DMA_ERROR] = "Rx DMA stop error",
};
diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h
index 249f8141cd00..a40c84e8d0b6 100644
--- a/drivers/net/wireless/ath/ath9k/debug.h
+++ b/drivers/net/wireless/ath/ath9k/debug.h
@@ -50,6 +50,7 @@ enum ath_reset_type {
RESET_TYPE_BEACON_STUCK,
RESET_TYPE_MCI,
RESET_TYPE_CALIBRATION,
+ RESET_TYPE_RX_INACTIVE,
RESET_TX_DMA_ERROR,
RESET_RX_DMA_ERROR,
__RESET_TYPE_MAX
diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c
index 27c50562dc47..b9d92aae00c2 100644
--- a/drivers/net/wireless/ath/ath9k/link.c
+++ b/drivers/net/wireless/ath/ath9k/link.c
@@ -53,13 +53,27 @@ static bool ath_tx_complete_check(struct ath_softc *sc)

}

+static bool ath_rx_active_check(struct ath_softc *sc)
+{
+ if (sc->rx_active) {
+ sc->rx_active = 0;
+ return true;
+ }
+
+ ath_dbg(ath9k_hw_common(sc->sc_ah), RESET,
+ "rx path inactive, resetting the chip\n");
+ ath9k_queue_reset(sc, RESET_TYPE_RX_INACTIVE);
+ return false;
+}
+
void ath_hw_check_work(struct work_struct *work)
{
struct ath_softc *sc = container_of(work, struct ath_softc,
hw_check_work.work);

if (!ath_hw_check(sc) ||
- !ath_tx_complete_check(sc))
+ !ath_tx_complete_check(sc) ||
+ !ath_rx_active_check(sc))
return;

ieee80211_queue_delayed_work(sc->hw, &sc->hw_check_work,
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 0345d6ec564f..985b1cade4a4 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -269,6 +269,7 @@ static bool ath_complete_reset(struct ath_softc *sc, bool start)
}

sc->gtt_cnt = 0;
+ sc->rx_active = 1;

ath9k_hw_set_interrupts(ah);
ath9k_hw_enable_interrupts(ah);
@@ -452,6 +453,7 @@ void ath9k_tasklet(unsigned long data)
ath_rx_tasklet(sc, 0, true);

ath_rx_tasklet(sc, 0, false);
+ sc->rx_active = 1;
}

if (status & ATH9K_INT_TX) {
--
2.11.0

2017-01-26 10:15:03

by Simon Wunderlich

[permalink] [raw]
Subject: Re: [PATCH 3/4] ath9k: check for deaf rx path state

Hey,

On Thursday, January 26, 2017 11:02:53 AM CET Felix Fietkau wrote:
> On 2017-01-26 10:50, Simon Wunderlich wrote:
> > Hey Felix,
> >
> > On Wednesday, January 25, 2017 5:36:53 PM CET Felix Fietkau wrote:
> >> Various chips occasionally run into a state where the tx path still
> >> appears to be working normally, but the rx path is deaf.
> >>
> >> There is no known register signature to check for this state explicitly,
> >> so use the lack of rx interrupts as an indicator.
> >>
> >> This detection is prone to false positives, since a device could also
> >> simply be in an environment where there are no frames on the air.
> >> However, in this case doing a reset should be harmless since it's
> >> obviously not interrupting any real activity. To avoid confusion, call
> >> the reset counters in this case "Rx path inactive" instead of something
> >> like "Rx path deaf", since it may not be an indication of a real
> >> hardware failure.
> >>
> >> Signed-off-by: Felix Fietkau <[email protected]>
> >
> > As we observed in the field, it may happen that there are still RX
> > interrupts triggered, but just a very low number - in which case I
> > believe your version wouldn't fix the problem. Therefore we had a
> > threshold in our original patch [1].
>
> It seems that you were seeing something different than what I was seeing
> in my tests. Though it could be that my issues were actually caused by
> something else. I had queued up these changes a while back before I
> finally found and fixed the IRQ issue.

What we found a good threshold was to check for less than 1 RX interrupt per
second, and check the mean average (about) every 30 seconds. If there is any
other AP or a station connected, it will not reset the chip, and also there
will be no reset on short outages.

However, Access Points "alone" somewhere without users may still accumulate a
lot of false resets. Your name choice is better than ours in this regard, lets
hope users understand that as well. :)

>
> > We would also appreciate if you can at least briefly mention our work if
> > you resend/rework our stuff.
>
> Sorry about that. I rebased this from older experimental patches and
> forgot to put in all the relevant context.

Cool, thanks!
Simon


Attachments:
signature.asc (833.00 B)
This is a digitally signed message part.

2017-01-25 16:36:57

by Felix Fietkau

[permalink] [raw]
Subject: [PATCH 4/4] ath9k: fix race condition in enabling/disabling IRQs

The code currently relies on refcounting to disable IRQs from within the
IRQ handler and re-enabling them again after the tasklet has run.

However, due to race conditions sometimes the IRQ handler might be
called twice, or the tasklet may not run at all (if interrupted in the
middle of a reset).

This can cause nasty imbalances in the irq-disable refcount which will
get the driver permanently stuck until the entire radio has been stopped
and started again (ath_reset will not recover from this).

Instead of using this fragile logic, change the code to ensure that
running the irq handler during tasklet processing is safe, and leave the
refcount untouched.

Cc: [email protected]
Signed-off-by: Felix Fietkau <[email protected]>
---
drivers/net/wireless/ath/ath9k/ath9k.h | 1 +
drivers/net/wireless/ath/ath9k/init.c | 1 +
drivers/net/wireless/ath/ath9k/mac.c | 44 ++++++++++++++++++++++++++--------
drivers/net/wireless/ath/ath9k/mac.h | 1 +
drivers/net/wireless/ath/ath9k/main.c | 15 ++++++++----
5 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index a396c99ee84d..46dfca522be3 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -998,6 +998,7 @@ struct ath_softc {
struct survey_info *cur_survey;
struct survey_info survey[ATH9K_NUM_CHANNELS];

+ spinlock_t intr_lock;
struct tasklet_struct intr_tq;
struct tasklet_struct bcon_tasklet;
struct ath_hw *sc_ah;
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index a2f7f48bbb92..fa4b3cc1ba22 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -669,6 +669,7 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc,
common->bt_ant_diversity = 1;

spin_lock_init(&common->cc_lock);
+ spin_lock_init(&sc->intr_lock);
spin_lock_init(&sc->sc_serial_rw);
spin_lock_init(&sc->sc_pm_lock);
spin_lock_init(&sc->chan_lock);
diff --git a/drivers/net/wireless/ath/ath9k/mac.c b/drivers/net/wireless/ath/ath9k/mac.c
index f79587570af6..c059dfcaca22 100644
--- a/drivers/net/wireless/ath/ath9k/mac.c
+++ b/drivers/net/wireless/ath/ath9k/mac.c
@@ -810,21 +810,12 @@ void ath9k_hw_disable_interrupts(struct ath_hw *ah)
}
EXPORT_SYMBOL(ath9k_hw_disable_interrupts);

-void ath9k_hw_enable_interrupts(struct ath_hw *ah)
+static void __ath9k_hw_enable_interrupts(struct ath_hw *ah)
{
struct ath_common *common = ath9k_hw_common(ah);
u32 sync_default = AR_INTR_SYNC_DEFAULT;
u32 async_mask;

- if (!(ah->imask & ATH9K_INT_GLOBAL))
- return;
-
- if (!atomic_inc_and_test(&ah->intr_ref_cnt)) {
- ath_dbg(common, INTERRUPT, "Do not enable IER ref count %d\n",
- atomic_read(&ah->intr_ref_cnt));
- return;
- }
-
if (AR_SREV_9340(ah) || AR_SREV_9550(ah) || AR_SREV_9531(ah) ||
AR_SREV_9561(ah))
sync_default &= ~AR_INTR_SYNC_HOST1_FATAL;
@@ -846,6 +837,39 @@ void ath9k_hw_enable_interrupts(struct ath_hw *ah)
ath_dbg(common, INTERRUPT, "AR_IMR 0x%x IER 0x%x\n",
REG_READ(ah, AR_IMR), REG_READ(ah, AR_IER));
}
+
+void ath9k_hw_resume_interrupts(struct ath_hw *ah)
+{
+ struct ath_common *common = ath9k_hw_common(ah);
+
+ if (!(ah->imask & ATH9K_INT_GLOBAL))
+ return;
+
+ if (atomic_read(&ah->intr_ref_cnt) != 0) {
+ ath_dbg(common, INTERRUPT, "Do not enable IER ref count %d\n",
+ atomic_read(&ah->intr_ref_cnt));
+ return;
+ }
+
+ __ath9k_hw_enable_interrupts(ah);
+}
+EXPORT_SYMBOL(ath9k_hw_resume_interrupts);
+
+void ath9k_hw_enable_interrupts(struct ath_hw *ah)
+{
+ struct ath_common *common = ath9k_hw_common(ah);
+
+ if (!(ah->imask & ATH9K_INT_GLOBAL))
+ return;
+
+ if (!atomic_inc_and_test(&ah->intr_ref_cnt)) {
+ ath_dbg(common, INTERRUPT, "Do not enable IER ref count %d\n",
+ atomic_read(&ah->intr_ref_cnt));
+ return;
+ }
+
+ __ath9k_hw_enable_interrupts(ah);
+}
EXPORT_SYMBOL(ath9k_hw_enable_interrupts);

void ath9k_hw_set_interrupts(struct ath_hw *ah)
diff --git a/drivers/net/wireless/ath/ath9k/mac.h b/drivers/net/wireless/ath/ath9k/mac.h
index 3bab01435a86..770fc11b41d1 100644
--- a/drivers/net/wireless/ath/ath9k/mac.h
+++ b/drivers/net/wireless/ath/ath9k/mac.h
@@ -744,6 +744,7 @@ void ath9k_hw_set_interrupts(struct ath_hw *ah);
void ath9k_hw_enable_interrupts(struct ath_hw *ah);
void ath9k_hw_disable_interrupts(struct ath_hw *ah);
void ath9k_hw_kill_interrupts(struct ath_hw *ah);
+void ath9k_hw_resume_interrupts(struct ath_hw *ah);

void ar9002_hw_attach_mac_ops(struct ath_hw *ah);

diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 985b1cade4a4..d8ea087bdd07 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -375,9 +375,14 @@ void ath9k_tasklet(unsigned long data)
struct ath_common *common = ath9k_hw_common(ah);
enum ath_reset_type type;
unsigned long flags;
- u32 status = sc->intrstatus;
+ u32 status;
u32 rxmask;

+ spin_lock_irqsave(&sc->intr_lock, flags);
+ status = sc->intrstatus;
+ sc->intrstatus = 0;
+ spin_unlock_irqrestore(&sc->intr_lock, flags);
+
ath9k_ps_wakeup(sc);
spin_lock(&sc->sc_pcu_lock);

@@ -480,7 +485,7 @@ void ath9k_tasklet(unsigned long data)
ath9k_btcoex_handle_interrupt(sc, status);

/* re-enable hardware interrupt */
- ath9k_hw_enable_interrupts(ah);
+ ath9k_hw_resume_interrupts(ah);
out:
spin_unlock(&sc->sc_pcu_lock);
ath9k_ps_restore(sc);
@@ -544,7 +549,9 @@ irqreturn_t ath_isr(int irq, void *dev)
return IRQ_NONE;

/* Cache the status */
- sc->intrstatus = status;
+ spin_lock(&sc->intr_lock);
+ sc->intrstatus |= status;
+ spin_unlock(&sc->intr_lock);

if (status & SCHED_INTR)
sched = true;
@@ -590,7 +597,7 @@ irqreturn_t ath_isr(int irq, void *dev)

if (sched) {
/* turn off every interrupt */
- ath9k_hw_disable_interrupts(ah);
+ ath9k_hw_kill_interrupts(ah);
tasklet_schedule(&sc->intr_tq);
}

--
2.11.0

2017-01-26 10:33:01

by Simon Wunderlich

[permalink] [raw]
Subject: Re: [PATCH 3/4] ath9k: check for deaf rx path state

Hi Felix,

On Thursday, January 26, 2017 11:26:03 AM CET Felix Fietkau wrote:
> On 2017-01-26 11:15, Simon Wunderlich wrote:
> > Hey,
> >
> > On Thursday, January 26, 2017 11:02:53 AM CET Felix Fietkau wrote:
> >> On 2017-01-26 10:50, Simon Wunderlich wrote:
> >> > Hey Felix,
> >> >
> >> > On Wednesday, January 25, 2017 5:36:53 PM CET Felix Fietkau wrote:
> >> >> Various chips occasionally run into a state where the tx path still
> >> >> appears to be working normally, but the rx path is deaf.
> >> >>
> >> >> There is no known register signature to check for this state
> >> >> explicitly,
> >> >> so use the lack of rx interrupts as an indicator.
> >> >>
> >> >> This detection is prone to false positives, since a device could also
> >> >> simply be in an environment where there are no frames on the air.
> >> >> However, in this case doing a reset should be harmless since it's
> >> >> obviously not interrupting any real activity. To avoid confusion, call
> >> >> the reset counters in this case "Rx path inactive" instead of
> >> >> something
> >> >> like "Rx path deaf", since it may not be an indication of a real
> >> >> hardware failure.
> >> >>
> >> >> Signed-off-by: Felix Fietkau <[email protected]>
> >> >
> >> > As we observed in the field, it may happen that there are still RX
> >> > interrupts triggered, but just a very low number - in which case I
> >> > believe your version wouldn't fix the problem. Therefore we had a
> >> > threshold in our original patch [1].
> >>
> >> It seems that you were seeing something different than what I was seeing
> >> in my tests. Though it could be that my issues were actually caused by
> >> something else. I had queued up these changes a while back before I
> >> finally found and fixed the IRQ issue.
> >
> > What we found a good threshold was to check for less than 1 RX interrupt
> > per second, and check the mean average (about) every 30 seconds. If there
> > is any other AP or a station connected, it will not reset the chip, and
> > also there will be no reset on short outages.
>
> But if there's less than 1 Rx interrupt per second, then my patch should
> also trigger, right?

yes, that function you hooked in is called once a second. However, this will
likely lead to one reset per second one a "lonely" access point, which could
create problems for clients connecting the first time, or power-saving clients
who don't talk much. It's not so unlikely that an AP will not hear anything
for a full second, and the reset puts it out of operation for some time, too.
(Not sure how much beacons etc are affected, for example) If you can check
only every 30 seconds on the average, you would reduce this problem.

Cheers,
Simon


Attachments:
signature.asc (833.00 B)
This is a digitally signed message part.

2017-01-25 16:37:02

by Felix Fietkau

[permalink] [raw]
Subject: [PATCH 2/4] ath9k_hw: check if the chip failed to wake up

In an RFC patch, Sven Eckelmann and Simon Wunderlich reported:

"QCA 802.11n chips (especially AR9330/AR9340) sometimes end up in a
state in which a read of AR_CFG always returns 0xdeadbeef.
This should not happen when when the power_mode of the device is
ATH9K_PM_AWAKE."

Include the check for the default register state in the existing MAC
hang check.

Signed-off-by: Felix Fietkau <[email protected]>
---
drivers/net/wireless/ath/ath9k/hw.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index 6989f89b3116..11fd98df2c43 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -1623,6 +1623,10 @@ bool ath9k_hw_check_alive(struct ath_hw *ah)
int count = 50;
u32 reg, last_val;

+ /* Check if chip failed to wake up */
+ if (REG_READ(ah, AR_CFG) == 0xdeadbeef)
+ return false;
+
if (AR_SREV_9300(ah))
return !ath9k_hw_detect_mac_hang(ah);

--
2.11.0

2017-02-02 09:10:38

by Kalle Valo

[permalink] [raw]
Subject: Re: [PATCH 4/4] ath9k: fix race condition in enabling/disabling IRQs

Felix Fietkau <[email protected]> writes:

> On 2017-01-25 17:36, Felix Fietkau wrote:
>> The code currently relies on refcounting to disable IRQs from within the
>> IRQ handler and re-enabling them again after the tasklet has run.
>>
>> However, due to race conditions sometimes the IRQ handler might be
>> called twice, or the tasklet may not run at all (if interrupted in the
>> middle of a reset).
>>
>> This can cause nasty imbalances in the irq-disable refcount which will
>> get the driver permanently stuck until the entire radio has been stopped
>> and started again (ath_reset will not recover from this).
>>
>> Instead of using this fragile logic, change the code to ensure that
>> running the irq handler during tasklet processing is safe, and leave the
>> refcount untouched.
>>
>> Cc: [email protected]
>> Signed-off-by: Felix Fietkau <[email protected]>
>
> Please don't apply this yet, it looks like it might cause some
> regressions on other devices. I will investigate.

Ok, as patch 3 also had some changes I'll drop these now. So please
resend the whole patch series once you know more.

--
Kalle Valo