2021-11-18 09:54:03

by Wen Gong

[permalink] [raw]
Subject: [PATCH v2] ath11k: add wait operation for tx management packets for flush from mac80211

In ath11k, management packets are transmitted from a work queue. Sometimes
the work queue does not finish the tx immediately, so the management packet
is only sent to firmware after the following vdev delete step has finished,
which leads to a firmware crash.

ieee80211_set_disassoc() calls ieee80211_flush_queues() after it has passed
the deauth/disassoc frame to ath11k. The purpose of the flush is to make sure
the deauth was actually sent, so ath11k needs to be changed to match that
purpose of mac80211.

To address this issue, wait for both tx mgmt and tx data packets in flush.

Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01230-QCAHSTSWPLZ_V2_TO_X86-1

Signed-off-by: Wen Gong <[email protected]>
---
v2: rebased to latest ath.git master ath-202111170737

drivers/net/wireless/ath/ath11k/core.c | 1 +
drivers/net/wireless/ath/ath11k/core.h | 1 +
drivers/net/wireless/ath/ath11k/mac.c | 40 ++++++++++++++++++++++----
drivers/net/wireless/ath/ath11k/wmi.c | 12 +++++++-
4 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c
index a40bbca3e9af..a7817c3bcaae 100644
--- a/drivers/net/wireless/ath/ath11k/core.c
+++ b/drivers/net/wireless/ath/ath11k/core.c
@@ -1078,6 +1078,7 @@ static void ath11k_core_restart(struct work_struct *work)
idr_for_each(&ar->txmgmt_idr,
ath11k_mac_tx_mgmt_pending_free, ar);
idr_destroy(&ar->txmgmt_idr);
+ wake_up(&ar->txmgmt_empty_waitq);
}

wake_up(&ab->wmi_ab.tx_credits_wq);
diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h
index bbfc10fd5c6d..6c6337bbe378 100644
--- a/drivers/net/wireless/ath/ath11k/core.h
+++ b/drivers/net/wireless/ath/ath11k/core.h
@@ -547,6 +547,7 @@ struct ath11k {
/* protects txmgmt_idr data */
spinlock_t txmgmt_idr_lock;
atomic_t num_pending_mgmt_tx;
+ wait_queue_head_t txmgmt_empty_waitq;

/* cycle count is reported twice for each visited channel during scan.
* access protected by data_lock
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 292b2b7eab11..01308c70366f 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -5128,6 +5128,14 @@ static int __ath11k_set_antenna(struct ath11k *ar, u32 tx_ant, u32 rx_ant)
return 0;
}

+static void ath11k_mgmt_over_wmi_tx_drop(struct ath11k *ar, struct sk_buff *skb)
+{
+ ieee80211_free_txskb(ar->hw, skb);
+
+ if (atomic_dec_and_test(&ar->num_pending_mgmt_tx))
+ wake_up(&ar->txmgmt_empty_waitq);
+}
+
int ath11k_mac_tx_mgmt_pending_free(int buf_id, void *skb, void *ctx)
{
struct sk_buff *msdu = skb;
@@ -5144,7 +5152,7 @@ int ath11k_mac_tx_mgmt_pending_free(int buf_id, void *skb, void *ctx)
info = IEEE80211_SKB_CB(msdu);
memset(&info->status, 0, sizeof(info->status));

- ieee80211_free_txskb(ar->hw, msdu);
+ ath11k_mgmt_over_wmi_tx_drop(ar, msdu);

return 0;
}
@@ -5182,6 +5190,10 @@ static int ath11k_mac_mgmt_tx_wmi(struct ath11k *ar, struct ath11k_vif *arvif,
buf_id = idr_alloc(&ar->txmgmt_idr, skb, 0,
ATH11K_TX_MGMT_NUM_PENDING_MAX, GFP_ATOMIC);
spin_unlock_bh(&ar->txmgmt_idr_lock);
+
+ ath11k_dbg(ar->ab, ATH11K_DBG_MAC,
+ "mac tx mgmt frame, buf id %d\n", buf_id);
+
if (buf_id < 0)
return -ENOSPC;

@@ -5228,7 +5240,7 @@ static void ath11k_mgmt_over_wmi_tx_purge(struct ath11k *ar)
struct sk_buff *skb;

while ((skb = skb_dequeue(&ar->wmi_mgmt_tx_queue)) != NULL)
- ieee80211_free_txskb(ar->hw, skb);
+ ath11k_mgmt_over_wmi_tx_drop(ar, skb);
}

static void ath11k_mgmt_over_wmi_tx_work(struct work_struct *work)
@@ -5243,7 +5255,7 @@ static void ath11k_mgmt_over_wmi_tx_work(struct work_struct *work)
skb_cb = ATH11K_SKB_CB(skb);
if (!skb_cb->vif) {
ath11k_warn(ar->ab, "no vif found for mgmt frame\n");
- ieee80211_free_txskb(ar->hw, skb);
+ ath11k_mgmt_over_wmi_tx_drop(ar, skb);
continue;
}

@@ -5258,14 +5270,18 @@ static void ath11k_mgmt_over_wmi_tx_work(struct work_struct *work)

ath11k_warn(ar->ab, "failed to tx mgmt frame, vdev_id %d :%d\n",
arvif->vdev_id, ret);
- ieee80211_free_txskb(ar->hw, skb);
+ ath11k_mgmt_over_wmi_tx_drop(ar, skb);
+ } else {
+ ath11k_dbg(ar->ab, ATH11K_DBG_MAC,
+ "mac tx mgmt frame, vdev_id %d\n",
+ arvif->vdev_id);
}
} else {
ath11k_warn(ar->ab,
"dropping mgmt frame for vdev %d, is_started %d\n",
arvif->vdev_id,
arvif->is_started);
- ieee80211_free_txskb(ar->hw, skb);
+ ath11k_mgmt_over_wmi_tx_drop(ar, skb);
}
}
}
@@ -5297,6 +5313,7 @@ static int ath11k_mac_mgmt_tx(struct ath11k *ar, struct sk_buff *skb,

skb_queue_tail(q, skb);
ieee80211_queue_work(ar->hw, &ar->wmi_mgmt_tx_work);
+ atomic_inc(&ar->num_pending_mgmt_tx);

return 0;
}
@@ -6784,6 +6801,17 @@ static void ath11k_mac_op_flush(struct ieee80211_hw *hw, struct ieee80211_vif *v
ATH11K_FLUSH_TIMEOUT);
if (time_left == 0)
ath11k_warn(ar->ab, "failed to flush transmit queue %ld\n", time_left);
+
+ time_left = wait_event_timeout(ar->txmgmt_empty_waitq,
+ (atomic_read(&ar->num_pending_mgmt_tx) == 0),
+ ATH11K_FLUSH_TIMEOUT);
+ if (time_left == 0)
+ ath11k_warn(ar->ab, "failed to flush mgmt transmit queue %ld\n",
+ time_left);
+
+ ath11k_dbg(ar->ab, ATH11K_DBG_MAC,
+ "mac mgmt tx flush mgmt pending %d\n",
+ atomic_read(&ar->num_pending_mgmt_tx));
}

static int
@@ -8022,6 +8050,8 @@ int ath11k_mac_register(struct ath11k_base *ab)
ret = __ath11k_mac_register(ar);
if (ret)
goto err_cleanup;
+
+ init_waitqueue_head(&ar->txmgmt_empty_waitq);
}

return 0;
diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c
index 614b2f6bcc8e..2c1f3d783083 100644
--- a/drivers/net/wireless/ath/ath11k/wmi.c
+++ b/drivers/net/wireless/ath/ath11k/wmi.c
@@ -4867,6 +4867,7 @@ static int wmi_process_mgmt_tx_comp(struct ath11k *ar, u32 desc_id,
struct sk_buff *msdu;
struct ieee80211_tx_info *info;
struct ath11k_skb_cb *skb_cb;
+ int num_mgmt;

spin_lock_bh(&ar->txmgmt_idr_lock);
msdu = idr_find(&ar->txmgmt_idr, desc_id);
@@ -4890,10 +4891,19 @@ static int wmi_process_mgmt_tx_comp(struct ath11k *ar, u32 desc_id,

ieee80211_tx_status_irqsafe(ar->hw, msdu);

+ num_mgmt = atomic_dec_if_positive(&ar->num_pending_mgmt_tx);
+
/* WARN when we received this event without doing any mgmt tx */
- if (atomic_dec_if_positive(&ar->num_pending_mgmt_tx) < 0)
+ if (num_mgmt < 0)
WARN_ON_ONCE(1);

+ ath11k_dbg(ar->ab, ATH11K_DBG_WMI,
+ "wmi mgmt tx comp pending %d desc id %d\n",
+ num_mgmt, desc_id);
+
+ if (!num_mgmt)
+ wake_up(&ar->txmgmt_empty_waitq);
+
return 0;
}


base-commit: 63ec871bc50a306aac550e2d85f697ca2d5f5deb
--
2.31.1



2021-11-22 12:19:37

by Kalle Valo

[permalink] [raw]
Subject: Re: [PATCH v2] ath11k: add wait operation for tx management packets for flush from mac80211

Wen Gong <[email protected]> writes:

> In ath11k, tx of management packet is doing in a work queue. Sometimes
> the workqueue does not finish tx immediately, then it lead after the next
> step of vdev delete finished, it start to send the management packet to
> firmware and lead firmware crash.
>
> ieee80211_set_disassoc have logic of ieee80211_flush_queues after it
> send_deauth_disassoc to ath11k, its purpose is make sure the deauth
> was actually sent, so it need to change ath11k to match the purpose
> of mac80211.
>
> To address these issue wait for tx mgmt and tx data packets.
>
> Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01230-QCAHSTSWPLZ_V2_TO_X86-1
>
> Signed-off-by: Wen Gong <[email protected]>

I sometimes see new warnings with this patch:

[ 142.346474] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit queue 0
[ 147.466367] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit queue 0

And more importantly this breaks suspend on my NUC testbox:

[ 175.141820] PM: suspend entry (deep)
[ 175.150512] Filesystems sync: 0.008 seconds
[ 175.229703] Freezing user space processes ... (elapsed 0.003 seconds) done.
[ 175.233506] OOM killer disabled.
[ 175.233588] Freezing remaining freezable tasks ... (elapsed 0.001 seconds) done.
[ 175.242457] printk: Suspending console(s) (use no_console_suspend to debug)
[ 175.245269] wlan0: deauthenticating from 00:03:7f:48:81:59 by local choice (Reason: 3=DEAUTH_LEAVING)
[ 175.285881] e1000e: EEE TX LPI TIMER: 00000011
[ 180.745259] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit queue 0
[ 185.864977] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit queue 0
[ 185.970601] PM: Some devices failed to suspend, or early wake event detected
[ 185.974081] usb usb3: root hub lost power or was reset
[ 185.974135] usb usb4: root hub lost power or was reset
[ 186.098870] nvme nvme0: 8/0/0 default/read/poll queues
[ 186.297077] OOM killer enabled.
[ 186.297361] Restarting tasks ... done.
[ 186.343733] PM: suspend exit

My setup:

Tag: ath-202111221111

[ 151.123732] ath11k_pci 0000:06:00.0: MSI vectors: 32
[ 151.123851] ath11k_pci 0000:06:00.0: qca6390 hw2.0
[ 151.720618] ath11k_pci 0000:06:00.0: chip_id 0x0 chip_family 0xb board_id 0xff soc_id 0xffffffff
[ 151.720912] ath11k_pci 0000:06:00.0: fw_version 0x101c06cc fw_build_timestamp 2020-06-24 19:50 fw_build_id

--
https://patchwork.kernel.org/project/linux-wireless/list/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

2021-11-26 03:34:51

by Wen Gong

[permalink] [raw]
Subject: Re: [PATCH v2] ath11k: add wait operation for tx management packets for flush from mac80211

Hi Kalle,

I have found the reason.

I will send a new version to fix it.

It is caused by commit c0b0d2e87d91ce283c8766b4b3c2ec9ac90ebf96 (ath11k:
Increment pending_mgmt_tx count before tx send invoke), which was committed
on Nov 15 11:25:52 2021.

My rebase was auto-merged by git, so num_pending_mgmt_tx is now incremented
twice for each management frame.

wgong@wgong-HP3-Z230-SFF-Workstation:~/ath11k/ath-upstream$ git
cherry-pick 0869610178a1
Auto-merging drivers/net/wireless/ath/ath11k/wmi.c
Auto-merging drivers/net/wireless/ath/ath11k/mac.c
Auto-merging drivers/net/wireless/ath/ath11k/core.h
Auto-merging drivers/net/wireless/ath/ath11k/core.c
[add_wait_mgmt_ath-202111221436 954d233dbe34] ath11k: add wait operation
for tx management packets for flush from mac80211

On 11/22/2021 8:19 PM, Kalle Valo wrote:
> Wen Gong <[email protected]> writes:
>
>> In ath11k, tx of management packet is doing in a work queue. Sometimes
>> the workqueue does not finish tx immediately, then it lead after the next
>> step of vdev delete finished, it start to send the management packet to
>> firmware and lead firmware crash.
>>
>> ieee80211_set_disassoc have logic of ieee80211_flush_queues after it
>> send_deauth_disassoc to ath11k, its purpose is make sure the deauth
>> was actually sent, so it need to change ath11k to match the purpose
>> of mac80211.
>>
>> To address these issue wait for tx mgmt and tx data packets.
>>
>> Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01230-QCAHSTSWPLZ_V2_TO_X86-1
>>
>> Signed-off-by: Wen Gong <[email protected]>
> I sometimes see new warnings with this patch:
>
> [ 142.346474] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit queue 0
> [ 147.466367] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit queue 0
>
> And more importantly this breaks suspend on my NUC testbox:
>
> [ 175.141820] PM: suspend entry (deep)
> [ 175.150512] Filesystems sync: 0.008 seconds
> [ 175.229703] Freezing user space processes ... (elapsed 0.003 seconds) done.
> [ 175.233506] OOM killer disabled.
> [ 175.233588] Freezing remaining freezable tasks ... (elapsed 0.001 seconds) done.
> [ 175.242457] printk: Suspending console(s) (use no_console_suspend to debug)
> [ 175.245269] wlan0: deauthenticating from 00:03:7f:48:81:59 by local choice (Reason: 3=DEAUTH_LEAVING)
> [ 175.285881] e1000e: EEE TX LPI TIMER: 00000011
> [ 180.745259] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit queue 0
> [ 185.864977] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit queue 0
> [ 185.970601] PM: Some devices failed to suspend, or early wake event detected
> [ 185.974081] usb usb3: root hub lost power or was reset
> [ 185.974135] usb usb4: root hub lost power or was reset
> [ 186.098870] nvme nvme0: 8/0/0 default/read/poll queues
> [ 186.297077] OOM killer enabled.
> [ 186.297361] Restarting tasks ... done.
> [ 186.343733] PM: suspend exit
>
> My setup:
>
> Tag: ath-202111221111
>
> [ 151.123732] ath11k_pci 0000:06:00.0: MSI vectors: 32
> [ 151.123851] ath11k_pci 0000:06:00.0: qca6390 hw2.0
> [ 151.720618] ath11k_pci 0000:06:00.0: chip_id 0x0 chip_family 0xb board_id 0xff soc_id 0xffffffff
> [ 151.720912] ath11k_pci 0000:06:00.0: fw_version 0x101c06cc fw_build_timestamp 2020-06-24 19:50 fw_build_id
>

2021-11-26 06:56:42

by Wen Gong

[permalink] [raw]
Subject: Re: [PATCH v2] ath11k: add wait operation for tx management packets for flush from mac80211

Hi Kalle,

I have sent a new version of the patch. I have tested it for connect/disconnect
and it is OK now; the flush no longer times out.

https://patchwork.kernel.org/project/linux-wireless/patch/[email protected]/

[PATCH v3] ath11k: add wait operation for tx management packets for
flush from mac80211

On 11/26/2021 11:32 AM, Wen Gong wrote:
> Hi Kalle,
>
> I have found the reason.
>
> I will send new version to fix it.
>
> It is because commit c0b0d2e87d91ce283c8766b4b3c2ec9ac90ebf96 (ath11k:
> Increment pending_mgmt_tx count before tx send invoke) which commit at
> Nov 15 11:25:52 2021.
>
> And my rebase is auto merged by git, then it inc twice for
> num_pending_mgmt_tx.
>
> wgong@wgong-HP3-Z230-SFF-Workstation:~/ath11k/ath-upstream$ git
> cherry-pick 0869610178a1
> Auto-merging drivers/net/wireless/ath/ath11k/wmi.c
> Auto-merging drivers/net/wireless/ath/ath11k/mac.c
> Auto-merging drivers/net/wireless/ath/ath11k/core.h
> Auto-merging drivers/net/wireless/ath/ath11k/core.c
> [add_wait_mgmt_ath-202111221436 954d233dbe34] ath11k: add wait
> operation for tx management packets for flush from mac80211
>
> On 11/22/2021 8:19 PM, Kalle Valo wrote:
>> Wen Gong <[email protected]> writes:
>>
>>> In ath11k, tx of management packet is doing in a work queue. Sometimes
>>> the workqueue does not finish tx immediately, then it lead after the
>>> next
>>> step of vdev delete finished, it start to send the management packet to
>>> firmware and lead firmware crash.
>>>
>>> ieee80211_set_disassoc have logic of ieee80211_flush_queues after it
>>> send_deauth_disassoc to ath11k, its purpose is make sure the deauth
>>> was actually sent, so it need to change ath11k to match the purpose
>>> of mac80211.
>>>
>>> To address these issue wait for tx mgmt and tx data packets.
>>>
>>> Tested-on: QCA6390 hw2.0 PCI
>>> WLAN.HST.1.0.1-01230-QCAHSTSWPLZ_V2_TO_X86-1
>>>
>>> Signed-off-by: Wen Gong <[email protected]>
>> I sometimes see new warnings with this patch:
>>
>> [  142.346474] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit
>> queue 0
>> [  147.466367] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit
>> queue 0
>>
>> And more importantly this breaks suspend on my NUC testbox:
>>
>> [  175.141820] PM: suspend entry (deep)
>> [  175.150512] Filesystems sync: 0.008 seconds
>> [  175.229703] Freezing user space processes ... (elapsed 0.003
>> seconds) done.
>> [  175.233506] OOM killer disabled.
>> [  175.233588] Freezing remaining freezable tasks ... (elapsed 0.001
>> seconds) done.
>> [  175.242457] printk: Suspending console(s) (use no_console_suspend
>> to debug)
>> [  175.245269] wlan0: deauthenticating from 00:03:7f:48:81:59 by
>> local choice (Reason: 3=DEAUTH_LEAVING)
>> [  175.285881] e1000e: EEE TX LPI TIMER: 00000011
>> [  180.745259] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit
>> queue 0
>> [  185.864977] ath11k_pci 0000:06:00.0: failed to flush mgmt transmit
>> queue 0
>> [  185.970601] PM: Some devices failed to suspend, or early wake
>> event detected
>> [  185.974081] usb usb3: root hub lost power or was reset
>> [  185.974135] usb usb4: root hub lost power or was reset
>> [  186.098870] nvme nvme0: 8/0/0 default/read/poll queues
>> [  186.297077] OOM killer enabled.
>> [  186.297361] Restarting tasks ... done.
>> [  186.343733] PM: suspend exit
>>
>> My setup:
>>
>> Tag:    ath-202111221111
>>
>> [  151.123732] ath11k_pci 0000:06:00.0: MSI vectors: 32
>> [  151.123851] ath11k_pci 0000:06:00.0: qca6390 hw2.0
>> [  151.720618] ath11k_pci 0000:06:00.0: chip_id 0x0 chip_family 0xb
>> board_id 0xff soc_id 0xffffffff
>> [  151.720912] ath11k_pci 0000:06:00.0: fw_version 0x101c06cc
>> fw_build_timestamp 2020-06-24 19:50 fw_build_id
>>