Currently, the synchronization between ath9k_wmi_cmd() and
ath9k_wmi_ctrl_rx() is exposed to a race condition which, although
rather unlikely, can lead to invalid behaviour of ath9k_wmi_cmd().
Consider the following scenario:
        CPU0                                    CPU1

ath9k_wmi_cmd(...)
  mutex_lock(&wmi->op_mutex)
  ath9k_wmi_cmd_issue(...)
  wait_for_completion_timeout(...)
  ---
  timeout
  ---
                                        /* the callback is being processed
                                         * before last_seq_id became zero
                                         */
                                        ath9k_wmi_ctrl_rx(...)
                                          spin_lock_irqsave(...)
                                          /* wmi->last_seq_id check here
                                           * doesn't detect timeout yet
                                           */
                                          spin_unlock_irqrestore(...)
  /* last_seq_id is zeroed to
   * indicate there was a timeout
   */
  wmi->last_seq_id = 0
  mutex_unlock(&wmi->op_mutex)
  return -ETIMEDOUT

ath9k_wmi_cmd(...)
  mutex_lock(&wmi->op_mutex)
  /* the buffer is replaced with
   * another one
   */
  wmi->cmd_rsp_buf = rsp_buf
  wmi->cmd_rsp_len = rsp_len
  ath9k_wmi_cmd_issue(...)
    spin_lock_irqsave(...)
    spin_unlock_irqrestore(...)
  wait_for_completion_timeout(...)
                                        /* the continuation of the
                                         * callback left after the first
                                         * ath9k_wmi_cmd call
                                         */
                                          ath9k_wmi_rsp_callback(...)
                                            /* copying data designated
                                             * to already timed-out
                                             * WMI command into an
                                             * inappropriate wmi_cmd_buf
                                             */
                                            memcpy(...)
                                            complete(&wmi->cmd_wait)
  /* awakened by the bogus callback
   * => invalid return result
   */
  mutex_unlock(&wmi->op_mutex)
  return 0
To fix this, move ath9k_wmi_rsp_callback() under wmi_lock inside
ath9k_wmi_ctrl_rx() so that wmi->cmd_wait can be completed only for the
wmi_cmd call it was originally designated for; otherwise the path is
rejected by the last_seq_id check.
Also move the recording of the rsp buffer and length into
ath9k_wmi_cmd_issue(), under the same wmi_lock as the last_seq_id update,
to avoid racy changes to them.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: fb9987d0f748 ("ath9k_htc: Support for AR9271 chipset.")
Reported-and-tested-by: [email protected]
Signed-off-by: Fedor Pchelkin <[email protected]>
---
v2: do not extract ath9k_wmi_rsp_callback() internals, rephrase
description
drivers/net/wireless/ath/ath9k/wmi.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/drivers/net/wireless/ath/ath9k/wmi.c b/drivers/net/wireless/ath/ath9k/wmi.c
index d652c647d56b..688453a2e53a 100644
--- a/drivers/net/wireless/ath/ath9k/wmi.c
+++ b/drivers/net/wireless/ath/ath9k/wmi.c
@@ -242,10 +242,10 @@ static void ath9k_wmi_ctrl_rx(void *priv, struct sk_buff *skb,
spin_unlock_irqrestore(&wmi->wmi_lock, flags);
goto free_skb;
}
- spin_unlock_irqrestore(&wmi->wmi_lock, flags);
/* WMI command response */
ath9k_wmi_rsp_callback(wmi, skb);
+ spin_unlock_irqrestore(&wmi->wmi_lock, flags);
free_skb:
kfree_skb(skb);
@@ -283,7 +283,8 @@ int ath9k_wmi_connect(struct htc_target *htc, struct wmi *wmi,
static int ath9k_wmi_cmd_issue(struct wmi *wmi,
struct sk_buff *skb,
- enum wmi_cmd_id cmd, u16 len)
+ enum wmi_cmd_id cmd, u16 len,
+ u8 *rsp_buf, u32 rsp_len)
{
struct wmi_cmd_hdr *hdr;
unsigned long flags;
@@ -293,6 +294,11 @@ static int ath9k_wmi_cmd_issue(struct wmi *wmi,
hdr->seq_no = cpu_to_be16(++wmi->tx_seq_id);
spin_lock_irqsave(&wmi->wmi_lock, flags);
+
+ /* record the rsp buffer and length */
+ wmi->cmd_rsp_buf = rsp_buf;
+ wmi->cmd_rsp_len = rsp_len;
+
wmi->last_seq_id = wmi->tx_seq_id;
spin_unlock_irqrestore(&wmi->wmi_lock, flags);
@@ -333,11 +339,7 @@ int ath9k_wmi_cmd(struct wmi *wmi, enum wmi_cmd_id cmd_id,
goto out;
}
- /* record the rsp buffer and length */
- wmi->cmd_rsp_buf = rsp_buf;
- wmi->cmd_rsp_len = rsp_len;
-
- ret = ath9k_wmi_cmd_issue(wmi, skb, cmd_id, cmd_len);
+ ret = ath9k_wmi_cmd_issue(wmi, skb, cmd_id, cmd_len, rsp_buf, rsp_len);
if (ret)
goto out;
--
2.34.1
Hillf Danton <[email protected]> writes:
> On 24 Apr 2023 22:18:26 +0300 Fedor Pchelkin <[email protected]>
>> Currently, the synchronization between ath9k_wmi_cmd() and
>> ath9k_wmi_ctrl_rx() is exposed to a race condition which, although being
>> rather unlikely, can lead to invalid behaviour of ath9k_wmi_cmd().
>>
>> Consider the following scenario:
>>
>> CPU0 CPU1
>>
>> ath9k_wmi_cmd(...)
>> mutex_lock(&wmi->op_mutex)
>> ath9k_wmi_cmd_issue(...)
>> wait_for_completion_timeout(...)
>> ---
>> timeout
>> ---
>> /* the callback is being processed
>> * before last_seq_id became zero
>> */
>> ath9k_wmi_ctrl_rx(...)
>> spin_lock_irqsave(...)
>> /* wmi->last_seq_id check here
>> * doesn't detect timeout yet
>> */
>> spin_unlock_irqrestore(...)
>> /* last_seq_id is zeroed to
>> * indicate there was a timeout
>> */
>> wmi->last_seq_id = 0
>
> Without wmi->wmi_lock held, updating last_seq_id on the waiter side
> means it is random on the waker side, so the fix below is incorrect.
>
>> mutex_unlock(&wmi->op_mutex)
>> return -ETIMEDOUT
>>
>> ath9k_wmi_cmd(...)
>> mutex_lock(&wmi->op_mutex)
>> /* the buffer is replaced with
>> * another one
>> */
>> wmi->cmd_rsp_buf = rsp_buf
>> wmi->cmd_rsp_len = rsp_len
>> ath9k_wmi_cmd_issue(...)
>> spin_lock_irqsave(...)
>> spin_unlock_irqrestore(...)
>> wait_for_completion_timeout(...)
>> /* the continuation of the
>> * callback left after the first
>> * ath9k_wmi_cmd call
>> */
>> ath9k_wmi_rsp_callback(...)
>> /* copying data designated
>> * to already timed-out
>> * WMI command into an
>> * inappropriate wmi_cmd_buf
>> */
>> memcpy(...)
>> complete(&wmi->cmd_wait)
>> /* awakened by the bogus callback
>> * => invalid return result
>> */
>> mutex_unlock(&wmi->op_mutex)
>> return 0
>>
>> To fix this, move ath9k_wmi_rsp_callback() under wmi_lock inside
>> ath9k_wmi_ctrl_rx() so that the wmi->cmd_wait can be completed only for
>> initially designated wmi_cmd call, otherwise the path would be rejected
>> with last_seq_id check.
>>
>> Also move recording the rsp buffer and length into ath9k_wmi_cmd_issue()
>> under the same wmi_lock with last_seq_id update to avoid their racy
>> changes.
>
> Better in a separate one.
Adding linux-wireless; please always CC the list on wireless patches.
--
https://patchwork.kernel.org/project/linux-wireless/list/
https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches
On Tue, Apr 25, 2023 at 11:38:32AM +0800, Hillf Danton wrote:
> On 24 Apr 2023 22:18:26 +0300 Fedor Pchelkin <[email protected]>
> > Currently, the synchronization between ath9k_wmi_cmd() and
> > ath9k_wmi_ctrl_rx() is exposed to a race condition which, although being
> > rather unlikely, can lead to invalid behaviour of ath9k_wmi_cmd().
> >
> > Consider the following scenario:
> >
> > CPU0 CPU1
> >
> > ath9k_wmi_cmd(...)
> > mutex_lock(&wmi->op_mutex)
> > ath9k_wmi_cmd_issue(...)
> > wait_for_completion_timeout(...)
> > ---
> > timeout
> > ---
> > /* the callback is being processed
> > * before last_seq_id became zero
> > */
> > ath9k_wmi_ctrl_rx(...)
> > spin_lock_irqsave(...)
> > /* wmi->last_seq_id check here
> > * doesn't detect timeout yet
> > */
> > spin_unlock_irqrestore(...)
> > /* last_seq_id is zeroed to
> > * indicate there was a timeout
> > */
> > wmi->last_seq_id = 0
>
> Without wmi->wmi_lock held, updating last_seq_id on the waiter side
> means it is random on the waker side, so the fix below is incorrect.
>
Thank you for noticing! Of course, the last_seq_id update on the waiter
(timeout) side should also be done with wmi->wmi_lock held.
> > mutex_unlock(&wmi->op_mutex)
> > return -ETIMEDOUT
> >
> > ath9k_wmi_cmd(...)
> > mutex_lock(&wmi->op_mutex)
> > /* the buffer is replaced with
> > * another one
> > */
> > wmi->cmd_rsp_buf = rsp_buf
> > wmi->cmd_rsp_len = rsp_len
> > ath9k_wmi_cmd_issue(...)
> > spin_lock_irqsave(...)
> > spin_unlock_irqrestore(...)
> > wait_for_completion_timeout(...)
> > /* the continuation of the
> > * callback left after the first
> > * ath9k_wmi_cmd call
> > */
> > ath9k_wmi_rsp_callback(...)
> > /* copying data designated
> > * to already timed-out
> > * WMI command into an
> > * inappropriate wmi_cmd_buf
> > */
> > memcpy(...)
> > complete(&wmi->cmd_wait)
> > /* awakened by the bogus callback
> > * => invalid return result
> > */
> > mutex_unlock(&wmi->op_mutex)
> > return 0
> >
> > To fix this, move ath9k_wmi_rsp_callback() under wmi_lock inside
> > ath9k_wmi_ctrl_rx() so that the wmi->cmd_wait can be completed only for
> > initially designated wmi_cmd call, otherwise the path would be rejected
> > with last_seq_id check.
> >
> > Also move recording the rsp buffer and length into ath9k_wmi_cmd_issue()
> > under the same wmi_lock with last_seq_id update to avoid their racy
> > changes.
>
> Better in a separate one.
Well, they are parts of the same problem, but it now seems more
appropriate to split the patch in two: the first one for the incorrect
last_seq_id synchronization and the second one for recording the rsp
buffer under the lock. Thanks!