2021-07-13 16:31:37

by Desmond Cheong Zhi Xi

[permalink] [raw]
Subject: [PATCH v2] Bluetooth: fix inconsistent lock state in sco

Syzbot reported an inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} lock
usage in sco_conn_del and sco_sock_timeout that could lead to
deadlocks.

This inconsistent lock state can also happen in sco_conn_ready,
rfcomm_connect_ind, and bt_accept_enqueue.

The issue is that these functions take a spin lock on the socket with
interrupts enabled, but sco_sock_timeout takes the lock in an IRQ
context. This could lead to deadlocks:

CPU0
----
lock(slock-AF_BLUETOOTH-BTPROTO_SCO);
<Interrupt>
lock(slock-AF_BLUETOOTH-BTPROTO_SCO);

*** DEADLOCK ***

We fix this by ensuring that local bh is disabled before calling
bh_lock_sock.

After doing this, we additionally need to protect sco_conn_lock by
disabling local bh.

This is necessary because sco_conn_del makes a call to sco_chan_del
while holding on to the sock lock, and sco_chan_del itself makes a
call to sco_conn_lock. If sco_conn_lock is held elsewhere with
interrupts enabled, there could still be a
slock-AF_BLUETOOTH-BTPROTO_SCO --> &conn->lock#2 lock inversion as
follows:

CPU0 CPU1
---- ----
lock(&conn->lock#2);
local_irq_disable();
lock(slock-AF_BLUETOOTH-BTPROTO_SCO);
lock(&conn->lock#2);
<Interrupt>
lock(slock-AF_BLUETOOTH-BTPROTO_SCO);

*** DEADLOCK ***

As sco_conn_del now disables local bh before calling sco_chan_del,
instead of disabling local bh for the calls to sco_conn_lock in
sco_chan_del, we instead wrap other calls to sco_chan_del with
local_bh_disable/enable.

Reported-by: [email protected]
Tested-by: [email protected]
Signed-off-by: Desmond Cheong Zhi Xi <[email protected]>
---

Hi,

The previous version of this patch was a bit of a mess, so I made the
following changes.

v1 -> v2:
- Instead of pulling out the clean-up code out from sco_chan_del and
using it directly in sco_conn_del, disable local irqs for relevant
sections.
- Disable local irqs more thoroughly for instances of
bh_lock_sock/bh_lock_sock_nested in the bluetooth subsystem.
Specifically, the calls in af_bluetooth.c and rfcomm/sock.c are now made
with local irqs disabled as well.

Best wishes,
Desmond

net/bluetooth/rfcomm/sock.c | 2 ++
net/bluetooth/sco.c | 26 +++++++++++++++++++++++++-
2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index ae6f80730561..d8734abb2df4 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -974,6 +974,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
if (!parent)
return 0;

+ local_bh_disable();
bh_lock_sock(parent);

/* Check for backlog size */
@@ -1002,6 +1003,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *

done:
bh_unlock_sock(parent);
+ local_bh_enable();

if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
parent->sk_state_change(parent);
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 3bd41563f118..2548b8f81473 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -167,16 +167,22 @@ static void sco_conn_del(struct hci_conn *hcon, int err)
BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);

/* Kill socket */
+ local_bh_disable();
sco_conn_lock(conn);
sk = conn->sk;
sco_conn_unlock(conn);
+ local_bh_enable();

if (sk) {
sock_hold(sk);
+
+ local_bh_disable();
bh_lock_sock(sk);
sco_sock_clear_timer(sk);
sco_chan_del(sk, err);
bh_unlock_sock(sk);
+ local_bh_enable();
+
sco_sock_kill(sk);
sock_put(sk);
}
@@ -202,6 +208,7 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
{
int err = 0;

+ local_bh_disable();
sco_conn_lock(conn);
if (conn->sk)
err = -EBUSY;
@@ -209,6 +216,7 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
__sco_chan_add(conn, sk, parent);

sco_conn_unlock(conn);
+ local_bh_enable();
return err;
}

@@ -303,9 +311,11 @@ static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
{
struct sock *sk;

+ local_bh_disable();
sco_conn_lock(conn);
sk = conn->sk;
sco_conn_unlock(conn);
+ local_bh_enable();

if (!sk)
goto drop;
@@ -420,18 +430,25 @@ static void __sco_sock_close(struct sock *sk)
if (sco_pi(sk)->conn->hcon) {
sk->sk_state = BT_DISCONN;
sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT);
+ local_bh_disable();
sco_conn_lock(sco_pi(sk)->conn);
hci_conn_drop(sco_pi(sk)->conn->hcon);
sco_pi(sk)->conn->hcon = NULL;
sco_conn_unlock(sco_pi(sk)->conn);
- } else
+ local_bh_enable();
+ } else {
+ local_bh_disable();
sco_chan_del(sk, ECONNRESET);
+ local_bh_enable();
+ }
break;

case BT_CONNECT2:
case BT_CONNECT:
case BT_DISCONN:
+ local_bh_disable();
sco_chan_del(sk, ECONNRESET);
+ local_bh_enable();
break;

default:
@@ -1084,21 +1101,26 @@ static void sco_conn_ready(struct sco_conn *conn)

if (sk) {
sco_sock_clear_timer(sk);
+ local_bh_disable();
bh_lock_sock(sk);
sk->sk_state = BT_CONNECTED;
sk->sk_state_change(sk);
bh_unlock_sock(sk);
+ local_bh_enable();
} else {
+ local_bh_disable();
sco_conn_lock(conn);

if (!conn->hcon) {
sco_conn_unlock(conn);
+ local_bh_enable();
return;
}

parent = sco_get_sock_listen(&conn->hcon->src);
if (!parent) {
sco_conn_unlock(conn);
+ local_bh_enable();
return;
}

@@ -1109,6 +1131,7 @@ static void sco_conn_ready(struct sco_conn *conn)
if (!sk) {
bh_unlock_sock(parent);
sco_conn_unlock(conn);
+ local_bh_enable();
return;
}

@@ -1131,6 +1154,7 @@ static void sco_conn_ready(struct sco_conn *conn)
bh_unlock_sock(parent);

sco_conn_unlock(conn);
+ local_bh_enable();
}
}

--
2.25.1


2021-07-13 17:27:49

by bluez.test.bot

[permalink] [raw]
Subject: RE: [v2] Bluetooth: fix inconsistent lock state in sco

This is automated email and please do not reply to this email!

Dear submitter,

Thank you for submitting the patches to the linux bluetooth mailing list.
This is a CI test results with your patch series:
PW Link:https://patchwork.kernel.org/project/bluetooth/list/?series=514869

---Test result---

Test Summary:
CheckPatch PASS 0.60 seconds
GitLint PASS 0.10 seconds
BuildKernel PASS 501.68 seconds
TestRunner: Setup PASS 336.91 seconds
TestRunner: l2cap-tester PASS 2.57 seconds
TestRunner: bnep-tester PASS 1.93 seconds
TestRunner: mgmt-tester PASS 29.64 seconds
TestRunner: rfcomm-tester PASS 2.08 seconds
TestRunner: sco-tester PASS 2.04 seconds
TestRunner: smp-tester FAIL 2.12 seconds
TestRunner: userchan-tester PASS 1.94 seconds

Details
##############################
Test: CheckPatch - PASS - 0.60 seconds
Run checkpatch.pl script with rule in .checkpatch.conf


##############################
Test: GitLint - PASS - 0.10 seconds
Run gitlint with rule in .gitlint


##############################
Test: BuildKernel - PASS - 501.68 seconds
Build Kernel with minimal configuration supports Bluetooth


##############################
Test: TestRunner: Setup - PASS - 336.91 seconds
Setup environment for running Test Runner


##############################
Test: TestRunner: l2cap-tester - PASS - 2.57 seconds
Run test-runner with l2cap-tester
Total: 40, Passed: 40 (100.0%), Failed: 0, Not Run: 0

##############################
Test: TestRunner: bnep-tester - PASS - 1.93 seconds
Run test-runner with bnep-tester
Total: 1, Passed: 1 (100.0%), Failed: 0, Not Run: 0

##############################
Test: TestRunner: mgmt-tester - PASS - 29.64 seconds
Run test-runner with mgmt-tester
Total: 446, Passed: 443 (99.3%), Failed: 0, Not Run: 3

##############################
Test: TestRunner: rfcomm-tester - PASS - 2.08 seconds
Run test-runner with rfcomm-tester
Total: 9, Passed: 9 (100.0%), Failed: 0, Not Run: 0

##############################
Test: TestRunner: sco-tester - PASS - 2.04 seconds
Run test-runner with sco-tester
Total: 8, Passed: 8 (100.0%), Failed: 0, Not Run: 0

##############################
Test: TestRunner: smp-tester - FAIL - 2.12 seconds
Run test-runner with smp-tester
Total: 8, Passed: 7 (87.5%), Failed: 1, Not Run: 0

Failed Test Cases
SMP Client - SC Request 2 Failed 0.027 seconds

##############################
Test: TestRunner: userchan-tester - PASS - 1.94 seconds
Run test-runner with userchan-tester
Total: 3, Passed: 3 (100.0%), Failed: 0, Not Run: 0



---
Regards,
Linux Bluetooth


Attachments:
l2cap-tester.log (43.31 kB)
bnep-tester.log (3.47 kB)
mgmt-tester.log (600.00 kB)
rfcomm-tester.log (11.40 kB)
sco-tester.log (9.68 kB)
smp-tester.log (11.43 kB)
userchan-tester.log (5.33 kB)
Download all attachments

2021-07-15 02:33:51

by Desmond Cheong Zhi Xi

[permalink] [raw]
Subject: Re: [PATCH v2] Bluetooth: fix inconsistent lock state in sco

On 15/7/21 3:12 am, Luiz Augusto von Dentz wrote:
> Hi Desmond,
>
> On Tue, Jul 13, 2021 at 9:29 AM Desmond Cheong Zhi Xi
> <[email protected]> wrote:
>>
>> Syzbot reported an inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} lock
>> usage in sco_conn_del and sco_sock_timeout that could lead to
>> deadlocks.
>>
>> This inconsistent lock state can also happen in sco_conn_ready,
>> rfcomm_connect_ind, and bt_accept_enqueue.
>>
>> The issue is that these functions take a spin lock on the socket with
>> interrupts enabled, but sco_sock_timeout takes the lock in an IRQ
>> context. This could lead to deadlocks:
>>
>> CPU0
>> ----
>> lock(slock-AF_BLUETOOTH-BTPROTO_SCO);
>> <Interrupt>
>> lock(slock-AF_BLUETOOTH-BTPROTO_SCO);
>>
>> *** DEADLOCK ***
>>
>> We fix this by ensuring that local bh is disabled before calling
>> bh_lock_sock.
>>
>> After doing this, we additionally need to protect sco_conn_lock by
>> disabling local bh.
>>
>> This is necessary because sco_conn_del makes a call to sco_chan_del
>> while holding on to the sock lock, and sco_chan_del itself makes a
>> call to sco_conn_lock. If sco_conn_lock is held elsewhere with
>> interrupts enabled, there could still be a
>> slock-AF_BLUETOOTH-BTPROTO_SCO --> &conn->lock#2 lock inversion as
>> follows:
>>
>> CPU0 CPU1
>> ---- ----
>> lock(&conn->lock#2);
>> local_irq_disable();
>> lock(slock-AF_BLUETOOTH-BTPROTO_SCO);
>> lock(&conn->lock#2);
>> <Interrupt>
>> lock(slock-AF_BLUETOOTH-BTPROTO_SCO);
>>
>> *** DEADLOCK ***
>>
>> As sco_conn_del now disables local bh before calling sco_chan_del,
>> instead of disabling local bh for the calls to sco_conn_lock in
>> sco_chan_del, we instead wrap other calls to sco_chan_del with
>> local_bh_disable/enable.
>>
>> Reported-by: [email protected]
>> Tested-by: [email protected]
>> Signed-off-by: Desmond Cheong Zhi Xi <[email protected]>
>> ---
>>
>> Hi,
>>
>> The previous version of this patch was a bit of a mess, so I made the
>> following changes.
>>
>> v1 -> v2:
>> - Instead of pulling out the clean-up code out from sco_chan_del and
>> using it directly in sco_conn_del, disable local irqs for relevant
>> sections.
>> - Disable local irqs more thoroughly for instances of
>> bh_lock_sock/bh_lock_sock_nested in the bluetooth subsystem.
>> Specifically, the calls in af_bluetooth.c and rfcomm/sock.c are now made
>> with local irqs disabled as well.
>>
>> Best wishes,
>> Desmond
>>
>> net/bluetooth/rfcomm/sock.c | 2 ++
>> net/bluetooth/sco.c | 26 +++++++++++++++++++++++++-
>> 2 files changed, 27 insertions(+), 1 deletion(-)
>>
>> diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
>> index ae6f80730561..d8734abb2df4 100644
>> --- a/net/bluetooth/rfcomm/sock.c
>> +++ b/net/bluetooth/rfcomm/sock.c
>> @@ -974,6 +974,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
>> if (!parent)
>> return 0;
>>
>> + local_bh_disable();
>> bh_lock_sock(parent);
>>
>> /* Check for backlog size */
>> @@ -1002,6 +1003,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
>>
>> done:
>> bh_unlock_sock(parent);
>> + local_bh_enable();
>
> Looks like you are touching RFCOMM as well, perhaps you should have it
> split, also how about other sockets like L2CAP and HCI are they
> affected? There seems to be a lot of problem with the likes of
> bh_lock_sock I wonder if going with local_bh_disable is overall a
> better way to handle.
>

Thanks for the feedback, Luiz. I'll separate the SCO and RFCOMM code
changes.

I believe other sockets should be fine. From what I see, they use
lock_sock, which acquires the spin lock via spin_lock_bh under the hood.
So only code that uses bh_lock_sock/bh_lock_sock_nested are affected.

Also I'm not sure what you meant by going with local_bh_disable? I'm
probably missing context about Bluetooth protocols, but I think the spin
locks still have their place to protect concurrent accesses and to make
it clear about what's being protected.

>> if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
>> parent->sk_state_change(parent);
>> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
>> index 3bd41563f118..2548b8f81473 100644
>> --- a/net/bluetooth/sco.c
>> +++ b/net/bluetooth/sco.c
>> @@ -167,16 +167,22 @@ static void sco_conn_del(struct hci_conn *hcon, int err)
>> BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
>>
>> /* Kill socket */
>> + local_bh_disable();
>> sco_conn_lock(conn);
>> sk = conn->sk;
>> sco_conn_unlock(conn);
>> + local_bh_enable();
>>
>> if (sk) {
>> sock_hold(sk);
>> +
>> + local_bh_disable();
>> bh_lock_sock(sk);
>> sco_sock_clear_timer(sk);
>> sco_chan_del(sk, err);
>> bh_unlock_sock(sk);
>> + local_bh_enable();
>> +
>> sco_sock_kill(sk);
>> sock_put(sk);
>> }
>> @@ -202,6 +208,7 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
>> {
>> int err = 0;
>>
>> + local_bh_disable();
>> sco_conn_lock(conn);
>> if (conn->sk)
>> err = -EBUSY;
>> @@ -209,6 +216,7 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
>> __sco_chan_add(conn, sk, parent);
>>
>> sco_conn_unlock(conn);
>> + local_bh_enable();
>> return err;
>> }
>>
>> @@ -303,9 +311,11 @@ static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
>> {
>> struct sock *sk;
>>
>> + local_bh_disable();
>> sco_conn_lock(conn);
>> sk = conn->sk;
>> sco_conn_unlock(conn);
>> + local_bh_enable();
>>
>> if (!sk)
>> goto drop;
>> @@ -420,18 +430,25 @@ static void __sco_sock_close(struct sock *sk)
>> if (sco_pi(sk)->conn->hcon) {
>> sk->sk_state = BT_DISCONN;
>> sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT);
>> + local_bh_disable();
>> sco_conn_lock(sco_pi(sk)->conn);
>> hci_conn_drop(sco_pi(sk)->conn->hcon);
>> sco_pi(sk)->conn->hcon = NULL;
>> sco_conn_unlock(sco_pi(sk)->conn);
>> - } else
>> + local_bh_enable();
>> + } else {
>> + local_bh_disable();
>> sco_chan_del(sk, ECONNRESET);
>> + local_bh_enable();
>> + }
>> break;
>>
>> case BT_CONNECT2:
>> case BT_CONNECT:
>> case BT_DISCONN:
>> + local_bh_disable();
>> sco_chan_del(sk, ECONNRESET);
>> + local_bh_enable();
>> break;
>>
>> default:
>> @@ -1084,21 +1101,26 @@ static void sco_conn_ready(struct sco_conn *conn)
>>
>> if (sk) {
>> sco_sock_clear_timer(sk);
>> + local_bh_disable();
>> bh_lock_sock(sk);
>> sk->sk_state = BT_CONNECTED;
>> sk->sk_state_change(sk);
>> bh_unlock_sock(sk);
>> + local_bh_enable();
>> } else {
>> + local_bh_disable();
>> sco_conn_lock(conn);
>>
>> if (!conn->hcon) {
>> sco_conn_unlock(conn);
>> + local_bh_enable();
>> return;
>> }
>>
>> parent = sco_get_sock_listen(&conn->hcon->src);
>> if (!parent) {
>> sco_conn_unlock(conn);
>> + local_bh_enable();
>> return;
>> }
>>
>> @@ -1109,6 +1131,7 @@ static void sco_conn_ready(struct sco_conn *conn)
>> if (!sk) {
>> bh_unlock_sock(parent);
>> sco_conn_unlock(conn);
>> + local_bh_enable();
>> return;
>> }
>>
>> @@ -1131,6 +1154,7 @@ static void sco_conn_ready(struct sco_conn *conn)
>> bh_unlock_sock(parent);
>>
>> sco_conn_unlock(conn);
>> + local_bh_enable();
>> }
>> }
>>
>> --
>> 2.25.1
>>
>
>