There are sleep in atomic context bugs when the request to secure
element of st21nfca is timeout. The root cause is that kzalloc and
alloc_skb with GFP_KERNEL parameter and mutex_lock are called in
st21nfca_se_wt_timeout which is a timer handler. The call tree shows
the execution paths that could lead to bugs:
(Interrupt context)
st21nfca_se_wt_timeout
nfc_hci_send_event
nfc_hci_hcp_message_tx
kzalloc(..., GFP_KERNEL) //may sleep
alloc_skb(..., GFP_KERNEL) //may sleep
mutex_lock() //may sleep
This patch changes allocation mode of kzalloc and alloc_skb from
GFP_KERNEL to GFP_ATOMIC and changes mutex_lock to spin_lock in
order to prevent atomic context from sleeping.
Fixes: 2130fb97fecf ("NFC: st21nfca: Adding support for secure element")
Signed-off-by: Duoming Zhou <[email protected]>
---
Changes in v2:
- Change mutex_lock to spin_lock.
include/net/nfc/hci.h | 3 ++-
net/nfc/hci/core.c | 18 +++++++++---------
net/nfc/hci/hcp.c | 10 +++++-----
3 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h
index 756c11084f6..8f66e6e6b91 100644
--- a/include/net/nfc/hci.h
+++ b/include/net/nfc/hci.h
@@ -103,7 +103,8 @@ struct nfc_hci_dev {
bool shutting_down;
- struct mutex msg_tx_mutex;
+ /* The spinlock is used to protect resources related with hci message TX */
+ spinlock_t msg_tx_spin;
struct list_head msg_tx_queue;
diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index ceb87db57cd..fa22f9fe5fc 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -68,7 +68,7 @@ static void nfc_hci_msg_tx_work(struct work_struct *work)
struct sk_buff *skb;
int r = 0;
- mutex_lock(&hdev->msg_tx_mutex);
+ spin_lock(&hdev->msg_tx_spin);
if (hdev->shutting_down)
goto exit;
@@ -120,7 +120,7 @@ static void nfc_hci_msg_tx_work(struct work_struct *work)
msecs_to_jiffies(hdev->cmd_pending_msg->completion_delay));
exit:
- mutex_unlock(&hdev->msg_tx_mutex);
+ spin_unlock(&hdev->msg_tx_spin);
}
static void nfc_hci_msg_rx_work(struct work_struct *work)
@@ -165,7 +165,7 @@ static void __nfc_hci_cmd_completion(struct nfc_hci_dev *hdev, int err,
void nfc_hci_resp_received(struct nfc_hci_dev *hdev, u8 result,
struct sk_buff *skb)
{
- mutex_lock(&hdev->msg_tx_mutex);
+ spin_lock(&hdev->msg_tx_spin);
if (hdev->cmd_pending_msg == NULL) {
kfree_skb(skb);
@@ -175,7 +175,7 @@ void nfc_hci_resp_received(struct nfc_hci_dev *hdev, u8 result,
__nfc_hci_cmd_completion(hdev, nfc_hci_result_to_errno(result), skb);
exit:
- mutex_unlock(&hdev->msg_tx_mutex);
+ spin_unlock(&hdev->msg_tx_spin);
}
void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
@@ -833,7 +833,7 @@ static int hci_se_io(struct nfc_dev *nfc_dev, u32 se_idx,
static void nfc_hci_failure(struct nfc_hci_dev *hdev, int err)
{
- mutex_lock(&hdev->msg_tx_mutex);
+ spin_lock(&hdev->msg_tx_spin);
if (hdev->cmd_pending_msg == NULL) {
nfc_driver_failure(hdev->ndev, err);
@@ -843,7 +843,7 @@ static void nfc_hci_failure(struct nfc_hci_dev *hdev, int err)
__nfc_hci_cmd_completion(hdev, err, NULL);
exit:
- mutex_unlock(&hdev->msg_tx_mutex);
+ spin_unlock(&hdev->msg_tx_spin);
}
static void nfc_hci_llc_failure(struct nfc_hci_dev *hdev, int err)
@@ -1009,7 +1009,7 @@ EXPORT_SYMBOL(nfc_hci_free_device);
int nfc_hci_register_device(struct nfc_hci_dev *hdev)
{
- mutex_init(&hdev->msg_tx_mutex);
+ spin_lock_init(&hdev->msg_tx_spin);
INIT_LIST_HEAD(&hdev->msg_tx_queue);
@@ -1031,7 +1031,7 @@ void nfc_hci_unregister_device(struct nfc_hci_dev *hdev)
{
struct hci_msg *msg, *n;
- mutex_lock(&hdev->msg_tx_mutex);
+ spin_lock(&hdev->msg_tx_spin);
if (hdev->cmd_pending_msg) {
if (hdev->cmd_pending_msg->cb)
@@ -1044,7 +1044,7 @@ void nfc_hci_unregister_device(struct nfc_hci_dev *hdev)
hdev->shutting_down = true;
- mutex_unlock(&hdev->msg_tx_mutex);
+ spin_unlock(&hdev->msg_tx_spin);
del_timer_sync(&hdev->cmd_timer);
cancel_work_sync(&hdev->msg_tx_work);
diff --git a/net/nfc/hci/hcp.c b/net/nfc/hci/hcp.c
index 05c60988f59..f7eccb4ce35 100644
--- a/net/nfc/hci/hcp.c
+++ b/net/nfc/hci/hcp.c
@@ -30,7 +30,7 @@ int nfc_hci_hcp_message_tx(struct nfc_hci_dev *hdev, u8 pipe,
int hci_len, err;
bool firstfrag = true;
- cmd = kzalloc(sizeof(struct hci_msg), GFP_KERNEL);
+ cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC);
if (cmd == NULL)
return -ENOMEM;
@@ -58,7 +58,7 @@ int nfc_hci_hcp_message_tx(struct nfc_hci_dev *hdev, u8 pipe,
data_link_len + ndev->tx_tailroom;
hci_len -= data_link_len;
- skb = alloc_skb(skb_len, GFP_KERNEL);
+ skb = alloc_skb(skb_len, GFP_ATOMIC);
if (skb == NULL) {
err = -ENOMEM;
goto out_skb_err;
@@ -90,16 +90,16 @@ int nfc_hci_hcp_message_tx(struct nfc_hci_dev *hdev, u8 pipe,
skb_queue_tail(&cmd->msg_frags, skb);
}
- mutex_lock(&hdev->msg_tx_mutex);
+ spin_lock(&hdev->msg_tx_spin);
if (hdev->shutting_down) {
err = -ESHUTDOWN;
- mutex_unlock(&hdev->msg_tx_mutex);
+ spin_unlock(&hdev->msg_tx_spin);
goto out_skb_err;
}
list_add_tail(&cmd->msg_l, &hdev->msg_tx_queue);
- mutex_unlock(&hdev->msg_tx_mutex);
+ spin_unlock(&hdev->msg_tx_spin);
schedule_work(&hdev->msg_tx_work);
--
2.17.1
On 17/05/2022 12:55, Duoming Zhou wrote:
> There are sleep in atomic context bugs when the request to secure
> element of st21nfca is timeout. The root cause is that kzalloc and
> alloc_skb with GFP_KERNEL parameter and mutex_lock are called in
> st21nfca_se_wt_timeout which is a timer handler. The call tree shows
> the execution paths that could lead to bugs:
>
> (Interrupt context)
> st21nfca_se_wt_timeout
> nfc_hci_send_event
> nfc_hci_hcp_message_tx
> kzalloc(..., GFP_KERNEL) //may sleep
> alloc_skb(..., GFP_KERNEL) //may sleep
> mutex_lock() //may sleep
>
> This patch changes allocation mode of kzalloc and alloc_skb from
> GFP_KERNEL to GFP_ATOMIC and changes mutex_lock to spin_lock in
> order to prevent atomic context from sleeping.
>
> Fixes: 2130fb97fecf ("NFC: st21nfca: Adding support for secure element")
> Signed-off-by: Duoming Zhou <[email protected]>
> ---
> Changes in v2:
> - Change mutex_lock to spin_lock.
>
> include/net/nfc/hci.h | 3 ++-
> net/nfc/hci/core.c | 18 +++++++++---------
> net/nfc/hci/hcp.c | 10 +++++-----
> 3 files changed, 16 insertions(+), 15 deletions(-)
>
> diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h
> index 756c11084f6..8f66e6e6b91 100644
> --- a/include/net/nfc/hci.h
> +++ b/include/net/nfc/hci.h
> @@ -103,7 +103,8 @@ struct nfc_hci_dev {
>
> bool shutting_down;
>
> - struct mutex msg_tx_mutex;
> + /* The spinlock is used to protect resources related with hci message TX */
> + spinlock_t msg_tx_spin;
>
> struct list_head msg_tx_queue;
>
> diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
> index ceb87db57cd..fa22f9fe5fc 100644
> --- a/net/nfc/hci/core.c
> +++ b/net/nfc/hci/core.c
> @@ -68,7 +68,7 @@ static void nfc_hci_msg_tx_work(struct work_struct *work)
> struct sk_buff *skb;
> int r = 0;
>
> - mutex_lock(&hdev->msg_tx_mutex);
> + spin_lock(&hdev->msg_tx_spin);
> if (hdev->shutting_down)
> goto exit;
How did you test your patch?
Did you check, really check, that this can be an atomic (non-sleeping)
section?
I have doubts because I found at least one path leading to device_lock
(which is a mutex) called within your new code.
Before sending a new version, please wait for discussion to reach some
consensus. The quality of these fixes is really poor. :(
Best regards,
Krzysztof
Hello,
On Tue, 17 May 2022 13:42:41 +0200 Krzysztof wrote:
> On 17/05/2022 12:55, Duoming Zhou wrote:
> > There are sleep in atomic context bugs when the request to secure
> > element of st21nfca is timeout. The root cause is that kzalloc and
> > alloc_skb with GFP_KERNEL parameter and mutex_lock are called in
> > st21nfca_se_wt_timeout which is a timer handler. The call tree shows
> > the execution paths that could lead to bugs:
> >
> > (Interrupt context)
> > st21nfca_se_wt_timeout
> > nfc_hci_send_event
> > nfc_hci_hcp_message_tx
> > kzalloc(..., GFP_KERNEL) //may sleep
> > alloc_skb(..., GFP_KERNEL) //may sleep
> > mutex_lock() //may sleep
> >
> > This patch changes allocation mode of kzalloc and alloc_skb from
> > GFP_KERNEL to GFP_ATOMIC and changes mutex_lock to spin_lock in
> > order to prevent atomic context from sleeping.
> >
> > Fixes: 2130fb97fecf ("NFC: st21nfca: Adding support for secure element")
> > Signed-off-by: Duoming Zhou <[email protected]>
> > ---
> > Changes in v2:
> > - Change mutex_lock to spin_lock.
> >
> > include/net/nfc/hci.h | 3 ++-
> > net/nfc/hci/core.c | 18 +++++++++---------
> > net/nfc/hci/hcp.c | 10 +++++-----
> > 3 files changed, 16 insertions(+), 15 deletions(-)
> >
> > diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h
> > index 756c11084f6..8f66e6e6b91 100644
> > --- a/include/net/nfc/hci.h
> > +++ b/include/net/nfc/hci.h
> > @@ -103,7 +103,8 @@ struct nfc_hci_dev {
> >
> > bool shutting_down;
> >
> > - struct mutex msg_tx_mutex;
> > + /* The spinlock is used to protect resources related with hci message TX */
> > + spinlock_t msg_tx_spin;
> >
> > struct list_head msg_tx_queue;
> >
> > diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
> > index ceb87db57cd..fa22f9fe5fc 100644
> > --- a/net/nfc/hci/core.c
> > +++ b/net/nfc/hci/core.c
> > @@ -68,7 +68,7 @@ static void nfc_hci_msg_tx_work(struct work_struct *work)
> > struct sk_buff *skb;
> > int r = 0;
> >
> > - mutex_lock(&hdev->msg_tx_mutex);
> > + spin_lock(&hdev->msg_tx_spin);
> > if (hdev->shutting_down)
> > goto exit;
>
> How did you test your patch?
>
> Did you check, really check, that this can be an atomic (non-sleeping)
> section?
>
> I have doubts because I found at least one path leading to device_lock
> (which is a mutex) called within your new code.
The nfc_hci_hcp_message_tx() is called by both process context(hci_dev_up and so on)
and interrupt context(st21nfca_se_wt_timeout()). The process context(hci_dev_up and so on)
calls device_lock, but I think calling spin_lock() within device_lock() is ok. There is
no device_lock() called within spin_lock().
The spinlock could also improve the performance of the program, because processing the
hci messages should be finished in a short time.
> Before sending a new version, please wait for discussion to reach some
> consensus. The quality of these fixes is really poor. :(
Ok, I will wait for discussion to reach consensus.
Best regards,
Duoming Zhou
On 18/05/2022 06:39, [email protected] wrote:
>> There is.
>>
>> nfc_hci_failure -> spin lock -> nfc_driver_failure -> nfc_targets_found
>> -> device_lock
>>
>> I found it just by a very quick look, so I suspect there are several
>> other places, not really checked.
>
> I agree with you, the spin_lock is not a good solution to this problem. There is another solution:
>
> We could put the nfc_hci_send_event() of st21nfca_se_wt_timeout() in a work item, then, using
> schedule_work() in st21nfca_se_wt_timeout() to execute the work item. The schedule_work() will
> wake up another kernel thread which is in process context to execute the bottom half of the interrupt,
> so it allows sleep.
>
> The following is the details.
Yes, this seems good solution. You might also need to add
cancel_work_sync to all places removing the timer.
Best regards,
Krzysztof
Hello,
Date: Wed, 18 May 2022 11:39:27 +0200 Krzysztof wrote:
> >> There is.
> >>
> >> nfc_hci_failure -> spin lock -> nfc_driver_failure -> nfc_targets_found
> >> -> device_lock
> >>
> >> I found it just by a very quick look, so I suspect there are several
> >> other places, not really checked.
> >
> > I agree with you, the spin_lock is not a good solution to this problem. There is another solution:
> >
> > We could put the nfc_hci_send_event() of st21nfca_se_wt_timeout() in a work item, then, using
> > schedule_work() in st21nfca_se_wt_timeout() to execute the work item. The schedule_work() will
> > wake up another kernel thread which is in process context to execute the bottom half of the interrupt,
> > so it allows sleep.
> >
> > The following is the details.
>
> Yes, this seems good solution. You might also need to add
> cancel_work_sync to all places removing the timer.
Thanks for your approval.
There are two del_timer_sync() functions related with bwi_timer timer.
I neglect one site in st21nfca_apdu_reader_event_received(), I add
cancel_work_sync() in it this time.
diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c
index c922f10d0d7..7e213f8ddc9 100644
--- a/drivers/nfc/st21nfca/se.c
+++ b/drivers/nfc/st21nfca/se.c
@@ -241,7 +241,7 @@ int st21nfca_hci_se_io(struct nfc_hci_dev *hdev, u32 se_idx,
}
EXPORT_SYMBOL(st21nfca_hci_se_io);
-static void st21nfca_se_wt_timeout(struct timer_list *t)
+static void st21nfca_se_wt_work(struct work_struct *work)
{
/*
* No answer from the secure element
@@ -254,8 +254,9 @@ static void st21nfca_se_wt_timeout(struct timer_list *t)
*/
/* hardware reset managed through VCC_UICC_OUT power supply */
u8 param = 0x01;
- struct st21nfca_hci_info *info = from_timer(info, t,
- se_info.bwi_timer);
+ struct st21nfca_hci_info *info = container_of(work,
+ struct st21nfca_hci_info,
+ se_info.timeout_work);
info->se_info.bwi_active = false;
@@ -271,6 +272,13 @@ static void st21nfca_se_wt_timeout(struct timer_list *t)
info->se_info.cb(info->se_info.cb_context, NULL, 0, -ETIME);
}
+static void st21nfca_se_wt_timeout(struct timer_list *t)
+{
+ struct st21nfca_hci_info *info = from_timer(info, t, se_info.bwi_timer);
+
+ schedule_work(&info->se_info.timeout_work);
+}
+
static void st21nfca_se_activation_timeout(struct timer_list *t)
{
struct st21nfca_hci_info *info = from_timer(info, t,
@@ -360,6 +368,7 @@ int st21nfca_apdu_reader_event_received(struct nfc_hci_dev *hdev,
switch (event) {
case ST21NFCA_EVT_TRANSMIT_DATA:
del_timer_sync(&info->se_info.bwi_timer);
+ cancel_work_sync(&info->se_info.timeout_work);
info->se_info.bwi_active = false;
r = nfc_hci_send_event(hdev, ST21NFCA_DEVICE_MGNT_GATE,
ST21NFCA_EVT_SE_END_OF_APDU_TRANSFER, NULL, 0);
@@ -389,6 +398,7 @@ void st21nfca_se_init(struct nfc_hci_dev *hdev)
struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev);
init_completion(&info->se_info.req_completion);
+ INIT_WORK(&info->se_info.timeout_work, st21nfca_se_wt_work);
/* initialize timers */
timer_setup(&info->se_info.bwi_timer, st21nfca_se_wt_timeout, 0);
info->se_info.bwi_active = false;
@@ -416,6 +426,7 @@ void st21nfca_se_deinit(struct nfc_hci_dev *hdev)
if (info->se_info.se_active)
del_timer_sync(&info->se_info.se_active_timer);
+ cancel_work_sync(&info->se_info.timeout_work);
info->se_info.bwi_active = false;
info->se_info.se_active = false;
}
diff --git a/drivers/nfc/st21nfca/st21nfca.h b/drivers/nfc/st21nfca/st21nfca.h
index cb6ad916be9..ae6771cc989 100644
--- a/drivers/nfc/st21nfca/st21nfca.h
+++ b/drivers/nfc/st21nfca/st21nfca.h
@@ -141,6 +141,7 @@ struct st21nfca_se_info {
se_io_cb_t cb;
void *cb_context;
+ struct work_struct timeout_work;
};
struct st21nfca_hci_info {
If you think this solution is ok, I will send "PATCH v3".
Best regards,
Duoming Zhou
On 18/05/2022 13:05, [email protected] wrote:
> Hello,
>
>
> struct st21nfca_hci_info {
>
> If you think this solution is ok, I will send "PATCH v3".
More or less, send entire patch, so we'll see.
>
> Best regards,
> Duoming Zhou
Best regards,
Krzysztof