When the host controller is not responding, all URBs queued to all
endpoints need to be killed. This can cause a kernel panic if we
dereference an invalid endpoint.
Fix this by using xhci_get_virt_ep() helper to find the endpoint and
checking if the endpoint is valid before dereferencing it.
[233311.853271] xhci-hcd xhci-hcd.1.auto: xHCI host controller not responding, assume dead
[233311.853393] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000e8
[233311.853964] pc : xhci_hc_died+0x10c/0x270
[233311.853971] lr : xhci_hc_died+0x1ac/0x270
[233311.854077] Call trace:
[233311.854085] xhci_hc_died+0x10c/0x270
[233311.854093] xhci_stop_endpoint_command_watchdog+0x100/0x1a4
[233311.854105] call_timer_fn+0x50/0x2d4
[233311.854112] expire_timers+0xac/0x2e4
[233311.854118] run_timer_softirq+0x300/0xabc
[233311.854127] __do_softirq+0x148/0x528
[233311.854135] irq_exit+0x194/0x1a8
[233311.854143] __handle_domain_irq+0x164/0x1d0
[233311.854149] gic_handle_irq.22273+0x10c/0x188
[233311.854156] el1_irq+0xfc/0x1a8
[233311.854175] lpm_cpuidle_enter+0x25c/0x418 [msm_pm]
[233311.854185] cpuidle_enter_state+0x1f0/0x764
[233311.854194] do_idle+0x594/0x6ac
[233311.854201] cpu_startup_entry+0x7c/0x80
[233311.854209] secondary_start_kernel+0x170/0x198
Fixes: 50e8725e7c42 ("xhci: Refactor command watchdog and fix split string.")
Cc: [email protected]
Signed-off-by: Jimmy Hu <[email protected]>
---
drivers/usb/host/xhci-ring.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index ddc30037f9ce..f5b0e1ce22af 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -1169,7 +1169,10 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
struct xhci_virt_ep *ep;
struct xhci_ring *ring;
- ep = &xhci->devs[slot_id]->eps[ep_index];
+ ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
+ if (!ep)
+ return;
+
if ((ep->ep_state & EP_HAS_STREAMS) ||
(ep->ep_state & EP_GETTING_NO_STREAMS)) {
int stream_id;
--
2.39.0.314.g84b9a713c41-goog
On Thu, Dec 22, 2022 at 07:29:12AM +0000, Jimmy Hu wrote:
> When the host controller is not responding, all URBs queued to all
> endpoints need to be killed. This can cause a kernel panic if we
> dereference an invalid endpoint.
>
> Fix this by using xhci_get_virt_ep() helper to find the endpoint and
> checking if the endpoint is valid before dereferencing it.
>
> [233311.853271] xhci-hcd xhci-hcd.1.auto: xHCI host controller not responding, assume dead
> [233311.853393] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000e8
>
> [233311.853964] pc : xhci_hc_died+0x10c/0x270
> [233311.853971] lr : xhci_hc_died+0x1ac/0x270
>
> [233311.854077] Call trace:
> [233311.854085] xhci_hc_died+0x10c/0x270
> [233311.854093] xhci_stop_endpoint_command_watchdog+0x100/0x1a4
> [233311.854105] call_timer_fn+0x50/0x2d4
> [233311.854112] expire_timers+0xac/0x2e4
> [233311.854118] run_timer_softirq+0x300/0xabc
> [233311.854127] __do_softirq+0x148/0x528
> [233311.854135] irq_exit+0x194/0x1a8
> [233311.854143] __handle_domain_irq+0x164/0x1d0
> [233311.854149] gic_handle_irq.22273+0x10c/0x188
> [233311.854156] el1_irq+0xfc/0x1a8
> [233311.854175] lpm_cpuidle_enter+0x25c/0x418 [msm_pm]
> [233311.854185] cpuidle_enter_state+0x1f0/0x764
> [233311.854194] do_idle+0x594/0x6ac
> [233311.854201] cpu_startup_entry+0x7c/0x80
> [233311.854209] secondary_start_kernel+0x170/0x198
>
> Fixes: 50e8725e7c42 ("xhci: Refactor command watchdog and fix split string.")
> Cc: [email protected]
> Signed-off-by: Jimmy Hu <[email protected]>
> ---
> drivers/usb/host/xhci-ring.c | 5 ++++-
> 1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
> index ddc30037f9ce..f5b0e1ce22af 100644
> --- a/drivers/usb/host/xhci-ring.c
> +++ b/drivers/usb/host/xhci-ring.c
> @@ -1169,7 +1169,10 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> struct xhci_virt_ep *ep;
> struct xhci_ring *ring;
>
> - ep = &xhci->devs[slot_id]->eps[ep_index];
> + ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
> + if (!ep)
> + return;
> +
xhci_get_virt_ep also adds check for slot_id == 0. It changes behaviour,
do we really want to skip that slot? Original code went from 0 to
MAX_HC_SLOTS-1.
It seems to be off by one to me. Am I missing anything?
Also, what about passing ep directly to xhci_kill_endpoint_urbs
and do the check in xhci_hc_died? Not even compile tested:
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index ddc30037f9ce..5dac483c562a 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -1162,14 +1162,12 @@ static void xhci_kill_ring_urbs(struct xhci_hcd *xhci, struct xhci_ring *ring)
}
static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
- int slot_id, int ep_index)
+ struct xhci_virt_ep *ep)
{
struct xhci_td *cur_td;
struct xhci_td *tmp;
- struct xhci_virt_ep *ep;
struct xhci_ring *ring;
- ep = &xhci->devs[slot_id]->eps[ep_index];
if ((ep->ep_state & EP_HAS_STREAMS) ||
(ep->ep_state & EP_GETTING_NO_STREAMS)) {
int stream_id;
@@ -1180,18 +1178,12 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
if (!ring)
continue;
- xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
- "Killing URBs for slot ID %u, ep index %u, stream %u",
- slot_id, ep_index, stream_id);
xhci_kill_ring_urbs(xhci, ring);
}
} else {
ring = ep->ring;
if (!ring)
return;
- xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
- "Killing URBs for slot ID %u, ep index %u",
- slot_id, ep_index);
xhci_kill_ring_urbs(xhci, ring);
}
@@ -1217,6 +1209,7 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
void xhci_hc_died(struct xhci_hcd *xhci)
{
int i, j;
+ struct xhci_virt_ep *ep;
if (xhci->xhc_state & XHCI_STATE_DYING)
return;
@@ -1227,11 +1220,14 @@ void xhci_hc_died(struct xhci_hcd *xhci)
xhci_cleanup_command_queue(xhci);
/* return any pending urbs, remove may be waiting for them */
- for (i = 0; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
+ for (i = 0; i < HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
if (!xhci->devs[i])
continue;
- for (j = 0; j < 31; j++)
- xhci_kill_endpoint_urbs(xhci, i, j);
+ for (j = 0; j < EP_CTX_PER_DEV; j++) {
+ ep = &xhci->devs[i]->eps[j];
+ if (ep)
+ xhci_kill_endpoint_urbs(xhci, ep);
+ }
}
/* inform usb core hc died if PCI remove isn't already handling it */
> if ((ep->ep_state & EP_HAS_STREAMS) ||
> (ep->ep_state & EP_GETTING_NO_STREAMS)) {
> int stream_id;
> --
> 2.39.0.314.g84b9a713c41-goog
On 22.12.2022 9.29, Jimmy Hu wrote:
> When the host controller is not responding, all URBs queued to all
> endpoints need to be killed. This can cause a kernel panic if we
> dereference an invalid endpoint.
>
> Fix this by using xhci_get_virt_ep() helper to find the endpoint and
> checking if the endpoint is valid before dereferencing it.
>
> [233311.853271] xhci-hcd xhci-hcd.1.auto: xHCI host controller not responding, assume dead
> [233311.853393] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000e8
>
> [233311.853964] pc : xhci_hc_died+0x10c/0x270
> [233311.853971] lr : xhci_hc_died+0x1ac/0x270
>
> [233311.854077] Call trace:
> [233311.854085] xhci_hc_died+0x10c/0x270
> [233311.854093] xhci_stop_endpoint_command_watchdog+0x100/0x1a4
> [233311.854105] call_timer_fn+0x50/0x2d4
> [233311.854112] expire_timers+0xac/0x2e4
> [233311.854118] run_timer_softirq+0x300/0xabc
> [233311.854127] __do_softirq+0x148/0x528
> [233311.854135] irq_exit+0x194/0x1a8
> [233311.854143] __handle_domain_irq+0x164/0x1d0
> [233311.854149] gic_handle_irq.22273+0x10c/0x188
> [233311.854156] el1_irq+0xfc/0x1a8
> [233311.854175] lpm_cpuidle_enter+0x25c/0x418 [msm_pm]
> [233311.854185] cpuidle_enter_state+0x1f0/0x764
> [233311.854194] do_idle+0x594/0x6ac
> [233311.854201] cpu_startup_entry+0x7c/0x80
> [233311.854209] secondary_start_kernel+0x170/0x198
>
> Fixes: 50e8725e7c42 ("xhci: Refactor command watchdog and fix split string.")
> Cc: [email protected]
> Signed-off-by: Jimmy Hu <[email protected]>
> ---
> drivers/usb/host/xhci-ring.c | 5 ++++-
> 1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
> index ddc30037f9ce..f5b0e1ce22af 100644
> --- a/drivers/usb/host/xhci-ring.c
> +++ b/drivers/usb/host/xhci-ring.c
> @@ -1169,7 +1169,10 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> struct xhci_virt_ep *ep;
> struct xhci_ring *ring;
>
> - ep = &xhci->devs[slot_id]->eps[ep_index];
> + ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
> + if (!ep)
> + return;
> +
This is a good generic change that should be added anyway, but but might not fix
the rootcause. It will reduce the risk of the race drastically.
We do check that xhci-devs[slot_id] exists once before calling xhci_kill_endpoint_urbs()
for the endpoints of that virt_dev in xhci_hc_died().
And xhci_hc_died() should always be calling with lock held (need to doublecheck this is true).
Older kernels used release and re-aquire the lock while giving back urbs for an endpoint,
so older kernels really need this change.
But newer kernels don't release the lock anymore when giving back URBs.
Looks like real issue is that we don't take the lock when we free the virt_dev.
-Mathias
On 22.12.2022 11.01, Ladislav Michl wrote:
> On Thu, Dec 22, 2022 at 07:29:12AM +0000, Jimmy Hu wrote:
>> When the host controller is not responding, all URBs queued to all
>> endpoints need to be killed. This can cause a kernel panic if we
>> dereference an invalid endpoint.
>>
>> Fix this by using xhci_get_virt_ep() helper to find the endpoint and
>> checking if the endpoint is valid before dereferencing it.
>>
>> [233311.853271] xhci-hcd xhci-hcd.1.auto: xHCI host controller not responding, assume dead
>> [233311.853393] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000e8
>>
>> [233311.853964] pc : xhci_hc_died+0x10c/0x270
>> [233311.853971] lr : xhci_hc_died+0x1ac/0x270
>>
>> [233311.854077] Call trace:
>> [233311.854085] xhci_hc_died+0x10c/0x270
>> [233311.854093] xhci_stop_endpoint_command_watchdog+0x100/0x1a4
>> [233311.854105] call_timer_fn+0x50/0x2d4
>> [233311.854112] expire_timers+0xac/0x2e4
>> [233311.854118] run_timer_softirq+0x300/0xabc
>> [233311.854127] __do_softirq+0x148/0x528
>> [233311.854135] irq_exit+0x194/0x1a8
>> [233311.854143] __handle_domain_irq+0x164/0x1d0
>> [233311.854149] gic_handle_irq.22273+0x10c/0x188
>> [233311.854156] el1_irq+0xfc/0x1a8
>> [233311.854175] lpm_cpuidle_enter+0x25c/0x418 [msm_pm]
>> [233311.854185] cpuidle_enter_state+0x1f0/0x764
>> [233311.854194] do_idle+0x594/0x6ac
>> [233311.854201] cpu_startup_entry+0x7c/0x80
>> [233311.854209] secondary_start_kernel+0x170/0x198
>>
>> Fixes: 50e8725e7c42 ("xhci: Refactor command watchdog and fix split string.")
>> Cc: [email protected]
>> Signed-off-by: Jimmy Hu <[email protected]>
>> ---
>> drivers/usb/host/xhci-ring.c | 5 ++++-
>> 1 file changed, 4 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
>> index ddc30037f9ce..f5b0e1ce22af 100644
>> --- a/drivers/usb/host/xhci-ring.c
>> +++ b/drivers/usb/host/xhci-ring.c
>> @@ -1169,7 +1169,10 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
>> struct xhci_virt_ep *ep;
>> struct xhci_ring *ring;
>>
>> - ep = &xhci->devs[slot_id]->eps[ep_index];
>> + ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
>> + if (!ep)
>> + return;
>> +
>
> xhci_get_virt_ep also adds check for slot_id == 0. It changes behaviour,
> do we really want to skip that slot? Original code went from 0 to
> MAX_HC_SLOTS-1.
>
> It seems to be off by one to me. Am I missing anything?
slot_id 0 is always invalid, so this is a good change.
> Also, what about passing ep directly to xhci_kill_endpoint_urbs
> and do the check in xhci_hc_died? Not even compile tested:
passing ep to a function named kill_endpoint_urbs() sound like the
right thing to do, but as a generic change.
I think its a good idea to first do a targeted fix for this null pointer
issue that we can send to stable fist.
>
> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
> index ddc30037f9ce..5dac483c562a 100644
> --- a/drivers/usb/host/xhci-ring.c
> +++ b/drivers/usb/host/xhci-ring.c
> @@ -1162,14 +1162,12 @@ static void xhci_kill_ring_urbs(struct xhci_hcd *xhci, struct xhci_ring *ring)
> }
>
> static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> - int slot_id, int ep_index)
> + struct xhci_virt_ep *ep)
> {
> struct xhci_td *cur_td;
> struct xhci_td *tmp;
> - struct xhci_virt_ep *ep;
> struct xhci_ring *ring;
>
> - ep = &xhci->devs[slot_id]->eps[ep_index];
> if ((ep->ep_state & EP_HAS_STREAMS) ||
> (ep->ep_state & EP_GETTING_NO_STREAMS)) {
> int stream_id;
> @@ -1180,18 +1178,12 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> if (!ring)
> continue;
>
> - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
> - "Killing URBs for slot ID %u, ep index %u, stream %u",
> - slot_id, ep_index, stream_id);
> xhci_kill_ring_urbs(xhci, ring);
> }
> } else {
> ring = ep->ring;
> if (!ring)
> return;
> - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
> - "Killing URBs for slot ID %u, ep index %u",
> - slot_id, ep_index);
> xhci_kill_ring_urbs(xhci, ring);
> }
>
> @@ -1217,6 +1209,7 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> void xhci_hc_died(struct xhci_hcd *xhci)
> {
> int i, j;
> + struct xhci_virt_ep *ep;
>
> if (xhci->xhc_state & XHCI_STATE_DYING)
> return;
> @@ -1227,11 +1220,14 @@ void xhci_hc_died(struct xhci_hcd *xhci)
> xhci_cleanup_command_queue(xhci);
>
> /* return any pending urbs, remove may be waiting for them */
> - for (i = 0; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
> + for (i = 0; i < HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
> if (!xhci->devs[i])
> continue;
> - for (j = 0; j < 31; j++)
> - xhci_kill_endpoint_urbs(xhci, i, j);
> + for (j = 0; j < EP_CTX_PER_DEV; j++) {
> + ep = &xhci->devs[i]->eps[j];
> + if (ep)
> + xhci_kill_endpoint_urbs(xhci, ep);
> + }
This does loop a bit more than the existing code.
With this change its always HCS_MAX_SLOTS * EP_CTX_PER_DEV.
Previously best case was just HCS_MAX_SLOTS.
-Mathias
On Thu, Dec 22, 2022 at 01:08:47PM +0200, Mathias Nyman wrote:
> On 22.12.2022 11.01, Ladislav Michl wrote:
> > On Thu, Dec 22, 2022 at 07:29:12AM +0000, Jimmy Hu wrote:
> > > When the host controller is not responding, all URBs queued to all
> > > endpoints need to be killed. This can cause a kernel panic if we
> > > dereference an invalid endpoint.
> > >
> > > Fix this by using xhci_get_virt_ep() helper to find the endpoint and
> > > checking if the endpoint is valid before dereferencing it.
> > >
> > > [233311.853271] xhci-hcd xhci-hcd.1.auto: xHCI host controller not responding, assume dead
> > > [233311.853393] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000e8
> > >
> > > [233311.853964] pc : xhci_hc_died+0x10c/0x270
> > > [233311.853971] lr : xhci_hc_died+0x1ac/0x270
> > >
> > > [233311.854077] Call trace:
> > > [233311.854085] xhci_hc_died+0x10c/0x270
> > > [233311.854093] xhci_stop_endpoint_command_watchdog+0x100/0x1a4
> > > [233311.854105] call_timer_fn+0x50/0x2d4
> > > [233311.854112] expire_timers+0xac/0x2e4
> > > [233311.854118] run_timer_softirq+0x300/0xabc
> > > [233311.854127] __do_softirq+0x148/0x528
> > > [233311.854135] irq_exit+0x194/0x1a8
> > > [233311.854143] __handle_domain_irq+0x164/0x1d0
> > > [233311.854149] gic_handle_irq.22273+0x10c/0x188
> > > [233311.854156] el1_irq+0xfc/0x1a8
> > > [233311.854175] lpm_cpuidle_enter+0x25c/0x418 [msm_pm]
> > > [233311.854185] cpuidle_enter_state+0x1f0/0x764
> > > [233311.854194] do_idle+0x594/0x6ac
> > > [233311.854201] cpu_startup_entry+0x7c/0x80
> > > [233311.854209] secondary_start_kernel+0x170/0x198
> > >
> > > Fixes: 50e8725e7c42 ("xhci: Refactor command watchdog and fix split string.")
> > > Cc: [email protected]
> > > Signed-off-by: Jimmy Hu <[email protected]>
> > > ---
> > > drivers/usb/host/xhci-ring.c | 5 ++++-
> > > 1 file changed, 4 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
> > > index ddc30037f9ce..f5b0e1ce22af 100644
> > > --- a/drivers/usb/host/xhci-ring.c
> > > +++ b/drivers/usb/host/xhci-ring.c
> > > @@ -1169,7 +1169,10 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> > > struct xhci_virt_ep *ep;
> > > struct xhci_ring *ring;
> > > - ep = &xhci->devs[slot_id]->eps[ep_index];
> > > + ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
> > > + if (!ep)
> > > + return;
> > > +
> >
> > xhci_get_virt_ep also adds check for slot_id == 0. It changes behaviour,
> > do we really want to skip that slot? Original code went from 0 to
> > MAX_HC_SLOTS-1.
> >
> > It seems to be off by one to me. Am I missing anything?
>
> slot_id 0 is always invalid, so this is a good change.
I see. Now reading more carefully:
#define HCS_MAX_SLOTS(p) (((p) >> 0) & 0xff)
#define MAX_HC_SLOTS 256
So the loop should go:
for (i = 1; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++)
> > Also, what about passing ep directly to xhci_kill_endpoint_urbs
> > and do the check in xhci_hc_died? Not even compile tested:
>
> passing ep to a function named kill_endpoint_urbs() sound like the
> right thing to do, but as a generic change.
>
> I think its a good idea to first do a targeted fix for this null pointer
> issue that we can send to stable fist.
Agree. But I still do not understand the root cause. There is a check
for NULL xhci->devs[i] already, so patch does not add much more, except
for test for slot_id == 0. And the eps array is just array of
struct xhci_virt_ep, not a pointers to them, so &xhci->devs[i]->eps[j]
should be always valid pointer. However struct xhci_ring in each eps
is allocated and not protected by any lock here. Is that correct?
> > diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
> > index ddc30037f9ce..5dac483c562a 100644
> > --- a/drivers/usb/host/xhci-ring.c
> > +++ b/drivers/usb/host/xhci-ring.c
> > @@ -1162,14 +1162,12 @@ static void xhci_kill_ring_urbs(struct xhci_hcd *xhci, struct xhci_ring *ring)
> > }
> > static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> > - int slot_id, int ep_index)
> > + struct xhci_virt_ep *ep)
> > {
> > struct xhci_td *cur_td;
> > struct xhci_td *tmp;
> > - struct xhci_virt_ep *ep;
> > struct xhci_ring *ring;
> > - ep = &xhci->devs[slot_id]->eps[ep_index];
> > if ((ep->ep_state & EP_HAS_STREAMS) ||
> > (ep->ep_state & EP_GETTING_NO_STREAMS)) {
> > int stream_id;
> > @@ -1180,18 +1178,12 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> > if (!ring)
> > continue;
> > - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
> > - "Killing URBs for slot ID %u, ep index %u, stream %u",
> > - slot_id, ep_index, stream_id);
> > xhci_kill_ring_urbs(xhci, ring);
> > }
> > } else {
> > ring = ep->ring;
> > if (!ring)
> > return;
> > - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
> > - "Killing URBs for slot ID %u, ep index %u",
> > - slot_id, ep_index);
> > xhci_kill_ring_urbs(xhci, ring);
> > }
> > @@ -1217,6 +1209,7 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> > void xhci_hc_died(struct xhci_hcd *xhci)
> > {
> > int i, j;
> > + struct xhci_virt_ep *ep;
> > if (xhci->xhc_state & XHCI_STATE_DYING)
> > return;
> > @@ -1227,11 +1220,14 @@ void xhci_hc_died(struct xhci_hcd *xhci)
> > xhci_cleanup_command_queue(xhci);
> > /* return any pending urbs, remove may be waiting for them */
> > - for (i = 0; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
> > + for (i = 0; i < HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
> > if (!xhci->devs[i])
> > continue;
> > - for (j = 0; j < 31; j++)
> > - xhci_kill_endpoint_urbs(xhci, i, j);
> > + for (j = 0; j < EP_CTX_PER_DEV; j++) {
> > + ep = &xhci->devs[i]->eps[j];
> > + if (ep)
> > + xhci_kill_endpoint_urbs(xhci, ep);
> > + }
>
> This does loop a bit more than the existing code.
> With this change its always HCS_MAX_SLOTS * EP_CTX_PER_DEV.
> Previously best case was just HCS_MAX_SLOTS.
No, that's just the same:
#define EP_CTX_PER_DEV 31
to make clear where that 31 come from. Taken into acount your comment
above, the loop should be
for (i = 1; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++) ...
for (j = 0; j < EP_CTX_PER_DEV; j++) ...
ladis
On 22.12.2022 13.52, Ladislav Michl wrote:
> On Thu, Dec 22, 2022 at 01:08:47PM +0200, Mathias Nyman wrote:
>> On 22.12.2022 11.01, Ladislav Michl wrote:
>>> On Thu, Dec 22, 2022 at 07:29:12AM +0000, Jimmy Hu wrote:
>>>> When the host controller is not responding, all URBs queued to all
>>>> endpoints need to be killed. This can cause a kernel panic if we
>>>> dereference an invalid endpoint.
>>>>
>>>> Fix this by using xhci_get_virt_ep() helper to find the endpoint and
>>>> checking if the endpoint is valid before dereferencing it.
>>>>
>>>> [233311.853271] xhci-hcd xhci-hcd.1.auto: xHCI host controller not responding, assume dead
>>>> [233311.853393] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000e8
>>>>
>>>> [233311.853964] pc : xhci_hc_died+0x10c/0x270
>>>> [233311.853971] lr : xhci_hc_died+0x1ac/0x270
>>>>
>>>> [233311.854077] Call trace:
>>>> [233311.854085] xhci_hc_died+0x10c/0x270
>>>> [233311.854093] xhci_stop_endpoint_command_watchdog+0x100/0x1a4
>>>> [233311.854105] call_timer_fn+0x50/0x2d4
>>>> [233311.854112] expire_timers+0xac/0x2e4
>>>> [233311.854118] run_timer_softirq+0x300/0xabc
>>>> [233311.854127] __do_softirq+0x148/0x528
>>>> [233311.854135] irq_exit+0x194/0x1a8
>>>> [233311.854143] __handle_domain_irq+0x164/0x1d0
>>>> [233311.854149] gic_handle_irq.22273+0x10c/0x188
>>>> [233311.854156] el1_irq+0xfc/0x1a8
>>>> [233311.854175] lpm_cpuidle_enter+0x25c/0x418 [msm_pm]
>>>> [233311.854185] cpuidle_enter_state+0x1f0/0x764
>>>> [233311.854194] do_idle+0x594/0x6ac
>>>> [233311.854201] cpu_startup_entry+0x7c/0x80
>>>> [233311.854209] secondary_start_kernel+0x170/0x198
>>>>
>>>> Fixes: 50e8725e7c42 ("xhci: Refactor command watchdog and fix split string.")
>>>> Cc: [email protected]
>>>> Signed-off-by: Jimmy Hu <[email protected]>
>>>> ---
>>>> drivers/usb/host/xhci-ring.c | 5 ++++-
>>>> 1 file changed, 4 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
>>>> index ddc30037f9ce..f5b0e1ce22af 100644
>>>> --- a/drivers/usb/host/xhci-ring.c
>>>> +++ b/drivers/usb/host/xhci-ring.c
>>>> @@ -1169,7 +1169,10 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
>>>> struct xhci_virt_ep *ep;
>>>> struct xhci_ring *ring;
>>>> - ep = &xhci->devs[slot_id]->eps[ep_index];
>>>> + ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
>>>> + if (!ep)
>>>> + return;
>>>> +
>>>
>>> xhci_get_virt_ep also adds check for slot_id == 0. It changes behaviour,
>>> do we really want to skip that slot? Original code went from 0 to
>>> MAX_HC_SLOTS-1.
>>>
>>> It seems to be off by one to me. Am I missing anything?
>>
>> slot_id 0 is always invalid, so this is a good change.
>
> I see. Now reading more carefully:
> #define HCS_MAX_SLOTS(p) (((p) >> 0) & 0xff)
> #define MAX_HC_SLOTS 256
> So the loop should go:
> for (i = 1; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++)
yes
>
>>> Also, what about passing ep directly to xhci_kill_endpoint_urbs
>>> and do the check in xhci_hc_died? Not even compile tested:
>>
>> passing ep to a function named kill_endpoint_urbs() sound like the
>> right thing to do, but as a generic change.
>>
>> I think its a good idea to first do a targeted fix for this null pointer
>> issue that we can send to stable fist.
>
> Agree. But I still do not understand the root cause. There is a check
> for NULL xhci->devs[i] already, so patch does not add much more, except
> for test for slot_id == 0. And the eps array is just array of
> struct xhci_virt_ep, not a pointers to them, so &xhci->devs[i]->eps[j]
> should be always valid pointer. However struct xhci_ring in each eps
> is allocated and not protected by any lock here. Is that correct?
I think root cause is that freeing xhci->devs[i] and including rings isn't
protected by the lock, this happens in xhci_free_virt_device() called by
xhci_free_dev(), which in turn may be called by usbcore at any time
So xhci->devs[i] might just suddenly disappear
Patch just checks more often if xhci->devs[i] is valid, between every endpoint.
So the race between xhci_free_virt_device() and xhci_kill_endpoint_urbs()
doesn't trigger null pointer deref as easily.
>
>>> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
>>> index ddc30037f9ce..5dac483c562a 100644
>>> --- a/drivers/usb/host/xhci-ring.c
>>> +++ b/drivers/usb/host/xhci-ring.c
>>> @@ -1162,14 +1162,12 @@ static void xhci_kill_ring_urbs(struct xhci_hcd *xhci, struct xhci_ring *ring)
>>> }
>>> static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
>>> - int slot_id, int ep_index)
>>> + struct xhci_virt_ep *ep)
>>> {
>>> struct xhci_td *cur_td;
>>> struct xhci_td *tmp;
>>> - struct xhci_virt_ep *ep;
>>> struct xhci_ring *ring;
>>> - ep = &xhci->devs[slot_id]->eps[ep_index];
>>> if ((ep->ep_state & EP_HAS_STREAMS) ||
>>> (ep->ep_state & EP_GETTING_NO_STREAMS)) {
>>> int stream_id;
>>> @@ -1180,18 +1178,12 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
>>> if (!ring)
>>> continue;
>>> - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
>>> - "Killing URBs for slot ID %u, ep index %u, stream %u",
>>> - slot_id, ep_index, stream_id);
>>> xhci_kill_ring_urbs(xhci, ring);
>>> }
>>> } else {
>>> ring = ep->ring;
>>> if (!ring)
>>> return;
>>> - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
>>> - "Killing URBs for slot ID %u, ep index %u",
>>> - slot_id, ep_index);
>>> xhci_kill_ring_urbs(xhci, ring);
>>> }
>>> @@ -1217,6 +1209,7 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
>>> void xhci_hc_died(struct xhci_hcd *xhci)
>>> {
>>> int i, j;
>>> + struct xhci_virt_ep *ep;
>>> if (xhci->xhc_state & XHCI_STATE_DYING)
>>> return;
>>> @@ -1227,11 +1220,14 @@ void xhci_hc_died(struct xhci_hcd *xhci)
>>> xhci_cleanup_command_queue(xhci);
>>> /* return any pending urbs, remove may be waiting for them */
>>> - for (i = 0; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
>>> + for (i = 0; i < HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
>>> if (!xhci->devs[i])
>>> continue;
>>> - for (j = 0; j < 31; j++)
>>> - xhci_kill_endpoint_urbs(xhci, i, j);
>>> + for (j = 0; j < EP_CTX_PER_DEV; j++) {
>>> + ep = &xhci->devs[i]->eps[j];
>>> + if (ep)
>>> + xhci_kill_endpoint_urbs(xhci, ep);
>>> + }
>>
>> This does loop a bit more than the existing code.
>> With this change its always HCS_MAX_SLOTS * EP_CTX_PER_DEV.
>> Previously best case was just HCS_MAX_SLOTS.
>
> No, that's just the same:
you're right, incorrectly read that your patch deleted the
"if (!xhci->devs[i]) continue;" lines.
Thanks
-Mathias
On 22.12.2022 15.18, Mathias Nyman wrote:
> On 22.12.2022 13.52, Ladislav Michl wrote:
>> On Thu, Dec 22, 2022 at 01:08:47PM +0200, Mathias Nyman wrote:
>>> On 22.12.2022 11.01, Ladislav Michl wrote:
>>>> On Thu, Dec 22, 2022 at 07:29:12AM +0000, Jimmy Hu wrote:
>>>>> When the host controller is not responding, all URBs queued to all
>>>>> endpoints need to be killed. This can cause a kernel panic if we
>>>>> dereference an invalid endpoint.
>>>>>
>>>>> Fix this by using xhci_get_virt_ep() helper to find the endpoint and
>>>>> checking if the endpoint is valid before dereferencing it.
>>>>>
>>>>> [233311.853271] xhci-hcd xhci-hcd.1.auto: xHCI host controller not responding, assume dead
>>>>> [233311.853393] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000e8
>>>>>
>>>>> [233311.853964] pc : xhci_hc_died+0x10c/0x270
>>>>> [233311.853971] lr : xhci_hc_died+0x1ac/0x270
>>>>>
>>>>> [233311.854077] Call trace:
>>>>> [233311.854085] xhci_hc_died+0x10c/0x270
>>>>> [233311.854093] xhci_stop_endpoint_command_watchdog+0x100/0x1a4
>>>>> [233311.854105] call_timer_fn+0x50/0x2d4
>>>>> [233311.854112] expire_timers+0xac/0x2e4
>>>>> [233311.854118] run_timer_softirq+0x300/0xabc
>>>>> [233311.854127] __do_softirq+0x148/0x528
>>>>> [233311.854135] irq_exit+0x194/0x1a8
>>>>> [233311.854143] __handle_domain_irq+0x164/0x1d0
>>>>> [233311.854149] gic_handle_irq.22273+0x10c/0x188
>>>>> [233311.854156] el1_irq+0xfc/0x1a8
>>>>> [233311.854175] lpm_cpuidle_enter+0x25c/0x418 [msm_pm]
>>>>> [233311.854185] cpuidle_enter_state+0x1f0/0x764
>>>>> [233311.854194] do_idle+0x594/0x6ac
>>>>> [233311.854201] cpu_startup_entry+0x7c/0x80
>>>>> [233311.854209] secondary_start_kernel+0x170/0x198
>>>>>
>>>>> Fixes: 50e8725e7c42 ("xhci: Refactor command watchdog and fix split string.")
>>>>> Cc: [email protected]
>>>>> Signed-off-by: Jimmy Hu <[email protected]>
>>>>> ---
>>>>> drivers/usb/host/xhci-ring.c | 5 ++++-
>>>>> 1 file changed, 4 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
>>>>> index ddc30037f9ce..f5b0e1ce22af 100644
>>>>> --- a/drivers/usb/host/xhci-ring.c
>>>>> +++ b/drivers/usb/host/xhci-ring.c
>>>>> @@ -1169,7 +1169,10 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
>>>>> struct xhci_virt_ep *ep;
>>>>> struct xhci_ring *ring;
>>>>> - ep = &xhci->devs[slot_id]->eps[ep_index];
>>>>> + ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
>>>>> + if (!ep)
>>>>> + return;
>>>>> +
>>>>
>>>> xhci_get_virt_ep also adds check for slot_id == 0. It changes behaviour,
>>>> do we really want to skip that slot? Original code went from 0 to
>>>> MAX_HC_SLOTS-1.
>>>>
>>>> It seems to be off by one to me. Am I missing anything?
>>>
>>> slot_id 0 is always invalid, so this is a good change.
>>
>> I see. Now reading more carefully:
>> #define HCS_MAX_SLOTS(p) (((p) >> 0) & 0xff)
>> #define MAX_HC_SLOTS 256
>> So the loop should go:
>> for (i = 1; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++)
>
> yes
>
>>
>>>> Also, what about passing ep directly to xhci_kill_endpoint_urbs
>>>> and do the check in xhci_hc_died? Not even compile tested:
>>>
>>> passing ep to a function named kill_endpoint_urbs() sound like the
>>> right thing to do, but as a generic change.
>>>
>>> I think its a good idea to first do a targeted fix for this null pointer
>>> issue that we can send to stable fist.
>>
>> Agree. But I still do not understand the root cause. There is a check
>> for NULL xhci->devs[i] already, so patch does not add much more, except
>> for test for slot_id == 0. And the eps array is just array of
>> struct xhci_virt_ep, not a pointers to them, so &xhci->devs[i]->eps[j]
>> should be always valid pointer. However struct xhci_ring in each eps
>> is allocated and not protected by any lock here. Is that correct?
>
> I think root cause is that freeing xhci->devs[i] and including rings isn't
> protected by the lock, this happens in xhci_free_virt_device() called by
> xhci_free_dev(), which in turn may be called by usbcore at any time
>
> So xhci->devs[i] might just suddenly disappear
>
> Patch just checks more often if xhci->devs[i] is valid, between every endpoint.
> So the race between xhci_free_virt_device() and xhci_kill_endpoint_urbs()
> doesn't trigger null pointer deref as easily.
Jimmy Hu,
Any chance you could try if the change below works for you instead of
using xhci_get_virt_ep().
I don't have a easy way to trigger the issue-
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 79d7931c048a..50b41213e827 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -3974,6 +3974,7 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
struct xhci_hcd *xhci = hcd_to_xhci(hcd);
struct xhci_virt_device *virt_dev;
struct xhci_slot_ctx *slot_ctx;
+ unsigned long flags;
int i, ret;
/*
@@ -4000,7 +4001,11 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
virt_dev->eps[i].ep_state &= ~EP_STOP_CMD_PENDING;
virt_dev->udev = NULL;
xhci_disable_slot(xhci, udev->slot_id);
+
+ spin_lock_irqsave(&xhci->lock, flags);
xhci_free_virt_device(xhci, udev->slot_id);
+ spin_unlock_irqrestore(&xhci->lock, flags);
+
}
Thanks
Mathias
On Thu, Dec 29, 2022 at 7:49 PM Mathias Nyman
<[email protected]> wrote:
>
> On 22.12.2022 15.18, Mathias Nyman wrote:
> > On 22.12.2022 13.52, Ladislav Michl wrote:
> >> On Thu, Dec 22, 2022 at 01:08:47PM +0200, Mathias Nyman wrote:
> >>> On 22.12.2022 11.01, Ladislav Michl wrote:
> >>>> On Thu, Dec 22, 2022 at 07:29:12AM +0000, Jimmy Hu wrote:
> >>>>> When the host controller is not responding, all URBs queued to all
> >>>>> endpoints need to be killed. This can cause a kernel panic if we
> >>>>> dereference an invalid endpoint.
> >>>>>
> >>>>> Fix this by using xhci_get_virt_ep() helper to find the endpoint and
> >>>>> checking if the endpoint is valid before dereferencing it.
> >>>>>
> >>>>> [233311.853271] xhci-hcd xhci-hcd.1.auto: xHCI host controller not responding, assume dead
> >>>>> [233311.853393] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000e8
> >>>>>
> >>>>> [233311.853964] pc : xhci_hc_died+0x10c/0x270
> >>>>> [233311.853971] lr : xhci_hc_died+0x1ac/0x270
> >>>>>
> >>>>> [233311.854077] Call trace:
> >>>>> [233311.854085] xhci_hc_died+0x10c/0x270
> >>>>> [233311.854093] xhci_stop_endpoint_command_watchdog+0x100/0x1a4
> >>>>> [233311.854105] call_timer_fn+0x50/0x2d4
> >>>>> [233311.854112] expire_timers+0xac/0x2e4
> >>>>> [233311.854118] run_timer_softirq+0x300/0xabc
> >>>>> [233311.854127] __do_softirq+0x148/0x528
> >>>>> [233311.854135] irq_exit+0x194/0x1a8
> >>>>> [233311.854143] __handle_domain_irq+0x164/0x1d0
> >>>>> [233311.854149] gic_handle_irq.22273+0x10c/0x188
> >>>>> [233311.854156] el1_irq+0xfc/0x1a8
> >>>>> [233311.854175] lpm_cpuidle_enter+0x25c/0x418 [msm_pm]
> >>>>> [233311.854185] cpuidle_enter_state+0x1f0/0x764
> >>>>> [233311.854194] do_idle+0x594/0x6ac
> >>>>> [233311.854201] cpu_startup_entry+0x7c/0x80
> >>>>> [233311.854209] secondary_start_kernel+0x170/0x198
> >>>>>
> >>>>> Fixes: 50e8725e7c42 ("xhci: Refactor command watchdog and fix split string.")
> >>>>> Cc: [email protected]
> >>>>> Signed-off-by: Jimmy Hu <[email protected]>
> >>>>> ---
> >>>>> drivers/usb/host/xhci-ring.c | 5 ++++-
> >>>>> 1 file changed, 4 insertions(+), 1 deletion(-)
> >>>>>
> >>>>> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
> >>>>> index ddc30037f9ce..f5b0e1ce22af 100644
> >>>>> --- a/drivers/usb/host/xhci-ring.c
> >>>>> +++ b/drivers/usb/host/xhci-ring.c
> >>>>> @@ -1169,7 +1169,10 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
> >>>>> struct xhci_virt_ep *ep;
> >>>>> struct xhci_ring *ring;
> >>>>> - ep = &xhci->devs[slot_id]->eps[ep_index];
> >>>>> + ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
> >>>>> + if (!ep)
> >>>>> + return;
> >>>>> +
> >>>>
> >>>> xhci_get_virt_ep also adds check for slot_id == 0. It changes behaviour,
> >>>> do we really want to skip that slot? Original code went from 0 to
> >>>> MAX_HC_SLOTS-1.
> >>>>
> >>>> It seems to be off by one to me. Am I missing anything?
> >>>
> >>> slot_id 0 is always invalid, so this is a good change.
> >>
> >> I see. Now reading more carefully:
> >> #define HCS_MAX_SLOTS(p) (((p) >> 0) & 0xff)
> >> #define MAX_HC_SLOTS 256
> >> So the loop should go:
> >> for (i = 1; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++)
> >
> > yes
> >
> >>
> >>>> Also, what about passing ep directly to xhci_kill_endpoint_urbs
> >>>> and do the check in xhci_hc_died? Not even compile tested:
> >>>
> >>> passing ep to a function named kill_endpoint_urbs() sound like the
> >>> right thing to do, but as a generic change.
> >>>
> >>> I think its a good idea to first do a targeted fix for this null pointer
> >>> issue that we can send to stable fist.
> >>
> >> Agree. But I still do not understand the root cause. There is a check
> >> for NULL xhci->devs[i] already, so patch does not add much more, except
> >> for test for slot_id == 0. And the eps array is just array of
> >> struct xhci_virt_ep, not a pointers to them, so &xhci->devs[i]->eps[j]
> >> should be always valid pointer. However struct xhci_ring in each eps
> >> is allocated and not protected by any lock here. Is that correct?
> >
> > I think root cause is that freeing xhci->devs[i] and including rings isn't
> > protected by the lock, this happens in xhci_free_virt_device() called by
> > xhci_free_dev(), which in turn may be called by usbcore at any time
> >
> > So xhci->devs[i] might just suddenly disappear
> >
> > Patch just checks more often if xhci->devs[i] is valid, between every endpoint.
> > So the race between xhci_free_virt_device() and xhci_kill_endpoint_urbs()
> > doesn't trigger null pointer deref as easily.
>
> Jimmy Hu,
>
> Any chance you could try if the change below works for you instead of
> using xhci_get_virt_ep().
> I don't have a easy way to trigger the issue-
>
>
> diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
> index 79d7931c048a..50b41213e827 100644
> --- a/drivers/usb/host/xhci.c
> +++ b/drivers/usb/host/xhci.c
> @@ -3974,6 +3974,7 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
> struct xhci_hcd *xhci = hcd_to_xhci(hcd);
> struct xhci_virt_device *virt_dev;
> struct xhci_slot_ctx *slot_ctx;
> + unsigned long flags;
> int i, ret;
>
> /*
> @@ -4000,7 +4001,11 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
> virt_dev->eps[i].ep_state &= ~EP_STOP_CMD_PENDING;
> virt_dev->udev = NULL;
> xhci_disable_slot(xhci, udev->slot_id);
> +
> + spin_lock_irqsave(&xhci->lock, flags);
> xhci_free_virt_device(xhci, udev->slot_id);
> + spin_unlock_irqrestore(&xhci->lock, flags);
> +
> }
>
>
> Thanks
> Mathias
Mathias,
Sorry. I also don't have an easy way to trigger this issue.
Thanks,
Jimmy