2022-02-16 07:29:10

by Wesley Cheng

[permalink] [raw]
Subject: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

While running the pullup disable sequence, if there are pending SETUP
transfers stored in the controller, then the ENDTRANSFER commands on non
control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
to drain potentially cached SETUP packets, SW needs to issue a
STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
ENDTRANSFER, the command should succeed. Else, if the endpoints are not
properly stopped, the controller halt sequence will fail as well.

One limitation is that the current logic will drop the SETUP data
being received (ie dropping the SETUP packet), however, it should be
acceptable in the pullup disable case, as the device is eventually
going to disconnect from the host.

Signed-off-by: Wesley Cheng <[email protected]>
---
drivers/usb/dwc3/core.h | 7 +++++++
drivers/usb/dwc3/ep0.c | 20 +++++++++++--------
drivers/usb/dwc3/gadget.c | 42 ++++++++++++++++++++++++++++++++++-----
3 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h
index eb9c1efced05..a411682e7f44 100644
--- a/drivers/usb/dwc3/core.h
+++ b/drivers/usb/dwc3/core.h
@@ -1551,6 +1551,8 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
int dwc3_send_gadget_generic_command(struct dwc3 *dwc, unsigned int cmd,
u32 param);
void dwc3_gadget_clear_tx_fifos(struct dwc3 *dwc);
+void dwc3_ep0_stall_and_restart(struct dwc3 *dwc);
+void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep);
#else
static inline int dwc3_gadget_init(struct dwc3 *dwc)
{ return 0; }
@@ -1572,6 +1574,11 @@ static inline int dwc3_send_gadget_generic_command(struct dwc3 *dwc,
{ return 0; }
static inline void dwc3_gadget_clear_tx_fifos(struct dwc3 *dwc)
{ }
+static inline void dwc3_ep0_stall_and_restart(struct dwc3 *dwc)
+{ }
+static inline void dwc3_ep0_end_control_data(struct dwc3 *dwc,
+ struct dwc3_ep *dep)
+{ }
#endif

#if IS_ENABLED(CONFIG_USB_DWC3_DUAL_ROLE)
diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c
index 658739410992..a2cc94c25dcf 100644
--- a/drivers/usb/dwc3/ep0.c
+++ b/drivers/usb/dwc3/ep0.c
@@ -197,7 +197,7 @@ int dwc3_gadget_ep0_queue(struct usb_ep *ep, struct usb_request *request,
int ret;

spin_lock_irqsave(&dwc->lock, flags);
- if (!dep->endpoint.desc || !dwc->pullups_connected) {
+ if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected) {
dev_err(dwc->dev, "%s: can't queue to disabled endpoint\n",
dep->name);
ret = -ESHUTDOWN;
@@ -218,19 +218,21 @@ int dwc3_gadget_ep0_queue(struct usb_ep *ep, struct usb_request *request,
return ret;
}

-static void dwc3_ep0_stall_and_restart(struct dwc3 *dwc)
+void dwc3_ep0_stall_and_restart(struct dwc3 *dwc)
{
struct dwc3_ep *dep;

/* reinitialize physical ep1 */
dep = dwc->eps[1];
dep->flags = DWC3_EP_ENABLED;
+ dep->trb_enqueue = 0;

/* stall is always issued on EP0 */
dep = dwc->eps[0];
__dwc3_gadget_ep_set_halt(dep, 1, false);
dep->flags = DWC3_EP_ENABLED;
dwc->delayed_status = false;
+ dep->trb_enqueue = 0;

if (!list_empty(&dep->pending_list)) {
struct dwc3_request *req;
@@ -240,7 +242,9 @@ static void dwc3_ep0_stall_and_restart(struct dwc3 *dwc)
}

dwc->ep0state = EP0_SETUP_PHASE;
- dwc3_ep0_out_start(dwc);
+ complete(&dwc->ep0_in_setup);
+ if (dwc->softconnect)
+ dwc3_ep0_out_start(dwc);
}

int __dwc3_gadget_ep0_set_halt(struct usb_ep *ep, int value)
@@ -272,8 +276,6 @@ void dwc3_ep0_out_start(struct dwc3 *dwc)
struct dwc3_ep *dep;
int ret;

- complete(&dwc->ep0_in_setup);
-
dep = dwc->eps[0];
dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 8,
DWC3_TRBCTL_CONTROL_SETUP, false);
@@ -922,7 +924,9 @@ static void dwc3_ep0_complete_status(struct dwc3 *dwc,
dwc->setup_packet_pending = true;

dwc->ep0state = EP0_SETUP_PHASE;
- dwc3_ep0_out_start(dwc);
+ complete(&dwc->ep0_in_setup);
+ if (dwc->softconnect)
+ dwc3_ep0_out_start(dwc);
}

static void dwc3_ep0_xfer_complete(struct dwc3 *dwc,
@@ -1073,7 +1077,7 @@ void dwc3_ep0_send_delayed_status(struct dwc3 *dwc)
__dwc3_ep0_do_control_status(dwc, dwc->eps[direction]);
}

-static void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep)
+void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep)
{
struct dwc3_gadget_ep_cmd_params params;
u32 cmd;
@@ -1083,7 +1087,7 @@ static void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep)
return;

cmd = DWC3_DEPCMD_ENDTRANSFER;
- cmd |= DWC3_DEPCMD_CMDIOC;
+ cmd |= dwc->connected ? DWC3_DEPCMD_CMDIOC : DWC3_DEPCMD_HIPRI_FORCERM;
cmd |= DWC3_DEPCMD_PARAM(dep->resource_index);
memset(&params, 0, sizeof(params));
ret = dwc3_send_gadget_ep_cmd(dep, cmd, &params);
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 183b90923f51..f6801199440c 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -885,12 +885,13 @@ static int __dwc3_gadget_ep_enable(struct dwc3_ep *dep, unsigned int action)
reg |= DWC3_DALEPENA_EP(dep->number);
dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);

- if (usb_endpoint_xfer_control(desc))
- goto out;
-
/* Initialize the TRB ring */
dep->trb_dequeue = 0;
dep->trb_enqueue = 0;
+
+ if (usb_endpoint_xfer_control(desc))
+ goto out;
+
memset(dep->trb_pool, 0,
sizeof(struct dwc3_trb) * DWC3_TRB_NUM);

@@ -2476,7 +2477,8 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
* Per databook, when we want to stop the gadget, if a control transfer
* is still in process, complete it and get the core into setup phase.
*/
- if (!is_on && dwc->ep0state != EP0_SETUP_PHASE) {
+ if ((!is_on && (dwc->ep0state != EP0_SETUP_PHASE ||
+ dwc->ep0_next_event != DWC3_EP0_COMPLETE))) {
reinit_completion(&dwc->ep0_in_setup);

ret = wait_for_completion_timeout(&dwc->ep0_in_setup,
@@ -2519,6 +2521,17 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
u32 count;

dwc->connected = false;
+
+ /*
+ * Ensure no pending data/setup stages, and disable ep0 to
+ * block further EP0 transactions before stopping pending
+ * transfers.
+ */
+ dwc3_ep0_end_control_data(dwc, dwc->eps[1]);
+ dwc3_ep0_stall_and_restart(dwc);
+ __dwc3_gadget_ep_disable(dwc->eps[0]);
+ __dwc3_gadget_ep_disable(dwc->eps[1]);
+
/*
* In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
* Section 4.1.8 Table 4-7, it states that for a device-initiated
@@ -3600,8 +3613,10 @@ static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force,
bool interrupt)
{
struct dwc3_gadget_ep_cmd_params params;
+ struct dwc3 *dwc = dep->dwc;
u32 cmd;
int ret;
+ int retries = 1;

if (!(dep->flags & DWC3_EP_TRANSFER_STARTED) ||
(dep->flags & DWC3_EP_END_TRANSFER_PENDING))
@@ -3633,7 +3648,7 @@ static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force,
*
* This mode is NOT available on the DWC_usb31 IP.
*/
-
+retry:
cmd = DWC3_DEPCMD_ENDTRANSFER;
cmd |= force ? DWC3_DEPCMD_HIPRI_FORCERM : 0;
cmd |= interrupt ? DWC3_DEPCMD_CMDIOC : 0;
@@ -3641,6 +3656,23 @@ static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force,
memset(&params, 0, sizeof(params));
ret = dwc3_send_gadget_ep_cmd(dep, cmd, &params);
WARN_ON_ONCE(ret);
+ if (ret == -ETIMEDOUT) {
+ if (!dwc->connected) {
+ /*
+ * While the controller is in an active setup/control
+ * transfer, the HW is unable to service other eps
+ * potentially leading to an endxfer command timeout.
+ * It was recommended to ensure that there are no
+ * pending/cached setup packets stored in internal
+ * memory. Only way to achieve this is to issue another
+ * start transfer, and retry.
+ */
+ if (retries--) {
+ dwc3_ep0_out_start(dwc);
+ goto retry;
+ }
+ }
+ }
dep->resource_index = 0;

if (!interrupt)


2022-03-23 11:10:47

by Thinh Nguyen

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

Hi Wesley,

Wesley Cheng wrote:
> Hi Thinh,
>
> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>> While running the pullup disable sequence, if there are pending SETUP
>> transfers stored in the controller, then the ENDTRANSFER commands on non
>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>> to drain potentially cached SETUP packets, SW needs to issue a
>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>> properly stopped, the controller halt sequence will fail as well.
>>
>> One limitation is that the current logic will drop the SETUP data
>> being received (ie dropping the SETUP packet), however, it should be
>> acceptable in the pullup disable case, as the device is eventually
>> going to disconnect from the host.
>>
>> Signed-off-by: Wesley Cheng <[email protected]>
>> ---
> Just wondering if you had any inputs for this particular change? I
> think on the dequeue path discussion you had some concerns with parts of
> this change? I think the difficult part for the pullup disable path is
> that we have this API running w/ interrupts disabled, so the EP0 state
> won't be able to advance compared to the dequeue case.

This doesn't sound right. The pullup disable (or device initiated
disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
proceeding, which it does.

>
> Ideally, if there is a way to ensure that we avoid reading further setup
> packets, that would be nice, but from our discussions with Synopsys,
> this was not possible. (controller is always armed and ready to ACK
> setup tokens)
>
> I did evaluate keeping IRQs enabled and periodically releasing/attaining
> the lock between the stop active xfer calls, but that opened another can
> of worms. If you think this is the approach we should take, I can take
> a look at this implementation further.
>

This patch doesn't look right to me. The change I suggested before
should address this (I believe Greg already picked it up). What other
problem do you see, I'm not clear what's the problem here. One potential
problem that I can see is that currently dwc3 driver doesn't wait for
active endpoints to complete/end before clearing the run_stop bit on
device initiated disconnect, but I'm not sure if that's what you're seeing.

Please help clarify further. If there's trace points log, that'd help.

Thanks,
Thinh

2022-03-23 17:37:15

by Wesley Cheng

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

Hi Thinh,

On 2/15/2022 4:08 PM, Wesley Cheng wrote:
> While running the pullup disable sequence, if there are pending SETUP
> transfers stored in the controller, then the ENDTRANSFER commands on non
> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
> to drain potentially cached SETUP packets, SW needs to issue a
> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
> properly stopped, the controller halt sequence will fail as well.
>
> One limitation is that the current logic will drop the SETUP data
> being received (ie dropping the SETUP packet), however, it should be
> acceptable in the pullup disable case, as the device is eventually
> going to disconnect from the host.
>
> Signed-off-by: Wesley Cheng <[email protected]>
> ---
Just wondering if you had any inputs for this particular change? I
think on the dequeue path discussion you had some concerns with parts of
this change? I think the difficult part for the pullup disable path is
that we have this API running w/ interrupts disabled, so the EP0 state
won't be able to advance compared to the dequeue case.

Ideally, if there is a way to ensure that we avoid reading further setup
packets, that would be nice, but from our discussions with Synopsys,
this was not possible. (controller is always armed and ready to ACK
setup tokens)

I did evaluate keeping IRQs enabled and periodically releasing/attaining
the lock between the stop active xfer calls, but that opened another can
of worms. If you think this is the approach we should take, I can take
a look at this implementation further.

Thanks
Wesley Cheng

> drivers/usb/dwc3/core.h | 7 +++++++
> drivers/usb/dwc3/ep0.c | 20 +++++++++++--------
> drivers/usb/dwc3/gadget.c | 42 ++++++++++++++++++++++++++++++++++-----
> 3 files changed, 56 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h
> index eb9c1efced05..a411682e7f44 100644
> --- a/drivers/usb/dwc3/core.h
> +++ b/drivers/usb/dwc3/core.h
> @@ -1551,6 +1551,8 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
> int dwc3_send_gadget_generic_command(struct dwc3 *dwc, unsigned int cmd,
> u32 param);
> void dwc3_gadget_clear_tx_fifos(struct dwc3 *dwc);
> +void dwc3_ep0_stall_and_restart(struct dwc3 *dwc);
> +void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep);
> #else
> static inline int dwc3_gadget_init(struct dwc3 *dwc)
> { return 0; }
> @@ -1572,6 +1574,11 @@ static inline int dwc3_send_gadget_generic_command(struct dwc3 *dwc,
> { return 0; }
> static inline void dwc3_gadget_clear_tx_fifos(struct dwc3 *dwc)
> { }
> +static inline void dwc3_ep0_stall_and_restart(struct dwc3 *dwc)
> +{ }
> +static inline void dwc3_ep0_end_control_data(struct dwc3 *dwc,
> + struct dwc3_ep *dep)
> +{ }
> #endif
>
> #if IS_ENABLED(CONFIG_USB_DWC3_DUAL_ROLE)
> diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c
> index 658739410992..a2cc94c25dcf 100644
> --- a/drivers/usb/dwc3/ep0.c
> +++ b/drivers/usb/dwc3/ep0.c
> @@ -197,7 +197,7 @@ int dwc3_gadget_ep0_queue(struct usb_ep *ep, struct usb_request *request,
> int ret;
>
> spin_lock_irqsave(&dwc->lock, flags);
> - if (!dep->endpoint.desc || !dwc->pullups_connected) {
> + if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected) {
> dev_err(dwc->dev, "%s: can't queue to disabled endpoint\n",
> dep->name);
> ret = -ESHUTDOWN;
> @@ -218,19 +218,21 @@ int dwc3_gadget_ep0_queue(struct usb_ep *ep, struct usb_request *request,
> return ret;
> }
>
> -static void dwc3_ep0_stall_and_restart(struct dwc3 *dwc)
> +void dwc3_ep0_stall_and_restart(struct dwc3 *dwc)
> {
> struct dwc3_ep *dep;
>
> /* reinitialize physical ep1 */
> dep = dwc->eps[1];
> dep->flags = DWC3_EP_ENABLED;
> + dep->trb_enqueue = 0;
>
> /* stall is always issued on EP0 */
> dep = dwc->eps[0];
> __dwc3_gadget_ep_set_halt(dep, 1, false);
> dep->flags = DWC3_EP_ENABLED;
> dwc->delayed_status = false;
> + dep->trb_enqueue = 0;
>
> if (!list_empty(&dep->pending_list)) {
> struct dwc3_request *req;
> @@ -240,7 +242,9 @@ static void dwc3_ep0_stall_and_restart(struct dwc3 *dwc)
> }
>
> dwc->ep0state = EP0_SETUP_PHASE;
> - dwc3_ep0_out_start(dwc);
> + complete(&dwc->ep0_in_setup);
> + if (dwc->softconnect)
> + dwc3_ep0_out_start(dwc);
> }
>
> int __dwc3_gadget_ep0_set_halt(struct usb_ep *ep, int value)
> @@ -272,8 +276,6 @@ void dwc3_ep0_out_start(struct dwc3 *dwc)
> struct dwc3_ep *dep;
> int ret;
>
> - complete(&dwc->ep0_in_setup);
> -
> dep = dwc->eps[0];
> dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 8,
> DWC3_TRBCTL_CONTROL_SETUP, false);
> @@ -922,7 +924,9 @@ static void dwc3_ep0_complete_status(struct dwc3 *dwc,
> dwc->setup_packet_pending = true;
>
> dwc->ep0state = EP0_SETUP_PHASE;
> - dwc3_ep0_out_start(dwc);
> + complete(&dwc->ep0_in_setup);
> + if (dwc->softconnect)
> + dwc3_ep0_out_start(dwc);
> }
>
> static void dwc3_ep0_xfer_complete(struct dwc3 *dwc,
> @@ -1073,7 +1077,7 @@ void dwc3_ep0_send_delayed_status(struct dwc3 *dwc)
> __dwc3_ep0_do_control_status(dwc, dwc->eps[direction]);
> }
>
> -static void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep)
> +void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep)
> {
> struct dwc3_gadget_ep_cmd_params params;
> u32 cmd;
> @@ -1083,7 +1087,7 @@ static void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep)
> return;
>
> cmd = DWC3_DEPCMD_ENDTRANSFER;
> - cmd |= DWC3_DEPCMD_CMDIOC;
> + cmd |= dwc->connected ? DWC3_DEPCMD_CMDIOC : DWC3_DEPCMD_HIPRI_FORCERM;
> cmd |= DWC3_DEPCMD_PARAM(dep->resource_index);
> memset(&params, 0, sizeof(params));
> ret = dwc3_send_gadget_ep_cmd(dep, cmd, &params);
> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
> index 183b90923f51..f6801199440c 100644
> --- a/drivers/usb/dwc3/gadget.c
> +++ b/drivers/usb/dwc3/gadget.c
> @@ -885,12 +885,13 @@ static int __dwc3_gadget_ep_enable(struct dwc3_ep *dep, unsigned int action)
> reg |= DWC3_DALEPENA_EP(dep->number);
> dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>
> - if (usb_endpoint_xfer_control(desc))
> - goto out;
> -
> /* Initialize the TRB ring */
> dep->trb_dequeue = 0;
> dep->trb_enqueue = 0;
> +
> + if (usb_endpoint_xfer_control(desc))
> + goto out;
> +
> memset(dep->trb_pool, 0,
> sizeof(struct dwc3_trb) * DWC3_TRB_NUM);
>
> @@ -2476,7 +2477,8 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
> * Per databook, when we want to stop the gadget, if a control transfer
> * is still in process, complete it and get the core into setup phase.
> */
> - if (!is_on && dwc->ep0state != EP0_SETUP_PHASE) {
> + if ((!is_on && (dwc->ep0state != EP0_SETUP_PHASE ||
> + dwc->ep0_next_event != DWC3_EP0_COMPLETE))) {
> reinit_completion(&dwc->ep0_in_setup);
>
> ret = wait_for_completion_timeout(&dwc->ep0_in_setup,
> @@ -2519,6 +2521,17 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
> u32 count;
>
> dwc->connected = false;
> +
> + /*
> + * Ensure no pending data/setup stages, and disable ep0 to
> + * block further EP0 transactions before stopping pending
> + * transfers.
> + */
> + dwc3_ep0_end_control_data(dwc, dwc->eps[1]);
> + dwc3_ep0_stall_and_restart(dwc);
> + __dwc3_gadget_ep_disable(dwc->eps[0]);
> + __dwc3_gadget_ep_disable(dwc->eps[1]);
> +
> /*
> * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
> * Section 4.1.8 Table 4-7, it states that for a device-initiated
> @@ -3600,8 +3613,10 @@ static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force,
> bool interrupt)
> {
> struct dwc3_gadget_ep_cmd_params params;
> + struct dwc3 *dwc = dep->dwc;
> u32 cmd;
> int ret;
> + int retries = 1;
>
> if (!(dep->flags & DWC3_EP_TRANSFER_STARTED) ||
> (dep->flags & DWC3_EP_END_TRANSFER_PENDING))
> @@ -3633,7 +3648,7 @@ static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force,
> *
> * This mode is NOT available on the DWC_usb31 IP.
> */
> -
> +retry:
> cmd = DWC3_DEPCMD_ENDTRANSFER;
> cmd |= force ? DWC3_DEPCMD_HIPRI_FORCERM : 0;
> cmd |= interrupt ? DWC3_DEPCMD_CMDIOC : 0;
> @@ -3641,6 +3656,23 @@ static void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force,
> memset(&params, 0, sizeof(params));
> ret = dwc3_send_gadget_ep_cmd(dep, cmd, &params);
> WARN_ON_ONCE(ret);
> + if (ret == -ETIMEDOUT) {
> + if (!dwc->connected) {
> + /*
> + * While the controller is in an active setup/control
> + * transfer, the HW is unable to service other eps
> + * potentially leading to an endxfer command timeout.
> + * It was recommended to ensure that there are no
> + * pending/cached setup packets stored in internal
> + * memory. Only way to achieve this is to issue another
> + * start transfer, and retry.
> + */
> + if (retries--) {
> + dwc3_ep0_out_start(dwc);
> + goto retry;
> + }
> + }
> + }
> dep->resource_index = 0;
>
> if (!interrupt)

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project

2022-03-23 23:35:35

by Wesley Cheng

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

Hi Thinh,

On 3/22/2022 6:31 PM, Thinh Nguyen wrote:
> Hi Wesley,
>
> Wesley Cheng wrote:
>> Hi Thinh,
>>
>> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>>> While running the pullup disable sequence, if there are pending SETUP
>>> transfers stored in the controller, then the ENDTRANSFER commands on non
>>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>>> to drain potentially cached SETUP packets, SW needs to issue a
>>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>>> properly stopped, the controller halt sequence will fail as well.
>>>
>>> One limitation is that the current logic will drop the SETUP data
>>> being received (ie dropping the SETUP packet), however, it should be
>>> acceptable in the pullup disable case, as the device is eventually
>>> going to disconnect from the host.
>>>
>>> Signed-off-by: Wesley Cheng <[email protected]>
>>> ---
>> Just wondering if you had any inputs for this particular change? I
>> think on the dequeue path discussion you had some concerns with parts of
>> this change? I think the difficult part for the pullup disable path is
>> that we have this API running w/ interrupts disabled, so the EP0 state
>> won't be able to advance compared to the dequeue case.
>
> This doesn't sound right. The pullup disable (or device initiated
> disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
> proceeding, which it does.
>
That is correct, but even though that check is passed, it doesn't
prevent the host from sending another SETUP token between the pending
setup packet check and run/stop clear.

>>
>> Ideally, if there is a way to ensure that we avoid reading further setup
>> packets, that would be nice, but from our discussions with Synopsys,
>> this was not possible. (controller is always armed and ready to ACK
>> setup tokens)
>>
>> I did evaluate keeping IRQs enabled and periodically releasing/attaining
>> the lock between the stop active xfer calls, but that opened another can
>> of worms. If you think this is the approach we should take, I can take
>> a look at this implementation further.
>>
>
> This patch doesn't look right to me. The change I suggested before
> should address this (I believe Greg already picked it up). What other
> problem do you see, I'm not clear what's the problem here. One potential
> problem that I can see is that currently dwc3 driver doesn't wait for
> active endpoints to complete/end before clearing the run_stop bit on
> device initiated disconnect, but I'm not sure if that's what you're seeing.
>
> Please help clarify further. If there's trace points log, that'd help.
>
Main difference between the issue Greg recently pulled in and this one
is that this is on the pullup disable patch (no dequeue involved). In
this situation we will also stop active transfers, so that the
controller can halt properly.

I attached a few files, and will give a summary of them below:
1. pullup_disable_timeout.txt - ftrace of an instance of when we see
several endxfer timeouts. Refer to line#2016.

2. lecroy+ftrace_snip.png - picture showing a matching bus sniffer log
and a ftrace collected (difference instance to #1)

#2 will show that we completed a SETUP transfer before entering into
dwc3_gadget_stop_active_transfers(). In here, we then see DWC ACK
another SETUP token, which leads to endxfer timeouts on all subsequent
endpoints.

Thanks
Wesley Cheng

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project


Attachments:
pullup_disable_timeout.txt (293.67 kB)
lecroy+ftrace_snip.png (139.55 kB)
Download all attachments

2022-03-24 20:54:16

by Thinh Nguyen

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

Hi Wesley,

Wesley Cheng wrote:
> Hi Thinh,
>
> On 3/22/2022 6:31 PM, Thinh Nguyen wrote:
>> Hi Wesley,
>>
>> Wesley Cheng wrote:
>>> Hi Thinh,
>>>
>>> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>>>> While running the pullup disable sequence, if there are pending SETUP
>>>> transfers stored in the controller, then the ENDTRANSFER commands on non
>>>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>>>> to drain potentially cached SETUP packets, SW needs to issue a
>>>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>>>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>>>> properly stopped, the controller halt sequence will fail as well.
>>>>
>>>> One limitation is that the current logic will drop the SETUP data
>>>> being received (ie dropping the SETUP packet), however, it should be
>>>> acceptable in the pullup disable case, as the device is eventually
>>>> going to disconnect from the host.
>>>>
>>>> Signed-off-by: Wesley Cheng <[email protected]>
>>>> ---
>>> Just wondering if you had any inputs for this particular change? I
>>> think on the dequeue path discussion you had some concerns with parts of
>>> this change? I think the difficult part for the pullup disable path is
>>> that we have this API running w/ interrupts disabled, so the EP0 state
>>> won't be able to advance compared to the dequeue case.
>>
>> This doesn't sound right. The pullup disable (or device initiated
>> disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
>> proceeding, which it does.
>>
> That is correct, but even though that check is passed, it doesn't
> prevent the host from sending another SETUP token between the pending
> setup packet check and run/stop clear.
>

That should be fine, because we would have the TRB ready for that SETUP
packet.

>>>
>>> Ideally, if there is a way to ensure that we avoid reading further setup
>>> packets, that would be nice, but from our discussions with Synopsys,
>>> this was not possible. (controller is always armed and ready to ACK
>>> setup tokens)
>>>
>>> I did evaluate keeping IRQs enabled and periodically releasing/attaining
>>> the lock between the stop active xfer calls, but that opened another can
>>> of worms. If you think this is the approach we should take, I can take
>>> a look at this implementation further.
>>>
>>
>> This patch doesn't look right to me. The change I suggested before
>> should address this (I believe Greg already picked it up). What other
>> problem do you see, I'm not clear what's the problem here. One potential
>> problem that I can see is that currently dwc3 driver doesn't wait for
>> active endpoints to complete/end before clearing the run_stop bit on
>> device initiated disconnect, but I'm not sure if that's what you're seeing.
>>
>> Please help clarify further. If there's trace points log, that'd help.
>>
> Main difference between the issue Greg recently pulled in and this one
> is that this is on the pullup disable patch (no dequeue involved). In
> this situation we will also stop active transfers, so that the
> controller can halt properly.
>
> I attached a few files, and will give a summary of them below:
> 1. pullup_disable_timeout.txt - ftrace of an instance of when we see
> several endxfer timeouts. Refer to line#2016.
>
> 2. lecroy+ftrace_snip.png - picture showing a matching bus sniffer log
> and a ftrace collected (difference instance to #1)
>
> #2 will show that we completed a SETUP transfer before entering into
> dwc3_gadget_stop_active_transfers(). In here, we then see DWC ACK
> another SETUP token, which leads to endxfer timeouts on all subsequent
> endpoints.

Thanks for the captures. I think the problem here is because the dwc3
driver disables the control endpoint. It shouldn't do that.

Can you try this:

diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index ab725d2262d6..f0b9ea353620 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -1010,12 +1010,12 @@ static int __dwc3_gadget_ep_disable(struct
dwc3_ep *dep)
if (dep->flags & DWC3_EP_STALL)
__dwc3_gadget_ep_set_halt(dep, 0, false);

- reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
- reg &= ~DWC3_DALEPENA_EP(dep->number);
- dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
-
- /* Clear out the ep descriptors for non-ep0 */
if (dep->number > 1) {
+ reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
+ reg &= ~DWC3_DALEPENA_EP(dep->number);
+ dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
+
+ /* Clear out the ep descriptors for non-ep0 */
dep->endpoint.comp_desc = NULL;
dep->endpoint.desc = NULL;
}


Thanks,
Thinh

2022-03-24 23:33:03

by Wesley Cheng

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

Hi Thinh,

On 3/23/2022 5:39 PM, Thinh Nguyen wrote:
> Hi Wesley,
>
> Wesley Cheng wrote:
>> Hi Thinh,
>>
>> On 3/22/2022 6:31 PM, Thinh Nguyen wrote:
>>> Hi Wesley,
>>>
>>> Wesley Cheng wrote:
>>>> Hi Thinh,
>>>>
>>>> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>>>>> While running the pullup disable sequence, if there are pending SETUP
>>>>> transfers stored in the controller, then the ENDTRANSFER commands on non
>>>>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>>>>> to drain potentially cached SETUP packets, SW needs to issue a
>>>>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>>>>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>>>>> properly stopped, the controller halt sequence will fail as well.
>>>>>
>>>>> One limitation is that the current logic will drop the SETUP data
>>>>> being received (ie dropping the SETUP packet), however, it should be
>>>>> acceptable in the pullup disable case, as the device is eventually
>>>>> going to disconnect from the host.
>>>>>
>>>>> Signed-off-by: Wesley Cheng <[email protected]>
>>>>> ---
>>>> Just wondering if you had any inputs for this particular change? I
>>>> think on the dequeue path discussion you had some concerns with parts of
>>>> this change? I think the difficult part for the pullup disable path is
>>>> that we have this API running w/ interrupts disabled, so the EP0 state
>>>> won't be able to advance compared to the dequeue case.
>>>
>>> This doesn't sound right. The pullup disable (or device initiated
>>> disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
>>> proceeding, which it does.
>>>
>> That is correct, but even though that check is passed, it doesn't
>> prevent the host from sending another SETUP token between the pending
>> setup packet check and run/stop clear.
>>
>
> That should be fine, because we would have the TRB ready for that SETUP
> packet.
>
I agree, its valid for the controller to be able to receive the next
setup packet.

>>>>
>>>> Ideally, if there is a way to ensure that we avoid reading further setup
>>>> packets, that would be nice, but from our discussions with Synopsys,
>>>> this was not possible. (controller is always armed and ready to ACK
>>>> setup tokens)
>>>>
>>>> I did evaluate keeping IRQs enabled and periodically releasing/attaining
>>>> the lock between the stop active xfer calls, but that opened another can
>>>> of worms. If you think this is the approach we should take, I can take
>>>> a look at this implementation further.
>>>>
>>>
>>> This patch doesn't look right to me. The change I suggested before
>>> should address this (I believe Greg already picked it up). What other
>>> problem do you see, I'm not clear what's the problem here. One potential
>>> problem that I can see is that currently dwc3 driver doesn't wait for
>>> active endpoints to complete/end before clearing the run_stop bit on
>>> device initiated disconnect, but I'm not sure if that's what you're seeing.
>>>
>>> Please help clarify further. If there's trace points log, that'd help.
>>>
>> Main difference between the issue Greg recently pulled in and this one
>> is that this is on the pullup disable patch (no dequeue involved). In
>> this situation we will also stop active transfers, so that the
>> controller can halt properly.
>>
>> I attached a few files, and will give a summary of them below:
>> 1. pullup_disable_timeout.txt - ftrace of an instance of when we see
>> several endxfer timeouts. Refer to line#2016.
>>
>> 2. lecroy+ftrace_snip.png - picture showing a matching bus sniffer log
>> and a ftrace collected (difference instance to #1)
>>
>> #2 will show that we completed a SETUP transfer before entering into
>> dwc3_gadget_stop_active_transfers(). In here, we then see DWC ACK
>> another SETUP token, which leads to endxfer timeouts on all subsequent
>> endpoints.
>
> Thanks for the captures. I think the problem here is because the dwc3
> driver disables the control endpoint. It shouldn't do that.
>
> Can you try this:
>
> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
> index ab725d2262d6..f0b9ea353620 100644
> --- a/drivers/usb/dwc3/gadget.c
> +++ b/drivers/usb/dwc3/gadget.c
> @@ -1010,12 +1010,12 @@ static int __dwc3_gadget_ep_disable(struct
> dwc3_ep *dep)
> if (dep->flags & DWC3_EP_STALL)
> __dwc3_gadget_ep_set_halt(dep, 0, false);
>
> - reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
> - reg &= ~DWC3_DALEPENA_EP(dep->number);
> - dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
> -
> - /* Clear out the ep descriptors for non-ep0 */
> if (dep->number > 1) {
> + reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
> + reg &= ~DWC3_DALEPENA_EP(dep->number);
> + dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
> +
> + /* Clear out the ep descriptors for non-ep0 */
> dep->endpoint.comp_desc = NULL;
> dep->endpoint.desc = NULL;
> }
>
I was able to reproduce the endxfer timeout w/ the above change. I'm
assuming you didn't want me to include any parts of my change while
testing, right?

Current sequence in dwc3_gadget_pullup(0) is that we should call
dwc3_stop_active_transfers() before we call __dwc3_gadget_stop().
(gadget stop will call the ep disable for EP[0] and EP[1]) This might
be why the issue would still be occurring.

The attached ftrace that captures a situation where a forced BUG will
occur on the first instance of the timeout.

[ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer
Not Ready [0] (Not Active) [Status Phase]
[ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd
'Start Transfer' [406] params 00000000 efffa000 00000000 --> status:
Successful
[ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End
Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out

We definitely started the STATUS phase, so host could have read it and
sent the next SETUP packet while we were already in the
dwc3_stop_active_transfers() loop.

Thanks
Wesley Cheng


Attachments:
a600000.ssusb.trace.txt (162.92 kB)

2022-03-25 04:29:27

by Thinh Nguyen

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

Thinh Nguyen wrote:
> Hi Wesley,
>
> Wesley Cheng wrote:
>> Hi Thinh,
>>
>> On 3/22/2022 6:31 PM, Thinh Nguyen wrote:
>>> Hi Wesley,
>>>
>>> Wesley Cheng wrote:
>>>> Hi Thinh,
>>>>
>>>> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>>>>> While running the pullup disable sequence, if there are pending SETUP
>>>>> transfers stored in the controller, then the ENDTRANSFER commands on non
>>>>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>>>>> to drain potentially cached SETUP packets, SW needs to issue a
>>>>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>>>>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>>>>> properly stopped, the controller halt sequence will fail as well.
>>>>>
>>>>> One limitation is that the current logic will drop the SETUP data
>>>>> being received (ie dropping the SETUP packet), however, it should be
>>>>> acceptable in the pullup disable case, as the device is eventually
>>>>> going to disconnect from the host.
>>>>>
>>>>> Signed-off-by: Wesley Cheng <[email protected]>
>>>>> ---
>>>> Just wondering if you had any inputs for this particular change? I
>>>> think on the dequeue path discussion you had some concerns with parts of
>>>> this change? I think the difficult part for the pullup disable path is
>>>> that we have this API running w/ interrupts disabled, so the EP0 state
>>>> won't be able to advance compared to the dequeue case.
>>>
>>> This doesn't sound right. The pullup disable (or device initiated
>>> disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
>>> proceeding, which it does.
>>>
>> That is correct, but even though that check is passed, it doesn't
>> prevent the host from sending another SETUP token between the pending
>> setup packet check and run/stop clear.
>>
>
> That should be fine, because we would have the TRB ready for that SETUP
> packet.
>
>>>>
>>>> Ideally, if there is a way to ensure that we avoid reading further setup
>>>> packets, that would be nice, but from our discussions with Synopsys,
>>>> this was not possible. (controller is always armed and ready to ACK
>>>> setup tokens)
>>>>
>>>> I did evaluate keeping IRQs enabled and periodically releasing/attaining
>>>> the lock between the stop active xfer calls, but that opened another can
>>>> of worms. If you think this is the approach we should take, I can take
>>>> a look at this implementation further.
>>>>
>>>
>>> This patch doesn't look right to me. The change I suggested before
>>> should address this (I believe Greg already picked it up). What other
>>> problem do you see, I'm not clear what's the problem here. One potential
>>> problem that I can see is that currently dwc3 driver doesn't wait for
>>> active endpoints to complete/end before clearing the run_stop bit on
>>> device initiated disconnect, but I'm not sure if that's what you're seeing.
>>>
>>> Please help clarify further. If there's trace points log, that'd help.
>>>
>> Main difference between the issue Greg recently pulled in and this one
>> is that this is on the pullup disable patch (no dequeue involved). In
>> this situation we will also stop active transfers, so that the
>> controller can halt properly.
>>
>> I attached a few files, and will give a summary of them below:
>> 1. pullup_disable_timeout.txt - ftrace of an instance of when we see
>> several endxfer timeouts. Refer to line#2016.
>>
>> 2. lecroy+ftrace_snip.png - picture showing a matching bus sniffer log
>> and a ftrace collected (difference instance to #1)
>>
>> #2 will show that we completed a SETUP transfer before entering into
>> dwc3_gadget_stop_active_transfers(). In here, we then see DWC ACK
>> another SETUP token, which leads to endxfer timeouts on all subsequent
>> endpoints.
>
> Thanks for the captures. I think the problem here is because the dwc3
> driver disables the control endpoint. It shouldn't do that.
>
> Can you try this:
>
> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
> index ab725d2262d6..f0b9ea353620 100644
> --- a/drivers/usb/dwc3/gadget.c
> +++ b/drivers/usb/dwc3/gadget.c
> @@ -1010,12 +1010,12 @@ static int __dwc3_gadget_ep_disable(struct
> dwc3_ep *dep)
> if (dep->flags & DWC3_EP_STALL)
> __dwc3_gadget_ep_set_halt(dep, 0, false);
>
> - reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
> - reg &= ~DWC3_DALEPENA_EP(dep->number);
> - dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
> -
> - /* Clear out the ep descriptors for non-ep0 */
> if (dep->number > 1) {
> + reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
> + reg &= ~DWC3_DALEPENA_EP(dep->number);
> + dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
> +
> + /* Clear out the ep descriptors for non-ep0 */
> dep->endpoint.comp_desc = NULL;
> dep->endpoint.desc = NULL;
> }
>

Also, don't issue End Transfer on control endpoint either.

Thanks,
Thinh

2022-03-25 19:38:57

by Thinh Nguyen

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

Wesley Cheng wrote:
> Hi Thinh,
>
> On 3/23/2022 5:39 PM, Thinh Nguyen wrote:
>> Hi Wesley,
>>
>> Wesley Cheng wrote:
>>> Hi Thinh,
>>>
>>> On 3/22/2022 6:31 PM, Thinh Nguyen wrote:
>>>> Hi Wesley,
>>>>
>>>> Wesley Cheng wrote:
>>>>> Hi Thinh,
>>>>>
>>>>> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>>>>>> While running the pullup disable sequence, if there are pending SETUP
>>>>>> transfers stored in the controller, then the ENDTRANSFER commands on non
>>>>>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>>>>>> to drain potentially cached SETUP packets, SW needs to issue a
>>>>>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>>>>>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>>>>>> properly stopped, the controller halt sequence will fail as well.
>>>>>>
>>>>>> One limitation is that the current logic will drop the SETUP data
>>>>>> being received (ie dropping the SETUP packet), however, it should be
>>>>>> acceptable in the pullup disable case, as the device is eventually
>>>>>> going to disconnect from the host.
>>>>>>
>>>>>> Signed-off-by: Wesley Cheng <[email protected]>
>>>>>> ---
>>>>> Just wondering if you had any inputs for this particular change? I
>>>>> think on the dequeue path discussion you had some concerns with parts of
>>>>> this change? I think the difficult part for the pullup disable path is
>>>>> that we have this API running w/ interrupts disabled, so the EP0 state
>>>>> won't be able to advance compared to the dequeue case.
>>>>
>>>> This doesn't sound right. The pullup disable (or device initiated
>>>> disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
>>>> proceeding, which it does.
>>>>
>>> That is correct, but even though that check is passed, it doesn't
>>> prevent the host from sending another SETUP token between the pending
>>> setup packet check and run/stop clear.
>>>
>>
>> That should be fine, because we would have the TRB ready for that SETUP
>> packet.
>>
> I agree, its valid for the controller to be able to receive the next
> setup packet.
>
>>>>>
>>>>> Ideally, if there is a way to ensure that we avoid reading further setup
>>>>> packets, that would be nice, but from our discussions with Synopsys,
>>>>> this was not possible. (controller is always armed and ready to ACK
>>>>> setup tokens)
>>>>>
>>>>> I did evaluate keeping IRQs enabled and periodically releasing/attaining
>>>>> the lock between the stop active xfer calls, but that opened another can
>>>>> of worms. If you think this is the approach we should take, I can take
>>>>> a look at this implementation further.
>>>>>
>>>>
>>>> This patch doesn't look right to me. The change I suggested before
>>>> should address this (I believe Greg already picked it up). What other
>>>> problem do you see, I'm not clear what's the problem here. One potential
>>>> problem that I can see is that currently dwc3 driver doesn't wait for
>>>> active endpoints to complete/end before clearing the run_stop bit on
>>>> device initiated disconnect, but I'm not sure if that's what you're seeing.
>>>>
>>>> Please help clarify further. If there's trace points log, that'd help.
>>>>
>>> Main difference between the issue Greg recently pulled in and this one
>>> is that this is on the pullup disable patch (no dequeue involved). In
>>> this situation we will also stop active transfers, so that the
>>> controller can halt properly.
>>>
>>> I attached a few files, and will give a summary of them below:
>>> 1. pullup_disable_timeout.txt - ftrace of an instance of when we see
>>> several endxfer timeouts. Refer to line#2016.
>>>
>>> 2. lecroy+ftrace_snip.png - picture showing a matching bus sniffer log
>>> and a ftrace collected (difference instance to #1)
>>>
>>> #2 will show that we completed a SETUP transfer before entering into
>>> dwc3_gadget_stop_active_transfers(). In here, we then see DWC ACK
>>> another SETUP token, which leads to endxfer timeouts on all subsequent
>>> endpoints.
>>
>> Thanks for the captures. I think the problem here is because the dwc3
>> driver disables the control endpoint. It shouldn't do that.
>>
>> Can you try this:
>>
>> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
>> index ab725d2262d6..f0b9ea353620 100644
>> --- a/drivers/usb/dwc3/gadget.c
>> +++ b/drivers/usb/dwc3/gadget.c
>> @@ -1010,12 +1010,12 @@ static int __dwc3_gadget_ep_disable(struct
>> dwc3_ep *dep)
>> if (dep->flags & DWC3_EP_STALL)
>> __dwc3_gadget_ep_set_halt(dep, 0, false);
>>
>> - reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
>> - reg &= ~DWC3_DALEPENA_EP(dep->number);
>> - dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>> -
>> - /* Clear out the ep descriptors for non-ep0 */
>> if (dep->number > 1) {
>> + reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
>> + reg &= ~DWC3_DALEPENA_EP(dep->number);
>> + dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>> +
>> + /* Clear out the ep descriptors for non-ep0 */
>> dep->endpoint.comp_desc = NULL;
>> dep->endpoint.desc = NULL;
>> }
>>
> I was able to reproduce the endxfer timeout w/ the above change. I'm
> assuming you didn't want me to include any parts of my change while
> testing, right?
>

Right, please don't add additional code. I can't review knowing what
else was changed.

> Current sequence in dwc3_gadget_pullup(0) is that we should call
> dwc3_stop_active_transfers() before we call __dwc3_gadget_stop().
> (gadget stop will call the ep disable for EP[0] and EP[1]) This might
> be why the issue would still be occurring.
>
> The attached ftrace that captures a situation where a forced BUG will
> occur on the first instance of the timeout.
>
> [ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer
> Not Ready [0] (Not Active) [Status Phase]
> [ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd
> 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status:
> Successful
> [ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End
> Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out
>
> We definitely started the STATUS phase, so host could have read it and
> sent the next SETUP packet while we were already in the
> dwc3_stop_active_transfers() loop.

Yeah, pullup() is another async call that can trigger the same problem.



[ 1573.687193134 0x7248836e8] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
[ 1573.687263291 0x724883c2b] event (0000c040): ep0out: Transfer Complete (sIL) [Setup Phase]
[ 1573.687287145 0x724883df6] dbg_trace_log_ctrl: Clear Interface Feature(Function Suspend)

Trace indicates that this is ClearFeature(Interface.FuncSuspend)


[ 1573.687372301 0x724884459] dbg_send_ep_cmd: ep0in: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
[ 1573.687400634 0x724884679] event (000010c2): ep0in: Transfer Not Ready [0] (Not Active) [Data Phase]
[ 1573.687471155 0x724884bc2] event (000090c2): ep0in: Transfer Not Ready [0] (Active) [Data Phase]
[ 1573.687495009 0x724884d8d] event (0000c042): ep0in: Transfer Complete (sIL) [Data Phase]
[ 1573.687544697 0x724885146] dbg_gadget_giveback: ep0out: req ffffff87a6768800 length 20/20 ZsI ==> 0

But there's data stage. Is this non-standard/vendor request?
We probably need to fix the tracepoint to properly print vendor
requests.


[ 1573.687593187 0x7248854e9] dbg_ep_queue: ep5in: req ffffff87a6769000 length 0/8 zsI ==> -115
[ 1573.687619645 0x7248856e6] dbg_prepare: ep5in: trb ffffffc05a97d1a0 (E27:D26) buf 00000000ee07f600 size 1x 8 ctrl 00000811 (Hlcs:sC:normal)
[ 1573.687660270 0x7248859f2] dbg_send_ep_cmd: ep5in: cmd 'Update Transfer' [b0007] params 00000000 00000000 00000000 --> status: Successful
[ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer Not Ready [0] (Not Active) [Status Phase]
[ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful

It's still in status stage, it never reached Setup phase.


[ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out

That's why it's possible to time out here. The Setup Phase should
have been prepared before we can issue End Transfer.

I can't see where the pullup() begins as this log is missing register
read/write trace events. Looks like the wait_for_completion_timeout()
is insufficient. I assume it happens right after previous Setup
phase. There's no spin_lock, so the driver can proceed with the
new control request before it reaches dwc3_stop_active_transfers().

Can you try this. (yes, the code looks ugly right now, but it's just
to confirm my suspicion)

diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index ab725d2262d6..3b40411556ff 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -2506,6 +2506,8 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)

is_on = !!is_on;
dwc->softconnect = is_on;
+
+retry:
/*
* Per databook, when we want to stop the gadget, if a control transfer
* is still in process, complete it and get the core into setup phase.
@@ -2541,6 +2543,24 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
return 0;
}

+ if (!is_on) {
+ spin_lock_irqsave(&dwc->lock, flags);
+ if (dwc->ep0state != EP0_SETUP_PHASE) {
+ spin_unlock_irqrestore(&dwc->lock, flags);
+ goto retry;
+ }
+
+ /*
+ * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
+ * Section 4.1.8 Table 4-7, it states that for a device-initiated
+ * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
+ * command for any active transfers" before clearing the RunStop
+ * bit.
+ */
+ dwc3_stop_active_transfers(dwc);
+ spin_unlock_irqrestore(&dwc->lock, flags);
+ }
+
/*
* Synchronize and disable any further event handling while controller
* is being enabled/disabled.
@@ -2553,14 +2573,6 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
u32 count;

dwc->connected = false;
- /*
- * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
- * Section 4.1.8 Table 4-7, it states that for a device-initiated
- * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
- * command for any active transfers" before clearing the RunStop
- * bit.
- */
- dwc3_stop_active_transfers(dwc);
__dwc3_gadget_stop(dwc);

/*

Thanks,
Thinh

2022-03-25 20:27:11

by Wesley Cheng

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers



On 3/24/2022 6:51 PM, Thinh Nguyen wrote:
> Wesley Cheng wrote:
>> Hi Thinh,
>>
>> On 3/23/2022 5:39 PM, Thinh Nguyen wrote:
>>> Hi Wesley,
>>>
>>> Wesley Cheng wrote:
>>>> Hi Thinh,
>>>>
>>>> On 3/22/2022 6:31 PM, Thinh Nguyen wrote:
>>>>> Hi Wesley,
>>>>>
>>>>> Wesley Cheng wrote:
>>>>>> Hi Thinh,
>>>>>>
>>>>>> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>>>>>>> While running the pullup disable sequence, if there are pending SETUP
>>>>>>> transfers stored in the controller, then the ENDTRANSFER commands on non
>>>>>>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>>>>>>> to drain potentially cached SETUP packets, SW needs to issue a
>>>>>>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>>>>>>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>>>>>>> properly stopped, the controller halt sequence will fail as well.
>>>>>>>
>>>>>>> One limitation is that the current logic will drop the SETUP data
>>>>>>> being received (ie dropping the SETUP packet), however, it should be
>>>>>>> acceptable in the pullup disable case, as the device is eventually
>>>>>>> going to disconnect from the host.
>>>>>>>
>>>>>>> Signed-off-by: Wesley Cheng <[email protected]>
>>>>>>> ---
>>>>>> Just wondering if you had any inputs for this particular change? I
>>>>>> think on the dequeue path discussion you had some concerns with parts of
>>>>>> this change? I think the difficult part for the pullup disable path is
>>>>>> that we have this API running w/ interrupts disabled, so the EP0 state
>>>>>> won't be able to advance compared to the dequeue case.
>>>>>
>>>>> This doesn't sound right. The pullup disable (or device initiated
>>>>> disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
>>>>> proceeding, which it does.
>>>>>
>>>> That is correct, but even though that check is passed, it doesn't
>>>> prevent the host from sending another SETUP token between the pending
>>>> setup packet check and run/stop clear.
>>>>
>>>
>>> That should be fine, because we would have the TRB ready for that SETUP
>>> packet.
>>>
>> I agree, its valid for the controller to be able to receive the next
>> setup packet.
>>
>>>>>>
>>>>>> Ideally, if there is a way to ensure that we avoid reading further setup
>>>>>> packets, that would be nice, but from our discussions with Synopsys,
>>>>>> this was not possible. (controller is always armed and ready to ACK
>>>>>> setup tokens)
>>>>>>
>>>>>> I did evaluate keeping IRQs enabled and periodically releasing/attaining
>>>>>> the lock between the stop active xfer calls, but that opened another can
>>>>>> of worms. If you think this is the approach we should take, I can take
>>>>>> a look at this implementation further.
>>>>>>
>>>>>
>>>>> This patch doesn't look right to me. The change I suggested before
>>>>> should address this (I believe Greg already picked it up). What other
>>>>> problem do you see, I'm not clear what's the problem here. One potential
>>>>> problem that I can see is that currently dwc3 driver doesn't wait for
>>>>> active endpoints to complete/end before clearing the run_stop bit on
>>>>> device initiated disconnect, but I'm not sure if that's what you're seeing.
>>>>>
>>>>> Please help clarify further. If there's trace points log, that'd help.
>>>>>
>>>> Main difference between the issue Greg recently pulled in and this one
>>>> is that this is on the pullup disable patch (no dequeue involved). In
>>>> this situation we will also stop active transfers, so that the
>>>> controller can halt properly.
>>>>
>>>> I attached a few files, and will give a summary of them below:
>>>> 1. pullup_disable_timeout.txt - ftrace of an instance of when we see
>>>> several endxfer timeouts. Refer to line#2016.
>>>>
>>>> 2. lecroy+ftrace_snip.png - picture showing a matching bus sniffer log
>>>> and a ftrace collected (difference instance to #1)
>>>>
>>>> #2 will show that we completed a SETUP transfer before entering into
>>>> dwc3_gadget_stop_active_transfers(). In here, we then see DWC ACK
>>>> another SETUP token, which leads to endxfer timeouts on all subsequent
>>>> endpoints.
>>>
>>> Thanks for the captures. I think the problem here is because the dwc3
>>> driver disables the control endpoint. It shouldn't do that.
>>>
>>> Can you try this:
>>>
>>> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
>>> index ab725d2262d6..f0b9ea353620 100644
>>> --- a/drivers/usb/dwc3/gadget.c
>>> +++ b/drivers/usb/dwc3/gadget.c
>>> @@ -1010,12 +1010,12 @@ static int __dwc3_gadget_ep_disable(struct
>>> dwc3_ep *dep)
>>> if (dep->flags & DWC3_EP_STALL)
>>> __dwc3_gadget_ep_set_halt(dep, 0, false);
>>>
>>> - reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
>>> - reg &= ~DWC3_DALEPENA_EP(dep->number);
>>> - dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>>> -
>>> - /* Clear out the ep descriptors for non-ep0 */
>>> if (dep->number > 1) {
>>> + reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
>>> + reg &= ~DWC3_DALEPENA_EP(dep->number);
>>> + dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>>> +
>>> + /* Clear out the ep descriptors for non-ep0 */
>>> dep->endpoint.comp_desc = NULL;
>>> dep->endpoint.desc = NULL;
>>> }
>>>
>> I was able to reproduce the endxfer timeout w/ the above change. I'm
>> assuming you didn't want me to include any parts of my change while
>> testing, right?
>>
Hi Thinh,

>
> Right, please don't add additional code. I can't review knowing what
> else was changed.
>
Sure. There was one change from my patch that I pulled over, which
resolved some enumeration issues I saw (while testing your change, as well):

@@ -885,12 +885,13 @@ static int __dwc3_gadget_ep_enable(struct dwc3_ep
*dep, unsigned int action)
reg |= DWC3_DALEPENA_EP(dep->number);
dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);

- if (usb_endpoint_xfer_control(desc))
- goto out;
-
/* Initialize the TRB ring */
dep->trb_dequeue = 0;
dep->trb_enqueue = 0;
+
+ if (usb_endpoint_xfer_control(desc))
+ goto out;
+
memset(dep->trb_pool, 0,
sizeof(struct dwc3_trb) * DWC3_TRB_NUM);

This was added because I saw that there were instances where the SETUP
TRB was referencing dwc->ep0_trb[1]. However, I believe that for the
SETUP phase, we should always be using dwc->ep0_trb[0] as we are passing
in the ep0_trb_addr (DMA addr for ep0_trb[0]) as part of
dwc3_ep0_start_trans()

>> Current sequence in dwc3_gadget_pullup(0) is that we should call
>> dwc3_stop_active_transfers() before we call __dwc3_gadget_stop().
>> (gadget stop will call the ep disable for EP[0] and EP[1]) This might
>> be why the issue would still be occurring.
>>
>> The attached ftrace that captures a situation where a forced BUG will
>> occur on the first instance of the timeout.
>>
>> [ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer
>> Not Ready [0] (Not Active) [Status Phase]
>> [ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd
>> 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status:
>> Successful
>> [ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End
>> Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out
>>
>> We definitely started the STATUS phase, so host could have read it and
>> sent the next SETUP packet while we were already in the
>> dwc3_stop_active_transfers() loop.
>
> Yeah, pullup() is another async call that can trigger the same problem.
>
>
>
> [ 1573.687193134 0x7248836e8] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
> [ 1573.687263291 0x724883c2b] event (0000c040): ep0out: Transfer Complete (sIL) [Setup Phase]
> [ 1573.687287145 0x724883df6] dbg_trace_log_ctrl: Clear Interface Feature(Function Suspend)
>
> Trace indicates that this is ClearFeature(Interface.FuncSuspend)
>
>
> [ 1573.687372301 0x724884459] dbg_send_ep_cmd: ep0in: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
> [ 1573.687400634 0x724884679] event (000010c2): ep0in: Transfer Not Ready [0] (Not Active) [Data Phase]
> [ 1573.687471155 0x724884bc2] event (000090c2): ep0in: Transfer Not Ready [0] (Active) [Data Phase]
> [ 1573.687495009 0x724884d8d] event (0000c042): ep0in: Transfer Complete (sIL) [Data Phase]
> [ 1573.687544697 0x724885146] dbg_gadget_giveback: ep0out: req ffffff87a6768800 length 20/20 ZsI ==> 0
>
> But there's data stage. Is this non-standard/vendor request?
> We probably need to fix the tracepoint to properly print vendor
> requests.
>
Correct, its most likely a vendor specific command handled by our
function driver.

>
> [ 1573.687593187 0x7248854e9] dbg_ep_queue: ep5in: req ffffff87a6769000 length 0/8 zsI ==> -115
> [ 1573.687619645 0x7248856e6] dbg_prepare: ep5in: trb ffffffc05a97d1a0 (E27:D26) buf 00000000ee07f600 size 1x 8 ctrl 00000811 (Hlcs:sC:normal)
> [ 1573.687660270 0x7248859f2] dbg_send_ep_cmd: ep5in: cmd 'Update Transfer' [b0007] params 00000000 00000000 00000000 --> status: Successful
> [ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer Not Ready [0] (Not Active) [Status Phase]
> [ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
>
> It's still in status stage, it never reached Setup phase.
>
>
> [ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out
>
> That's why it's possible to time out here. The Setup Phase should
> have been prepared before we can issue End Transfer.
>
> I can't see where the pullup() begins as this log is missing register
> read/write trace events. Looks like the wait_for_completion_timeout()
> is insufficient. I assume it happens right after previous Setup
Yes, I've noticed that the wait_for_completion() call isn't too reliable
as it isn't under a spinlock. This mechanism worked well for the
dequeue case since we checked for the EP0 state while the lock was held.

> phase. There's no spin_lock, so the driver can proceed with the
> new control request before it reaches dwc3_stop_active_transfers().
>
> Can you try this. (yes, the code looks ugly right now, but it's just
> to confirm my suspicion)
>
Sure, I think I did something earlier as well, where I just manually
polled for EP0state to reach back to SETUP, but there was always a
possibility for host to initiate a SETUP packet while we loop across all
15 eps. If this doesn't work, I can take it a step further and check
EP0 state in between each stop active transfer call. I'll try to repro
it w/ this change and get you some traces.

Thanks
Wesley Cheng

> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
> index ab725d2262d6..3b40411556ff 100644
> --- a/drivers/usb/dwc3/gadget.c
> +++ b/drivers/usb/dwc3/gadget.c
> @@ -2506,6 +2506,8 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
>
> is_on = !!is_on;
> dwc->softconnect = is_on;
> +
> +retry:
> /*
> * Per databook, when we want to stop the gadget, if a control transfer
> * is still in process, complete it and get the core into setup phase.
> @@ -2541,6 +2543,24 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
> return 0;
> }
>
> + if (!is_on) {
> + spin_lock_irqsave(&dwc->lock, flags);
> + if (dwc->ep0state != EP0_SETUP_PHASE) {
> + spin_unlock_irqrestore(&dwc->lock, flags);
> + goto retry;
> + }
> +
> + /*
> + * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
> + * Section 4.1.8 Table 4-7, it states that for a device-initiated
> + * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
> + * command for any active transfers" before clearing the RunStop
> + * bit.
> + */
> + dwc3_stop_active_transfers(dwc);
> + spin_unlock_irqrestore(&dwc->lock, flags);
> + }
> +
> /*
> * Synchronize and disable any further event handling while controller
> * is being enabled/disabled.
> @@ -2553,14 +2573,6 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
> u32 count;
>
> dwc->connected = false;
> - /*
> - * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
> - * Section 4.1.8 Table 4-7, it states that for a device-initiated
> - * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
> - * command for any active transfers" before clearing the RunStop
> - * bit.
> - */
> - dwc3_stop_active_transfers(dwc);
> __dwc3_gadget_stop(dwc);
>
> /*
>
> Thanks,
> Thinh

2022-04-02 13:55:37

by Wesley Cheng

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers



On 3/24/2022 6:51 PM, Thinh Nguyen wrote:
> Wesley Cheng wrote:
>> Hi Thinh,
>>
>> On 3/23/2022 5:39 PM, Thinh Nguyen wrote:
>>> Hi Wesley,
>>>
>>> Wesley Cheng wrote:
>>>> Hi Thinh,
>>>>
>>>> On 3/22/2022 6:31 PM, Thinh Nguyen wrote:
>>>>> Hi Wesley,
>>>>>
>>>>> Wesley Cheng wrote:
>>>>>> Hi Thinh,
>>>>>>
>>>>>> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>>>>>>> While running the pullup disable sequence, if there are pending SETUP
>>>>>>> transfers stored in the controller, then the ENDTRANSFER commands on non
>>>>>>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>>>>>>> to drain potentially cached SETUP packets, SW needs to issue a
>>>>>>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>>>>>>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>>>>>>> properly stopped, the controller halt sequence will fail as well.
>>>>>>>
>>>>>>> One limitation is that the current logic will drop the SETUP data
>>>>>>> being received (ie dropping the SETUP packet), however, it should be
>>>>>>> acceptable in the pullup disable case, as the device is eventually
>>>>>>> going to disconnect from the host.
>>>>>>>
>>>>>>> Signed-off-by: Wesley Cheng <[email protected]>
>>>>>>> ---
>>>>>> Just wondering if you had any inputs for this particular change? I
>>>>>> think on the dequeue path discussion you had some concerns with parts of
>>>>>> this change? I think the difficult part for the pullup disable path is
>>>>>> that we have this API running w/ interrupts disabled, so the EP0 state
>>>>>> won't be able to advance compared to the dequeue case.
>>>>>
>>>>> This doesn't sound right. The pullup disable (or device initiated
>>>>> disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
>>>>> proceeding, which it does.
>>>>>
>>>> That is correct, but even though that check is passed, it doesn't
>>>> prevent the host from sending another SETUP token between the pending
>>>> setup packet check and run/stop clear.
>>>>
>>>
>>> That should be fine, because we would have the TRB ready for that SETUP
>>> packet.
>>>
>> I agree, its valid for the controller to be able to receive the next
>> setup packet.
>>
>>>>>>
>>>>>> Ideally, if there is a way to ensure that we avoid reading further setup
>>>>>> packets, that would be nice, but from our discussions with Synopsys,
>>>>>> this was not possible. (controller is always armed and ready to ACK
>>>>>> setup tokens)
>>>>>>
>>>>>> I did evaluate keeping IRQs enabled and periodically releasing/attaining
>>>>>> the lock between the stop active xfer calls, but that opened another can
>>>>>> of worms. If you think this is the approach we should take, I can take
>>>>>> a look at this implementation further.
>>>>>>
>>>>>
>>>>> This patch doesn't look right to me. The change I suggested before
>>>>> should address this (I believe Greg already picked it up). What other
>>>>> problem do you see, I'm not clear what's the problem here. One potential
>>>>> problem that I can see is that currently dwc3 driver doesn't wait for
>>>>> active endpoints to complete/end before clearing the run_stop bit on
>>>>> device initiated disconnect, but I'm not sure if that's what you're seeing.
>>>>>
>>>>> Please help clarify further. If there's trace points log, that'd help.
>>>>>
>>>> Main difference between the issue Greg recently pulled in and this one
>>>> is that this is on the pullup disable patch (no dequeue involved). In
>>>> this situation we will also stop active transfers, so that the
>>>> controller can halt properly.
>>>>
>>>> I attached a few files, and will give a summary of them below:
>>>> 1. pullup_disable_timeout.txt - ftrace of an instance of when we see
>>>> several endxfer timeouts. Refer to line#2016.
>>>>
>>>> 2. lecroy+ftrace_snip.png - picture showing a matching bus sniffer log
>>>> and a ftrace collected (difference instance to #1)
>>>>
>>>> #2 will show that we completed a SETUP transfer before entering into
>>>> dwc3_gadget_stop_active_transfers(). In here, we then see DWC ACK
>>>> another SETUP token, which leads to endxfer timeouts on all subsequent
>>>> endpoints.
>>>
>>> Thanks for the captures. I think the problem here is because the dwc3
>>> driver disables the control endpoint. It shouldn't do that.
>>>
>>> Can you try this:
>>>
>>> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
>>> index ab725d2262d6..f0b9ea353620 100644
>>> --- a/drivers/usb/dwc3/gadget.c
>>> +++ b/drivers/usb/dwc3/gadget.c
>>> @@ -1010,12 +1010,12 @@ static int __dwc3_gadget_ep_disable(struct
>>> dwc3_ep *dep)
>>> if (dep->flags & DWC3_EP_STALL)
>>> __dwc3_gadget_ep_set_halt(dep, 0, false);
>>>
>>> - reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
>>> - reg &= ~DWC3_DALEPENA_EP(dep->number);
>>> - dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>>> -
>>> - /* Clear out the ep descriptors for non-ep0 */
>>> if (dep->number > 1) {
>>> + reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
>>> + reg &= ~DWC3_DALEPENA_EP(dep->number);
>>> + dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>>> +
>>> + /* Clear out the ep descriptors for non-ep0 */
>>> dep->endpoint.comp_desc = NULL;
>>> dep->endpoint.desc = NULL;
>>> }
>>>
>> I was able to reproduce the endxfer timeout w/ the above change. I'm
>> assuming you didn't want me to include any parts of my change while
>> testing, right?
>>
>
> Right, please don't add additional code. I can't review knowing what
> else was changed.
>
>> Current sequence in dwc3_gadget_pullup(0) is that we should call
>> dwc3_stop_active_transfers() before we call __dwc3_gadget_stop().
>> (gadget stop will call the ep disable for EP[0] and EP[1]) This might
>> be why the issue would still be occurring.
>>
>> The attached ftrace that captures a situation where a forced BUG will
>> occur on the first instance of the timeout.
>>
>> [ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer
>> Not Ready [0] (Not Active) [Status Phase]
>> [ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd
>> 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status:
>> Successful
>> [ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End
>> Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out
>>
>> We definitely started the STATUS phase, so host could have read it and
>> sent the next SETUP packet while we were already in the
>> dwc3_stop_active_transfers() loop.
>
> Yeah, pullup() is another async call that can trigger the same problem.
>
>
>
> [ 1573.687193134 0x7248836e8] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
> [ 1573.687263291 0x724883c2b] event (0000c040): ep0out: Transfer Complete (sIL) [Setup Phase]
> [ 1573.687287145 0x724883df6] dbg_trace_log_ctrl: Clear Interface Feature(Function Suspend)
>
> Trace indicates that this is ClearFeature(Interface.FuncSuspend)
>
>
> [ 1573.687372301 0x724884459] dbg_send_ep_cmd: ep0in: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
> [ 1573.687400634 0x724884679] event (000010c2): ep0in: Transfer Not Ready [0] (Not Active) [Data Phase]
> [ 1573.687471155 0x724884bc2] event (000090c2): ep0in: Transfer Not Ready [0] (Active) [Data Phase]
> [ 1573.687495009 0x724884d8d] event (0000c042): ep0in: Transfer Complete (sIL) [Data Phase]
> [ 1573.687544697 0x724885146] dbg_gadget_giveback: ep0out: req ffffff87a6768800 length 20/20 ZsI ==> 0
>
> But there's data stage. Is this non-standard/vendor request?
> We probably need to fix the tracepoint to properly print vendor
> requests.
>
>
> [ 1573.687593187 0x7248854e9] dbg_ep_queue: ep5in: req ffffff87a6769000 length 0/8 zsI ==> -115
> [ 1573.687619645 0x7248856e6] dbg_prepare: ep5in: trb ffffffc05a97d1a0 (E27:D26) buf 00000000ee07f600 size 1x 8 ctrl 00000811 (Hlcs:sC:normal)
> [ 1573.687660270 0x7248859f2] dbg_send_ep_cmd: ep5in: cmd 'Update Transfer' [b0007] params 00000000 00000000 00000000 --> status: Successful
> [ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer Not Ready [0] (Not Active) [Status Phase]
> [ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
>
> It's still in status stage, it never reached Setup phase.
>
>
> [ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out
>
> That's why it's possible to time out here. The Setup Phase should
> have been prepared before we can issue End Transfer.
>
> I can't see where the pullup() begins as this log is missing register
> read/write trace events. Looks like the wait_for_completion_timeout()
> is insufficient. I assume it happens right after previous Setup
> phase. There's no spin_lock, so the driver can proceed with the
> new control request before it reaches dwc3_stop_active_transfers().
>
> Can you try this. (yes, the code looks ugly right now, but it's just
> to confirm my suspicion)
>

Hi Thinh,

So I tried a few things, and will highlight the results below.

#1 Changes recommended below:
- Ran into a situation where there was still a potential SETUP packet
being received while looping through the available EPs.
- Attached the log to this email. Please refer to the notations near
the end of the trace (testchanges_trace.txt):

//Last SETUP transaction before dwc3_gadget_stop_active_transfers()
//dwc3_gadet_stop_active_transfers() loop starting below w/ ep1out
[ 15144.915547414 0x43cfbf36a2] dbg_send_ep_cmd: ep1out: cmd
'End Transfer' [20c08] params 00000000 00000000 00000000 --> status:
Successful
[ 15144.915585487 0x43cfbf397c] dbg_gadget_giveback: ep1out: req
ffffff87a5f89700 length 0/16384 zsI ==> -108
[ 15144.915619966 0x43cfbf3c13] event (0000c040): ep0out:
Transfer Complete (sIL) [Setup Phase]
[ 15144.915662153 0x43cfbf3f3d] dbg_trace_log_ctrl: Get String
Descriptor(Index = 9, Length = 510)
...

//Finished 'Get String Descriptor" packet, and able to receive next SETUP
...
Transfer' [f0c08] params 00000000 00000000 00000000 --> status: Timed Out

> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
> index ab725d2262d6..3b40411556ff 100644
> --- a/drivers/usb/dwc3/gadget.c
> +++ b/drivers/usb/dwc3/gadget.c
> @@ -2506,6 +2506,8 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
>
> is_on = !!is_on;
> dwc->softconnect = is_on;
> +
> +retry:
> /*
> * Per databook, when we want to stop the gadget, if a control transfer
> * is still in process, complete it and get the core into setup phase.
> @@ -2541,6 +2543,24 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
> return 0;
> }
>
> + if (!is_on) {
> + spin_lock_irqsave(&dwc->lock, flags);
> + if (dwc->ep0state != EP0_SETUP_PHASE) {
> + spin_unlock_irqrestore(&dwc->lock, flags);
> + goto retry;
> + }
> +
> + /*
> + * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
> + * Section 4.1.8 Table 4-7, it states that for a device-initiated
> + * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
> + * command for any active transfers" before clearing the RunStop
> + * bit.
> + */
> + dwc3_stop_active_transfers(dwc);
> + spin_unlock_irqrestore(&dwc->lock, flags);
> + }
> +
> /*
> * Synchronize and disable any further event handling while controller
> * is being enabled/disabled.
> @@ -2553,14 +2573,6 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
> u32 count;
>
> dwc->connected = false;
> - /*
> - * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
> - * Section 4.1.8 Table 4-7, it states that for a device-initiated
> - * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
> - * command for any active transfers" before clearing the RunStop
> - * bit.
> - */
> - dwc3_stop_active_transfers(dwc);
> __dwc3_gadget_stop(dwc);
>
> /*
>

#2 I built on top of the changes you suggested, and added the ep0state
check within dwc3_stop_active_transfers(). Utilized the
DWC3_EP_DELAY_STOP (flag from the dequeue endxfer patch) to have the ep0
driver issue the endxfer. With the changes below, I haven't seen the
failure so far. Anyway, this is just the initial version of the changes
I've made, but I think its a cleaner approach than the changes that were
submitted in the RFC.

Also, with releasing the lock here, I had to add the changes we had
previously proposed here as well to prevent a list corruption:

https://lore.kernel.org/linux-usb/[email protected]/

https://lore.kernel.org/linux-usb/[email protected]/


diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c
index 1064be5518f6..aba3491eadf9 100644
--- a/drivers/usb/dwc3/ep0.c
+++ b/drivers/usb/dwc3/ep0.c
@@ -273,8 +273,6 @@ void dwc3_ep0_out_start(struct dwc3 *dwc)
int ret;
int i;

- complete(&dwc->ep0_in_setup);
-
dep = dwc->eps[0];
dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 8,
DWC3_TRBCTL_CONTROL_SETUP, false);
@@ -291,8 +289,10 @@ void dwc3_ep0_out_start(struct dwc3 *dwc)
continue;

dwc3_ep->flags &= ~DWC3_EP_DELAY_STOP;
- dwc3_stop_active_transfer(dwc3_ep, true, true);
+ dwc3_stop_active_transfer(dwc3_ep, true, dwc->softconnect ?
+ true : false);
}
+ complete(&dwc->ep0_in_setup);
}

static struct dwc3_ep *dwc3_wIndex_to_dep(struct dwc3 *dwc, __le16
wIndex_le)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index b0b225b157b4..86f81e5f66ba 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -882,12 +882,13 @@ static int __dwc3_gadget_ep_enable(struct dwc3_ep
*dep, unsigned int action)
reg |= DWC3_DALEPENA_EP(dep->number);
dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);

- if (usb_endpoint_xfer_control(desc))
- goto out;
-
/* Initialize the TRB ring */
dep->trb_dequeue = 0;
dep->trb_enqueue = 0;
+
+ if (usb_endpoint_xfer_control(desc))
+ goto out;
+
memset(dep->trb_pool, 0,
sizeof(struct dwc3_trb) * DWC3_TRB_NUM);

@@ -1889,7 +1890,8 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep
*dep, struct dwc3_request *req)
{
struct dwc3 *dwc = dep->dwc;

- if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected) {
+ if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected ||
+ !dwc->softconnect) {
dev_dbg(dwc->dev, "%s: can't queue to disabled endpoint\n",
dep->name);
return -ESHUTDOWN;
@@ -2348,6 +2350,7 @@ static int dwc3_gadget_set_selfpowered(struct
usb_gadget *g,

static void dwc3_stop_active_transfers(struct dwc3 *dwc)
{
+ int ret;
u32 epnum;

for (epnum = 2; epnum < dwc->num_eps; epnum++) {
@@ -2357,6 +2360,19 @@ static void dwc3_stop_active_transfers(struct
dwc3 *dwc)
if (!dep)
continue;

+ if (dwc->ep0state != EP0_SETUP_PHASE) {
+ dep->flags |= DWC3_EP_DELAY_STOP;
+ spin_unlock(&dwc->lock);
+
+ reinit_completion(&dwc->ep0_in_setup);
+
+ ret = wait_for_completion_timeout(&dwc->ep0_in_setup,
+ msecs_to_jiffies(DWC3_PULL_UP_TIMEOUT));
+ if (ret == 0)
+ dev_warn(dwc->dev, "timed out waiting for SETUP phase\n");
+ spin_lock(&dwc->lock);
+ }
+
dwc3_remove_requests(dwc, dep);
}
}
@@ -2547,6 +2563,19 @@ static int dwc3_gadget_pullup(struct usb_gadget
*g, int is_on)
return 0;
}

+ if (!is_on) {
+ spin_lock_irqsave(&dwc->lock, flags);
+ /*
+ * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
+ * Section 4.1.8 Table 4-7, it states that for a device-initiated
+ * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
+ * command for any active transfers" before clearing the RunStop
+ * bit.
+ */
+ dwc3_stop_active_transfers(dwc);
+ spin_unlock_irqrestore(&dwc->lock, flags);
+ }
+
/*
* Synchronize and disable any further event handling while controller
* is being enabled/disabled.
@@ -2559,14 +2588,6 @@ static int dwc3_gadget_pullup(struct usb_gadget
*g, int is_on)
u32 count;

dwc->connected = false;
- /*
- * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
- * Section 4.1.8 Table 4-7, it states that for a device-initiated
- * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
- * command for any active transfers" before clearing the RunStop
- * bit.
- */
- dwc3_stop_active_transfers(dwc);
__dwc3_gadget_stop(dwc);

/*
@@ -3334,7 +3355,7 @@ static bool dwc3_gadget_ep_should_continue(struct
dwc3_ep *dep)
struct dwc3 *dwc = dep->dwc;

if (!dep->endpoint.desc || !dwc->pullups_connected ||
- !dwc->connected)
+ !dwc->connected || !dwc->softconnect)
return false;

if (!list_empty(&dep->pending_list))

Thanks
Wesley Cheng


Attachments:
testchanges_trace.txt (165.68 kB)

2022-04-05 03:33:37

by Thinh Nguyen

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] usb: dwc3: Flush pending SETUP data during stop active xfers

Wesley Cheng wrote:
>
>
> On 3/24/2022 6:51 PM, Thinh Nguyen wrote:
>> Wesley Cheng wrote:
>>> Hi Thinh,
>>>
>>> On 3/23/2022 5:39 PM, Thinh Nguyen wrote:
>>>> Hi Wesley,
>>>>
>>>> Wesley Cheng wrote:
>>>>> Hi Thinh,
>>>>>
>>>>> On 3/22/2022 6:31 PM, Thinh Nguyen wrote:
>>>>>> Hi Wesley,
>>>>>>
>>>>>> Wesley Cheng wrote:
>>>>>>> Hi Thinh,
>>>>>>>
>>>>>>> On 2/15/2022 4:08 PM, Wesley Cheng wrote:
>>>>>>>> While running the pullup disable sequence, if there are pending SETUP
>>>>>>>> transfers stored in the controller, then the ENDTRANSFER commands on non
>>>>>>>> control eps will fail w/ ETIMEDOUT. As a suggestion from SNPS, in order
>>>>>>>> to drain potentially cached SETUP packets, SW needs to issue a
>>>>>>>> STARTTRANSFER command. After issuing the STARTTRANSFER, and retrying the
>>>>>>>> ENDTRANSFER, the command should succeed. Else, if the endpoints are not
>>>>>>>> properly stopped, the controller halt sequence will fail as well.
>>>>>>>>
>>>>>>>> One limitation is that the current logic will drop the SETUP data
>>>>>>>> being received (ie dropping the SETUP packet), however, it should be
>>>>>>>> acceptable in the pullup disable case, as the device is eventually
>>>>>>>> going to disconnect from the host.
>>>>>>>>
>>>>>>>> Signed-off-by: Wesley Cheng <[email protected]>
>>>>>>>> ---
>>>>>>> Just wondering if you had any inputs for this particular change? I
>>>>>>> think on the dequeue path discussion you had some concerns with parts of
>>>>>>> this change? I think the difficult part for the pullup disable path is
>>>>>>> that we have this API running w/ interrupts disabled, so the EP0 state
>>>>>>> won't be able to advance compared to the dequeue case.
>>>>>>
>>>>>> This doesn't sound right. The pullup disable (or device initiated
>>>>>> disconnect) should wait for the EP0 state to be EP0_SETUP_PHASE before
>>>>>> proceeding, which it does.
>>>>>>
>>>>> That is correct, but even though that check is passed, it doesn't
>>>>> prevent the host from sending another SETUP token between the pending
>>>>> setup packet check and run/stop clear.
>>>>>
>>>>
>>>> That should be fine, because we would have the TRB ready for that SETUP
>>>> packet.
>>>>
>>> I agree, its valid for the controller to be able to receive the next
>>> setup packet.
>>>
>>>>>>>
>>>>>>> Ideally, if there is a way to ensure that we avoid reading further setup
>>>>>>> packets, that would be nice, but from our discussions with Synopsys,
>>>>>>> this was not possible. (controller is always armed and ready to ACK
>>>>>>> setup tokens)
>>>>>>>
>>>>>>> I did evaluate keeping IRQs enabled and periodically releasing/attaining
>>>>>>> the lock between the stop active xfer calls, but that opened another can
>>>>>>> of worms. If you think this is the approach we should take, I can take
>>>>>>> a look at this implementation further.
>>>>>>>
>>>>>>
>>>>>> This patch doesn't look right to me. The change I suggested before
>>>>>> should address this (I believe Greg already picked it up). What other
>>>>>> problem do you see, I'm not clear what's the problem here. One potential
>>>>>> problem that I can see is that currently dwc3 driver doesn't wait for
>>>>>> active endpoints to complete/end before clearing the run_stop bit on
>>>>>> device initiated disconnect, but I'm not sure if that's what you're seeing.
>>>>>>
>>>>>> Please help clarify further. If there's trace points log, that'd help.
>>>>>>
>>>>> Main difference between the issue Greg recently pulled in and this one
>>>>> is that this is on the pullup disable patch (no dequeue involved). In
>>>>> this situation we will also stop active transfers, so that the
>>>>> controller can halt properly.
>>>>>
>>>>> I attached a few files, and will give a summary of them below:
>>>>> 1. pullup_disable_timeout.txt - ftrace of an instance of when we see
>>>>> several endxfer timeouts. Refer to line#2016.
>>>>>
>>>>> 2. lecroy+ftrace_snip.png - picture showing a matching bus sniffer log
>>>>> and a ftrace collected (difference instance to #1)
>>>>>
>>>>> #2 will show that we completed a SETUP transfer before entering into
>>>>> dwc3_gadget_stop_active_transfers(). In here, we then see DWC ACK
>>>>> another SETUP token, which leads to endxfer timeouts on all subsequent
>>>>> endpoints.
>>>>
>>>> Thanks for the captures. I think the problem here is because the dwc3
>>>> driver disables the control endpoint. It shouldn't do that.
>>>>
>>>> Can you try this:
>>>>
>>>> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
>>>> index ab725d2262d6..f0b9ea353620 100644
>>>> --- a/drivers/usb/dwc3/gadget.c
>>>> +++ b/drivers/usb/dwc3/gadget.c
>>>> @@ -1010,12 +1010,12 @@ static int __dwc3_gadget_ep_disable(struct
>>>> dwc3_ep *dep)
>>>> if (dep->flags & DWC3_EP_STALL)
>>>> __dwc3_gadget_ep_set_halt(dep, 0, false);
>>>>
>>>> - reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
>>>> - reg &= ~DWC3_DALEPENA_EP(dep->number);
>>>> - dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>>>> -
>>>> - /* Clear out the ep descriptors for non-ep0 */
>>>> if (dep->number > 1) {
>>>> + reg = dwc3_readl(dwc->regs, DWC3_DALEPENA);
>>>> + reg &= ~DWC3_DALEPENA_EP(dep->number);
>>>> + dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>>>> +
>>>> + /* Clear out the ep descriptors for non-ep0 */
>>>> dep->endpoint.comp_desc = NULL;
>>>> dep->endpoint.desc = NULL;
>>>> }
>>>>
>>> I was able to reproduce the endxfer timeout w/ the above change. I'm
>>> assuming you didn't want me to include any parts of my change while
>>> testing, right?
>>>
>>
>> Right, please don't add additional code. I can't review knowing what
>> else was changed.
>>
>>> Current sequence in dwc3_gadget_pullup(0) is that we should call
>>> dwc3_stop_active_transfers() before we call __dwc3_gadget_stop().
>>> (gadget stop will call the ep disable for EP[0] and EP[1]) This might
>>> be why the issue would still be occurring.
>>>
>>> The attached ftrace that captures a situation where a forced BUG will
>>> occur on the first instance of the timeout.
>>>
>>> [ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer
>>> Not Ready [0] (Not Active) [Status Phase]
>>> [ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd
>>> 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status:
>>> Successful
>>> [ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End
>>> Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out
>>>
>>> We definitely started the STATUS phase, so host could have read it and
>>> sent the next SETUP packet while we were already in the
>>> dwc3_stop_active_transfers() loop.
>>
>> Yeah, pullup() is another async call that can trigger the same problem.
>>
>>
>>
>> [ 1573.687193134 0x7248836e8] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
>> [ 1573.687263291 0x724883c2b] event (0000c040): ep0out: Transfer Complete (sIL) [Setup Phase]
>> [ 1573.687287145 0x724883df6] dbg_trace_log_ctrl: Clear Interface Feature(Function Suspend)
>>
>> Trace indicates that this is ClearFeature(Interface.FuncSuspend)
>>
>>
>> [ 1573.687372301 0x724884459] dbg_send_ep_cmd: ep0in: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
>> [ 1573.687400634 0x724884679] event (000010c2): ep0in: Transfer Not Ready [0] (Not Active) [Data Phase]
>> [ 1573.687471155 0x724884bc2] event (000090c2): ep0in: Transfer Not Ready [0] (Active) [Data Phase]
>> [ 1573.687495009 0x724884d8d] event (0000c042): ep0in: Transfer Complete (sIL) [Data Phase]
>> [ 1573.687544697 0x724885146] dbg_gadget_giveback: ep0out: req ffffff87a6768800 length 20/20 ZsI ==> 0
>>
>> But there's data stage. Is this non-standard/vendor request?
>> We probably need to fix the tracepoint to properly print vendor
>> requests.
>>
>>
>> [ 1573.687593187 0x7248854e9] dbg_ep_queue: ep5in: req ffffff87a6769000 length 0/8 zsI ==> -115
>> [ 1573.687619645 0x7248856e6] dbg_prepare: ep5in: trb ffffffc05a97d1a0 (E27:D26) buf 00000000ee07f600 size 1x 8 ctrl 00000811 (Hlcs:sC:normal)
>> [ 1573.687660270 0x7248859f2] dbg_send_ep_cmd: ep5in: cmd 'Update Transfer' [b0007] params 00000000 00000000 00000000 --> status: Successful
>> [ 1573.687689437 0x724885c22] event (000020c0): ep0out: Transfer Not Ready [0] (Not Active) [Status Phase]
>> [ 1573.687738707 0x724885fd3] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful
>>
>> It's still in status stage, it never reached Setup phase.
>>
>>
>> [ 1573.698040582 0x7248b6477] dbg_send_ep_cmd: ep1out: cmd 'End Transfer' [20c08] params 00000000 00000000 00000000 --> status: Timed Out
>>
>> That's why it's possible to time out here. The Setup Phase should
>> have been prepared before we can issue End Transfer.
>>
>> I can't see where the pullup() begins as this log is missing register
>> read/write trace events. Looks like the wait_for_completion_timeout()
>> is insufficient. I assume it happens right after previous Setup
>> phase. There's no spin_lock, so the driver can proceed with the
>> new control request before it reaches dwc3_stop_active_transfers().
>>
>> Can you try this. (yes, the code looks ugly right now, but it's just
>> to confirm my suspicion)
>>
>
> Hi Thinh,
>
> So I tried a few things, and will highlight the results below.
>
> #1 Changes recommended below:
> - Ran into a situation where there was still a potential SETUP packet
> being received while looping through the available EPs.
> - Attached the log to this email. Please refer to the notations near
> the end of the trace (testchanges_trace.txt):
>
> //Last SETUP transaction before dwc3_gadget_stop_active_transfers()
> //dwc3_gadet_stop_active_transfers() loop starting below w/ ep1out
> [ 15144.915547414 0x43cfbf36a2] dbg_send_ep_cmd: ep1out: cmd
> 'End Transfer' [20c08] params 00000000 00000000 00000000 --> status:
> Successful
> [ 15144.915585487 0x43cfbf397c] dbg_gadget_giveback: ep1out: req
> ffffff87a5f89700 length 0/16384 zsI ==> -108
> [ 15144.915619966 0x43cfbf3c13] event (0000c040): ep0out:
> Transfer Complete (sIL) [Setup Phase]
> [ 15144.915662153 0x43cfbf3f3d] dbg_trace_log_ctrl: Get String
> Descriptor(Index = 9, Length = 510)
> ...
>

> //Finished 'Get String Descriptor" packet, and able to receive next SETUP

No, it's not finished here and the SETUP is not prepared yet.

[ 15144.916351580 0x43cfbf72f2] event (000020c0): ep0out: Transfer Not Ready [0] (Not Active) [Status Phase]
[ 15144.916419185 0x43cfbf7803] dbg_send_ep_cmd: ep0out: cmd 'Start Transfer' [406] params 00000000 efffa000 00000000 --> status: Successful

dwc3 just started the Status stage. It did not prepare for the next SETUP
phase yet.


> ...
> Transfer' [f0c08] params 00000000 00000000 00000000 --> status: Timed Out

That's why it's timed out.

>
>> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
>> index ab725d2262d6..3b40411556ff 100644
>> --- a/drivers/usb/dwc3/gadget.c
>> +++ b/drivers/usb/dwc3/gadget.c
>> @@ -2506,6 +2506,8 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
>>
>> is_on = !!is_on;
>> dwc->softconnect = is_on;
>> +
>> +retry:
>> /*
>> * Per databook, when we want to stop the gadget, if a control transfer
>> * is still in process, complete it and get the core into setup phase.
>> @@ -2541,6 +2543,24 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
>> return 0;
>> }
>>
>> + if (!is_on) {
>> + spin_lock_irqsave(&dwc->lock, flags);
>> + if (dwc->ep0state != EP0_SETUP_PHASE) {
>> + spin_unlock_irqrestore(&dwc->lock, flags);
>> + goto retry;
>> + }
>> +
>> + /*
>> + * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
>> + * Section 4.1.8 Table 4-7, it states that for a device-initiated
>> + * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
>> + * command for any active transfers" before clearing the RunStop
>> + * bit.
>> + */
>> + dwc3_stop_active_transfers(dwc);

The reason why my suggestion test patch didn't work is because
dwc3_stop_active_transfers() also "gives back" requests in its loop. When
the driver gives back requests, it does spin_unlock() allowing the
driver to service the next request. I was hoping that the driver will
block servicing new control requests until dwc3_stop_active_transfers()
completes.

>> + spin_unlock_irqrestore(&dwc->lock, flags);
>> + }
>> +
>> /*
>> * Synchronize and disable any further event handling while controller
>> * is being enabled/disabled.
>> @@ -2553,14 +2573,6 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
>> u32 count;
>>
>> dwc->connected = false;
>> - /*
>> - * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
>> - * Section 4.1.8 Table 4-7, it states that for a device-initiated
>> - * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
>> - * command for any active transfers" before clearing the RunStop
>> - * bit.
>> - */
>> - dwc3_stop_active_transfers(dwc);
>> __dwc3_gadget_stop(dwc);
>>
>> /*
>>
>
> #2 I built on top of the changes you suggested, and added the ep0state
> check within dwc3_stop_active_transfers(). Utilized the
> DWC3_EP_DELAY_STOP (flag from the dequeue endxfer patch) to have the ep0
> driver issue the endxfer. With the changes below, I haven't seen the
> failure so far. Anyway, this is just the initial version of the changes
> I've made, but I think its a cleaner approach than the changes that were
> submitted in the RFC.
>
> Also, with releasing the lock here, I had to add the changes we had
> previously proposed here as well to prevent a list corruption:
>
> https://urldefense.com/v3/__https://lore.kernel.org/linux-usb/[email protected]/__;!!A4F2R9G_pg!OrdWZ7WUdndzskqgSWKiIV9RRJTgw1rptjjxGBhEeb_VRVNTCkmIQs29ls-DGqjp3OJa$
>
> https://urldefense.com/v3/__https://lore.kernel.org/linux-usb/[email protected]/__;!!A4F2R9G_pg!OrdWZ7WUdndzskqgSWKiIV9RRJTgw1rptjjxGBhEeb_VRVNTCkmIQs29ls-DGr-w0Z-S$
>
>
> diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c
> index 1064be5518f6..aba3491eadf9 100644
> --- a/drivers/usb/dwc3/ep0.c
> +++ b/drivers/usb/dwc3/ep0.c
> @@ -273,8 +273,6 @@ void dwc3_ep0_out_start(struct dwc3 *dwc)
> int ret;
> int i;
>
> - complete(&dwc->ep0_in_setup);
> -
> dep = dwc->eps[0];
> dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 8,
> DWC3_TRBCTL_CONTROL_SETUP, false);
> @@ -291,8 +289,10 @@ void dwc3_ep0_out_start(struct dwc3 *dwc)
> continue;
>
> dwc3_ep->flags &= ~DWC3_EP_DELAY_STOP;
> - dwc3_stop_active_transfer(dwc3_ep, true, true);
> + dwc3_stop_active_transfer(dwc3_ep, true, dwc->softconnect ?
> + true : false);

Do you mean to do this?

dwc3_stop_active_transfer(dwc3_ep, true, !dwc->softconnect)

> }
> + complete(&dwc->ep0_in_setup);
> }
>
> static struct dwc3_ep *dwc3_wIndex_to_dep(struct dwc3 *dwc, __le16
> wIndex_le)
> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
> index b0b225b157b4..86f81e5f66ba 100644
> --- a/drivers/usb/dwc3/gadget.c
> +++ b/drivers/usb/dwc3/gadget.c
> @@ -882,12 +882,13 @@ static int __dwc3_gadget_ep_enable(struct dwc3_ep
> *dep, unsigned int action)
> reg |= DWC3_DALEPENA_EP(dep->number);
> dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
>
> - if (usb_endpoint_xfer_control(desc))
> - goto out;
> -
> /* Initialize the TRB ring */
> dep->trb_dequeue = 0;
> dep->trb_enqueue = 0;
> +
> + if (usb_endpoint_xfer_control(desc))
> + goto out;
> +
> memset(dep->trb_pool, 0,
> sizeof(struct dwc3_trb) * DWC3_TRB_NUM);
>
> @@ -1889,7 +1890,8 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep
> *dep, struct dwc3_request *req)
> {
> struct dwc3 *dwc = dep->dwc;
>
> - if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected) {
> + if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected ||
> + !dwc->softconnect) {
> dev_dbg(dwc->dev, "%s: can't queue to disabled endpoint\n",
> dep->name);
> return -ESHUTDOWN;
> @@ -2348,6 +2350,7 @@ static int dwc3_gadget_set_selfpowered(struct
> usb_gadget *g,
>
> static void dwc3_stop_active_transfers(struct dwc3 *dwc)
> {
> + int ret;
> u32 epnum;
>
> for (epnum = 2; epnum < dwc->num_eps; epnum++) {
> @@ -2357,6 +2360,19 @@ static void dwc3_stop_active_transfers(struct
> dwc3 *dwc)
> if (!dep)
> continue;
>
> + if (dwc->ep0state != EP0_SETUP_PHASE) {
> + dep->flags |= DWC3_EP_DELAY_STOP;
> + spin_unlock(&dwc->lock);
> +
> + reinit_completion(&dwc->ep0_in_setup);
> +
> + ret = wait_for_completion_timeout(&dwc->ep0_in_setup,
> + msecs_to_jiffies(DWC3_PULL_UP_TIMEOUT));
> + if (ret == 0)
> + dev_warn(dwc->dev, "timed out waiting for SETUP phase\n");
> + spin_lock(&dwc->lock);
> + }
> +

This alleviates the problem, but doesn't completely prevent it.

> dwc3_remove_requests(dwc, dep);
> }
> }
> @@ -2547,6 +2563,19 @@ static int dwc3_gadget_pullup(struct usb_gadget
> *g, int is_on)
> return 0;
> }
>
> + if (!is_on) {
> + spin_lock_irqsave(&dwc->lock, flags);
> + /*
> + * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
> + * Section 4.1.8 Table 4-7, it states that for a device-initiated
> + * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
> + * command for any active transfers" before clearing the RunStop
> + * bit.
> + */
> + dwc3_stop_active_transfers(dwc);
> + spin_unlock_irqrestore(&dwc->lock, flags);
> + }
> +
> /*
> * Synchronize and disable any further event handling while controller
> * is being enabled/disabled.
> @@ -2559,14 +2588,6 @@ static int dwc3_gadget_pullup(struct usb_gadget
> *g, int is_on)
> u32 count;
>
> dwc->connected = false;
> - /*
> - * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
> - * Section 4.1.8 Table 4-7, it states that for a device-initiated
> - * disconnect, the SW needs to ensure that it sends "a DEPENDXFER
> - * command for any active transfers" before clearing the RunStop
> - * bit.
> - */
> - dwc3_stop_active_transfers(dwc);

This will still run into the same problem.

> __dwc3_gadget_stop(dwc);
>
> /*
> @@ -3334,7 +3355,7 @@ static bool dwc3_gadget_ep_should_continue(struct
> dwc3_ep *dep)
> struct dwc3 *dwc = dep->dwc;
>
> if (!dep->endpoint.desc || !dwc->pullups_connected ||
> - !dwc->connected)
> + !dwc->connected || !dwc->softconnect)
> return false;
>
> if (!list_empty(&dep->pending_list))
>

Looking at the pullup() function again, it needs some rework on top
of the problem we already have here. I'll try to rewrite it later this
week or the next. If you new updates, I'll be happy to review it.

Thanks,
Thinh