2013-09-02 11:26:45

by Gerd Hoffmann

[permalink] [raw]
Subject: [PATCH 4/5] uas: add dead request list

This patch adds a new list where all requests which are canceled are
added to, so we don't loose them. Then, after killing all inflight
urbs on bus reset (and disconnect) we'll walk over the list and clean
them up.

Without this we can end up with aborted requests lingering around in
case of status pipe transfer errors.

Signed-off-by: Gerd Hoffmann <[email protected]>
---
drivers/usb/storage/uas.c | 69 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 61 insertions(+), 8 deletions(-)

diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index a63972a..9dfb8f9 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c
@@ -78,6 +78,7 @@ struct uas_cmd_info {
struct urb *data_in_urb;
struct urb *data_out_urb;
struct list_head work;
+ struct list_head dead;
};

/* I hate forward declarations, but I actually have a loop */
@@ -87,10 +88,12 @@ static void uas_do_work(struct work_struct *work);
static int uas_try_complete(struct scsi_cmnd *cmnd, const char *caller);
static void uas_configure_endpoints(struct uas_dev_info *devinfo);
static void uas_free_streams(struct uas_dev_info *devinfo);
+static void uas_log_cmd_state(struct scsi_cmnd *cmnd, const char *caller);

static DECLARE_WORK(uas_work, uas_do_work);
static DEFINE_SPINLOCK(uas_lists_lock);
static LIST_HEAD(uas_work_list);
+static LIST_HEAD(uas_dead_list);

static void uas_unlink_data_urbs(struct uas_dev_info *devinfo,
struct uas_cmd_info *cmdinfo)
@@ -167,15 +170,13 @@ static void uas_abort_work(struct uas_dev_info *devinfo)
struct uas_dev_info *di = (void *)cmnd->device->hostdata;

if (di == devinfo) {
+ uas_log_cmd_state(cmnd, __func__);
+ BUG_ON(cmdinfo->state & COMMAND_ABORTED);
cmdinfo->state |= COMMAND_ABORTED;
+ spin_lock_irq(&uas_lists_lock);
+ list_add_tail(&cmdinfo->dead, &uas_dead_list);
+ spin_unlock_irq(&uas_lists_lock);
cmdinfo->state &= ~IS_IN_WORK_LIST;
- if (devinfo->resetting) {
- /* uas_stat_cmplt() will not do that
- * when a device reset is in
- * progress */
- cmdinfo->state &= ~COMMAND_INFLIGHT;
- }
- uas_try_complete(cmnd, __func__);
} else {
/* not our uas device, relink into list */
list_del(&cmdinfo->work);
@@ -187,6 +188,43 @@ static void uas_abort_work(struct uas_dev_info *devinfo)
spin_unlock_irqrestore(&devinfo->lock, flags);
}

+static void uas_zap_dead(struct uas_dev_info *devinfo)
+{
+ struct uas_cmd_info *cmdinfo;
+ struct uas_cmd_info *temp;
+ struct list_head list;
+ unsigned long flags;
+
+ spin_lock_irq(&uas_lists_lock);
+ list_replace_init(&uas_dead_list, &list);
+ spin_unlock_irq(&uas_lists_lock);
+
+ spin_lock_irqsave(&devinfo->lock, flags);
+ list_for_each_entry_safe(cmdinfo, temp, &list, dead) {
+ struct scsi_pointer *scp = (void *)cmdinfo;
+ struct scsi_cmnd *cmnd = container_of(scp,
+ struct scsi_cmnd, SCp);
+ struct uas_dev_info *di = (void *)cmnd->device->hostdata;
+
+ if (di == devinfo) {
+ uas_log_cmd_state(cmnd, __func__);
+ BUG_ON(!(cmdinfo->state & COMMAND_ABORTED));
+ /* all urbs are killed, clear inflight bits */
+ cmdinfo->state &= ~(COMMAND_INFLIGHT |
+ DATA_IN_URB_INFLIGHT |
+ DATA_OUT_URB_INFLIGHT);
+ uas_try_complete(cmnd, __func__);
+ } else {
+ /* not our uas device, relink into list */
+ list_del(&cmdinfo->dead);
+ spin_lock_irq(&uas_lists_lock);
+ list_add_tail(&cmdinfo->dead, &uas_dead_list);
+ spin_unlock_irq(&uas_lists_lock);
+ }
+ }
+ spin_unlock_irqrestore(&devinfo->lock, flags);
+}
+
static void uas_sense(struct urb *urb, struct scsi_cmnd *cmnd)
{
struct sense_iu *sense_iu = urb->transfer_buffer;
@@ -274,6 +312,9 @@ static int uas_try_complete(struct scsi_cmnd *cmnd, const char *caller)
if (cmdinfo->state & COMMAND_ABORTED) {
scmd_printk(KERN_INFO, cmnd, "abort completed\n");
cmnd->result = DID_ABORT << 16;
+ spin_lock_irq(&uas_lists_lock);
+ list_del(&cmdinfo->dead);
+ spin_unlock_irq(&uas_lists_lock);
}
cmnd->scsi_done(cmnd);
return 0;
@@ -307,7 +348,12 @@ static void uas_stat_cmplt(struct urb *urb)
u16 tag;

if (urb->status) {
- dev_err(&urb->dev->dev, "URB BAD STATUS %d\n", urb->status);
+ if (urb->status == -ENOENT) {
+ dev_err(&urb->dev->dev, "stat urb: killed (stream %d\n",
+ urb->stream_id);
+ } else {
+ dev_err(&urb->dev->dev, "stat urb: status %d\n", urb->status);
+ }
usb_free_urb(urb);
return;
}
@@ -762,7 +808,11 @@ static int uas_eh_abort_handler(struct scsi_cmnd *cmnd)

uas_log_cmd_state(cmnd, __func__);
spin_lock_irqsave(&devinfo->lock, flags);
+ BUG_ON(cmdinfo->state & COMMAND_ABORTED);
cmdinfo->state |= COMMAND_ABORTED;
+ spin_lock_irq(&uas_lists_lock);
+ list_add_tail(&cmdinfo->dead, &uas_dead_list);
+ spin_unlock_irq(&uas_lists_lock);
if (cmdinfo->state & IS_IN_WORK_LIST) {
spin_lock(&uas_lists_lock);
list_del(&cmdinfo->work);
@@ -797,11 +847,13 @@ static int uas_eh_bus_reset_handler(struct scsi_cmnd *cmnd)
struct usb_device *udev = devinfo->udev;
int err;

+ shost_printk(KERN_INFO, sdev->host, "%s start\n", __func__);
devinfo->resetting = 1;
uas_abort_work(devinfo);
usb_kill_anchored_urbs(&devinfo->cmd_urbs);
usb_kill_anchored_urbs(&devinfo->sense_urbs);
usb_kill_anchored_urbs(&devinfo->data_urbs);
+ uas_zap_dead(devinfo);
uas_free_streams(devinfo);
err = usb_reset_device(udev);
if (!err) {
@@ -1055,6 +1107,7 @@ static void uas_disconnect(struct usb_interface *intf)
usb_kill_anchored_urbs(&devinfo->cmd_urbs);
usb_kill_anchored_urbs(&devinfo->sense_urbs);
usb_kill_anchored_urbs(&devinfo->data_urbs);
+ uas_zap_dead(devinfo);
scsi_remove_host(shost);
uas_free_streams(devinfo);
kfree(devinfo);
--
1.8.3.1


2013-09-03 17:39:12

by Sarah Sharp

[permalink] [raw]
Subject: Re: [PATCH 4/5] uas: add dead request list

Don't you need to send an ABORT TASK message to the device to cancel the
outstanding request for that stream ID? I don't see that in this code.
I see lots of URB cancellation code, but nothing to remove the request
from the device-side queue.

Does this code currently handle the case where a device refuses to
respond to a SCSI request, and the upper layers attempt to cancel that
request? Or does it simply ensure that SCSI bus reset works?

I'm not comfortable with removing CONFIG_BROKEN until the upper layers
can cancel one outstanding request.

Plus, as Joe mentioned, this code is full of BUG_ON(), which is not
friendly to users, and doesn't increase my confidence that the driver is
ready to have CONFIG_BROKEN removed.

Sarah Sharp

On Mon, Sep 02, 2013 at 01:25:28PM +0200, Gerd Hoffmann wrote:
> This patch adds a new list where all requests which are canceled are
> added to, so we don't loose them. Then, after killing all inflight
> urbs on bus reset (and disconnect) we'll walk over the list and clean
> them up.
>
> Without this we can end up with aborted requests lingering around in
> case of status pipe transfer errors.
>
> Signed-off-by: Gerd Hoffmann <[email protected]>
> ---
> drivers/usb/storage/uas.c | 69 +++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 61 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
> index a63972a..9dfb8f9 100644
> --- a/drivers/usb/storage/uas.c
> +++ b/drivers/usb/storage/uas.c
> @@ -78,6 +78,7 @@ struct uas_cmd_info {
> struct urb *data_in_urb;
> struct urb *data_out_urb;
> struct list_head work;
> + struct list_head dead;
> };
>
> /* I hate forward declarations, but I actually have a loop */
> @@ -87,10 +88,12 @@ static void uas_do_work(struct work_struct *work);
> static int uas_try_complete(struct scsi_cmnd *cmnd, const char *caller);
> static void uas_configure_endpoints(struct uas_dev_info *devinfo);
> static void uas_free_streams(struct uas_dev_info *devinfo);
> +static void uas_log_cmd_state(struct scsi_cmnd *cmnd, const char *caller);
>
> static DECLARE_WORK(uas_work, uas_do_work);
> static DEFINE_SPINLOCK(uas_lists_lock);
> static LIST_HEAD(uas_work_list);
> +static LIST_HEAD(uas_dead_list);
>
> static void uas_unlink_data_urbs(struct uas_dev_info *devinfo,
> struct uas_cmd_info *cmdinfo)
> @@ -167,15 +170,13 @@ static void uas_abort_work(struct uas_dev_info *devinfo)
> struct uas_dev_info *di = (void *)cmnd->device->hostdata;
>
> if (di == devinfo) {
> + uas_log_cmd_state(cmnd, __func__);
> + BUG_ON(cmdinfo->state & COMMAND_ABORTED);
> cmdinfo->state |= COMMAND_ABORTED;
> + spin_lock_irq(&uas_lists_lock);
> + list_add_tail(&cmdinfo->dead, &uas_dead_list);
> + spin_unlock_irq(&uas_lists_lock);
> cmdinfo->state &= ~IS_IN_WORK_LIST;
> - if (devinfo->resetting) {
> - /* uas_stat_cmplt() will not do that
> - * when a device reset is in
> - * progress */
> - cmdinfo->state &= ~COMMAND_INFLIGHT;
> - }
> - uas_try_complete(cmnd, __func__);
> } else {
> /* not our uas device, relink into list */
> list_del(&cmdinfo->work);
> @@ -187,6 +188,43 @@ static void uas_abort_work(struct uas_dev_info *devinfo)
> spin_unlock_irqrestore(&devinfo->lock, flags);
> }
>
> +static void uas_zap_dead(struct uas_dev_info *devinfo)
> +{
> + struct uas_cmd_info *cmdinfo;
> + struct uas_cmd_info *temp;
> + struct list_head list;
> + unsigned long flags;
> +
> + spin_lock_irq(&uas_lists_lock);
> + list_replace_init(&uas_dead_list, &list);
> + spin_unlock_irq(&uas_lists_lock);
> +
> + spin_lock_irqsave(&devinfo->lock, flags);
> + list_for_each_entry_safe(cmdinfo, temp, &list, dead) {
> + struct scsi_pointer *scp = (void *)cmdinfo;
> + struct scsi_cmnd *cmnd = container_of(scp,
> + struct scsi_cmnd, SCp);
> + struct uas_dev_info *di = (void *)cmnd->device->hostdata;
> +
> + if (di == devinfo) {
> + uas_log_cmd_state(cmnd, __func__);
> + BUG_ON(!(cmdinfo->state & COMMAND_ABORTED));
> + /* all urbs are killed, clear inflight bits */
> + cmdinfo->state &= ~(COMMAND_INFLIGHT |
> + DATA_IN_URB_INFLIGHT |
> + DATA_OUT_URB_INFLIGHT);
> + uas_try_complete(cmnd, __func__);
> + } else {
> + /* not our uas device, relink into list */
> + list_del(&cmdinfo->dead);
> + spin_lock_irq(&uas_lists_lock);
> + list_add_tail(&cmdinfo->dead, &uas_dead_list);
> + spin_unlock_irq(&uas_lists_lock);
> + }
> + }
> + spin_unlock_irqrestore(&devinfo->lock, flags);
> +}
> +
> static void uas_sense(struct urb *urb, struct scsi_cmnd *cmnd)
> {
> struct sense_iu *sense_iu = urb->transfer_buffer;
> @@ -274,6 +312,9 @@ static int uas_try_complete(struct scsi_cmnd *cmnd, const char *caller)
> if (cmdinfo->state & COMMAND_ABORTED) {
> scmd_printk(KERN_INFO, cmnd, "abort completed\n");
> cmnd->result = DID_ABORT << 16;
> + spin_lock_irq(&uas_lists_lock);
> + list_del(&cmdinfo->dead);
> + spin_unlock_irq(&uas_lists_lock);
> }
> cmnd->scsi_done(cmnd);
> return 0;
> @@ -307,7 +348,12 @@ static void uas_stat_cmplt(struct urb *urb)
> u16 tag;
>
> if (urb->status) {
> - dev_err(&urb->dev->dev, "URB BAD STATUS %d\n", urb->status);
> + if (urb->status == -ENOENT) {
> + dev_err(&urb->dev->dev, "stat urb: killed (stream %d\n",
> + urb->stream_id);
> + } else {
> + dev_err(&urb->dev->dev, "stat urb: status %d\n", urb->status);
> + }
> usb_free_urb(urb);
> return;
> }
> @@ -762,7 +808,11 @@ static int uas_eh_abort_handler(struct scsi_cmnd *cmnd)
>
> uas_log_cmd_state(cmnd, __func__);
> spin_lock_irqsave(&devinfo->lock, flags);
> + BUG_ON(cmdinfo->state & COMMAND_ABORTED);
> cmdinfo->state |= COMMAND_ABORTED;
> + spin_lock_irq(&uas_lists_lock);
> + list_add_tail(&cmdinfo->dead, &uas_dead_list);
> + spin_unlock_irq(&uas_lists_lock);
> if (cmdinfo->state & IS_IN_WORK_LIST) {
> spin_lock(&uas_lists_lock);
> list_del(&cmdinfo->work);
> @@ -797,11 +847,13 @@ static int uas_eh_bus_reset_handler(struct scsi_cmnd *cmnd)
> struct usb_device *udev = devinfo->udev;
> int err;
>
> + shost_printk(KERN_INFO, sdev->host, "%s start\n", __func__);
> devinfo->resetting = 1;
> uas_abort_work(devinfo);
> usb_kill_anchored_urbs(&devinfo->cmd_urbs);
> usb_kill_anchored_urbs(&devinfo->sense_urbs);
> usb_kill_anchored_urbs(&devinfo->data_urbs);
> + uas_zap_dead(devinfo);
> uas_free_streams(devinfo);
> err = usb_reset_device(udev);
> if (!err) {
> @@ -1055,6 +1107,7 @@ static void uas_disconnect(struct usb_interface *intf)
> usb_kill_anchored_urbs(&devinfo->cmd_urbs);
> usb_kill_anchored_urbs(&devinfo->sense_urbs);
> usb_kill_anchored_urbs(&devinfo->data_urbs);
> + uas_zap_dead(devinfo);
> scsi_remove_host(shost);
> uas_free_streams(devinfo);
> kfree(devinfo);
> --
> 1.8.3.1
>

2013-09-04 07:04:13

by Gerd Hoffmann

[permalink] [raw]
Subject: Re: [PATCH 4/5] uas: add dead request list

On Di, 2013-09-03 at 10:39 -0700, Sarah Sharp wrote:
> Don't you need to send an ABORT TASK message to the device to cancel the
> outstanding request for that stream ID? I don't see that in this code.
> I see lots of URB cancellation code, but nothing to remove the request
> from the device-side queue.

It is there. uas_eh_abort_handler() cancels a single request. There is
also uas_eh_device_reset_handler() which will try a LOGICAL UNIT RESET.

Those might not work though, depending on the failure mode. If your
usb3 streams stop working you can't cancel scsi requests that way.

> Or does it simply ensure that SCSI bus reset works?

The scsi layer invokes the uas_eh_bus_reset_handler() as last resort,
when everything else fails. So, yes, there we'll have the sledge hammer
approach to recover: cancel all usb urbs, abort all requests, full usb
device reset + re-initialization. But we hardly have another chance
when the less invasive methods to cancel a requests didn't work ...

> Plus, as Joe mentioned, this code is full of BUG_ON(), which is not
> friendly to users, and doesn't increase my confidence that the driver is
> ready to have CONFIG_BROKEN removed.

Huh? Why you are thinking BUG_ON() is a indicator for bad code quality?
I'm using BUG_ON() like assert() in userspace, i.e. they are extra
sanity checks which should never ever trigger.

I can switch them to less disruptive WARN_ON() if that is the preferred
way these says.

cheers,
Gerd

2013-09-05 07:26:39

by Oliver Neukum

[permalink] [raw]
Subject: Re: [PATCH 4/5] uas: add dead request list

On Mon, 2013-09-02 at 13:25 +0200, Gerd Hoffmann wrote:
> +static void uas_zap_dead(struct uas_dev_info *devinfo)
> +{
> + struct uas_cmd_info *cmdinfo;
> + struct uas_cmd_info *temp;
> + struct list_head list;
> + unsigned long flags;
> +
> + spin_lock_irq(&uas_lists_lock);
> + list_replace_init(&uas_dead_list, &list);
> + spin_unlock_irq(&uas_lists_lock);

This looks like a window for a race.

> + spin_lock_irqsave(&devinfo->lock, flags);
> + list_for_each_entry_safe(cmdinfo, temp, &list, dead) {
> +

What happens if list entries are on the private list, when
the function is called for another device? It looks to me like
the di==devinfo test could put them back on the list although
they would need to be canceled.

Regards
Oliver