Date: Wed, 11 Feb 2015 11:31:49 -0500
From: Tony Battersby
To: linux-scsi@vger.kernel.org, "James E.J. Bottomley", Christoph Hellwig,
	Jens Axboe, Douglas Gilbert
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH] [SCSI] bsg: fix unkillable I/O wait deadlock with scsi-mq
Message-ID: <54DB83F5.50104@cybernetics.com>

When using the write()/read() interface for submitting commands, the bsg
driver does not call blk_put_request() on a completed SCSI command until
userspace calls read() to get the command completion.  Since scsi-mq uses
a fixed number of preallocated requests, this makes it possible for
userspace to exhaust the entire preallocated supply of requests, leading
to deadlock with the user process stuck in a permanent unkillable I/O
wait in bsg_write() -> ... -> blk_get_request() -> ... -> bt_get().  Note
that this deadlock can happen only if scsi-mq is enabled.

Prevent the deadlock by calling blk_put_request() as soon as the SCSI
command completes instead of waiting for userspace to call read().

Cc: <stable@vger.kernel.org> # 3.17+
Signed-off-by: Tony Battersby
---

For inclusion in kernel 3.20.

Note: I did a number of tests with this patch, but I do not have any
programs to test commands with bidirectional data transfer, so I would
appreciate it if someone could test that.  (A rough sketch of such a
test is included at the end of this mail.)

Description of changes:

*) For ioctl(SG_IO), allocate a struct bsg_command on the stack instead
   of allocating all the component fields on the stack.  This helps
   simplify the code.

*) Split blk_complete_sgv4_hdr_rq() into two functions:
   bsg_complete_command_rq() and bsg_complete_command_usercontext().

*) Call bsg_complete_command_rq() from the asynchronous bsg_rq_end_io()
   or after a synchronous call to blk_execute_rq().

*) The important change to fix the deadlock is that bsg_rq_end_io() now
   calls __blk_put_request().

*) Replace calls to blk_complete_sgv4_hdr_rq() with calls to
   bsg_complete_command_usercontext().

*) Use bc->err to pass around negative error values.

*) If rq->errors < 0, do not use it to set
   device_status/transport_status/driver_status.

*) Check the return value of blk_rq_unmap_user().

*) Remove bc->rq as it is no longer needed.

I will send a separate patch to fix the same problem in the sg driver.
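For reference, a rough sketch of the kind of program that triggers the
deadlock follows.  It is illustrative only and not part of the patch:
the device node and the iteration count are placeholders, and it assumes
a bsg node backed by a scsi-mq host whose tag set is smaller than the
number of commands submitted.

/*
 * bsg-deadlock-sketch.c: submit commands via write() but never read()
 * the completions, so every completed command keeps its preallocated
 * scsi-mq request pinned.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/bsg.h>

int main(void)
{
	unsigned char cdb[6] = { 0 };		/* TEST UNIT READY */
	unsigned char sense[32];
	struct sg_io_v4 hdr;
	int fd, i;

	fd = open("/dev/bsg/0:0:0:0", O_RDWR);	/* placeholder node */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (i = 0; i < 1024; i++) {	/* assumed > tag-set depth */
		memset(&hdr, 0, sizeof(hdr));
		hdr.guard = 'Q';
		hdr.protocol = BSG_PROTOCOL_SCSI;
		hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD;
		hdr.request = (unsigned long)cdb;
		hdr.request_len = sizeof(cdb);
		hdr.response = (unsigned long)sense;
		hdr.max_response_len = sizeof(sense);
		hdr.timeout = 30000;		/* ms */
		/*
		 * Without the fix, this write() eventually blocks in an
		 * unkillable wait in blk_get_request() once all of the
		 * preallocated requests are held by completed commands
		 * that were never read() back.
		 */
		if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
			perror("write");
			break;
		}
	}
	return 0;
}
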
--- linux-3.19.0/block/bsg.c.orig	2015-02-08 21:54:22.000000000 -0500
+++ linux-3.19.0/block/bsg.c	2015-02-11 11:00:57.000000000 -0500
@@ -80,7 +80,6 @@ static struct kmem_cache *bsg_cmd_cachep
 struct bsg_command {
 	struct bsg_device *bd;
 	struct list_head list;
-	struct request *rq;
 	struct bio *bio;
 	struct bio *bidi_bio;
 	int err;
@@ -88,6 +87,10 @@ struct bsg_command {
 	char sense[SCSI_SENSE_BUFFERSIZE];
 };
 
+static void bsg_complete_command_rq(struct bsg_command *bc,
+				    struct request *rq,
+				    bool holding_queue_lock);
+
 static void bsg_free_command(struct bsg_command *bc)
 {
 	struct bsg_device *bd = bc->bd;
@@ -346,6 +349,8 @@ static void bsg_rq_end_io(struct request
 
 	bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
 
+	bsg_complete_command_rq(bc, rq, true);
+
 	spin_lock_irqsave(&bd->lock, flags);
 	list_move_tail(&bc->list, &bd->done_list);
 	bd->done_cmds++;
@@ -366,7 +371,6 @@ static void bsg_add_command(struct bsg_d
 	/*
 	 * add bc command to busy queue and submit rq for io
 	 */
-	bc->rq = rq;
 	bc->bio = rq->bio;
 	if (rq->next_rq)
 		bc->bidi_bio = rq->next_rq->bio;
@@ -426,60 +430,100 @@ static struct bsg_command *bsg_get_done_
 	return bc;
 }
 
-static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
-				    struct bio *bio, struct bio *bidi_bio)
+/*
+ * Post-process a completed request.  This should be called immediately after
+ * the request completes so that its resources can be reused for other
+ * commands.
+ */
+static void bsg_complete_command_rq(struct bsg_command *bc,
+				    struct request *rq,
+				    bool holding_queue_lock)
 {
-	int ret = 0;
-
-	dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors);
+	dprintk("rq %p 0x%x\n", rq, rq->errors);
 	/*
 	 * fill in all the output members
+	 *
+	 * If the request generated a negative error number, return it from
+	 * the system call (e.g. read() or ioctl()); if it's just a protocol
+	 * response (i.e. non negative), include it in the response hdr.
 	 */
-	hdr->device_status = rq->errors & 0xff;
-	hdr->transport_status = host_byte(rq->errors);
-	hdr->driver_status = driver_byte(rq->errors);
-	hdr->info = 0;
-	if (hdr->device_status || hdr->transport_status || hdr->driver_status)
-		hdr->info |= SG_INFO_CHECK;
-	hdr->response_len = 0;
-
-	if (rq->sense_len && hdr->response) {
-		int len = min_t(unsigned int, hdr->max_response_len,
-					rq->sense_len);
-
-		ret = copy_to_user((void __user *)(unsigned long)hdr->response,
-				   rq->sense, len);
-		if (!ret)
-			hdr->response_len = len;
-		else
-			ret = -EFAULT;
-	}
+	if (unlikely(rq->errors < 0)) {
+		bc->err = rq->errors;
+		bc->hdr.device_status = 0;
+		bc->hdr.transport_status = DID_ERROR;
+		bc->hdr.driver_status = DRIVER_ERROR;
+	} else {
+		bc->hdr.device_status = rq->errors & 0xff;
+		bc->hdr.transport_status = host_byte(rq->errors);
+		bc->hdr.driver_status = driver_byte(rq->errors);
+	}
+	bc->hdr.info = 0;
+	if (bc->hdr.device_status || bc->hdr.transport_status ||
+	    bc->hdr.driver_status)
+		bc->hdr.info |= SG_INFO_CHECK;
+
+	bc->hdr.response_len =
+		(!bc->hdr.response) ? 0 : min_t(unsigned int,
+						bc->hdr.max_response_len,
+						rq->sense_len);
 
 	if (rq->next_rq) {
-		hdr->dout_resid = rq->resid_len;
-		hdr->din_resid = rq->next_rq->resid_len;
-		blk_rq_unmap_user(bidi_bio);
-		blk_put_request(rq->next_rq);
+		bc->hdr.dout_resid = rq->resid_len;
+		bc->hdr.din_resid = rq->next_rq->resid_len;
+		if (holding_queue_lock)
+			__blk_put_request(rq->q, rq->next_rq);
+		else
+			blk_put_request(rq->next_rq);
 	} else if (rq_data_dir(rq) == READ)
-		hdr->din_resid = rq->resid_len;
+		bc->hdr.din_resid = rq->resid_len;
 	else
-		hdr->dout_resid = rq->resid_len;
+		bc->hdr.dout_resid = rq->resid_len;
 
 	/*
-	 * If the request generated a negative error number, return it
-	 * (providing we aren't already returning an error); if it's
-	 * just a protocol response (i.e. non negative), that gets
-	 * processed above.
+	 * Free the request as soon as it is complete so that its resources
+	 * can be reused without waiting for userspace to read() the
+	 * result.  But keep the associated bios (if any) around until
+	 * bsg_complete_command_usercontext() can be called from user context.
	 */
-	if (!ret && rq->errors < 0)
-		ret = rq->errors;
-
-	blk_rq_unmap_user(bio);
 	if (rq->cmd != rq->__cmd)
 		kfree(rq->cmd);
-	blk_put_request(rq);
+	if (holding_queue_lock)
+		__blk_put_request(rq->q, rq);
+	else
+		blk_put_request(rq);
+}
 
-	return ret;
+/*
+ * Post-process a completed request from user context.
+ */
+static int bsg_complete_command_usercontext(struct bsg_command *bc)
+{
+	int ret;
+
+	dprintk("bio %p\n", bc->bio);
+
+	if (bc->hdr.response_len) {
+		ret = copy_to_user(
+			(void __user *)(unsigned long)bc->hdr.response,
+			bc->sense, bc->hdr.response_len);
+		if (unlikely(ret)) {
+			bc->hdr.response_len = 0;
+			bc->err = -EFAULT;
+		}
+	}
+
+	if (bc->bidi_bio) {
+		ret = blk_rq_unmap_user(bc->bidi_bio);
+		if (unlikely(ret))
+			bc->err = ret;
+	}
+	if (bc->bio) {
+		ret = blk_rq_unmap_user(bc->bio);
+		if (unlikely(ret))
+			bc->err = ret;
+	}
+
+	return bc->err;
 }
 
 static int bsg_complete_all_commands(struct bsg_device *bd)
@@ -520,8 +564,7 @@ static int bsg_complete_all_commands(str
 		if (IS_ERR(bc))
 			break;
 
-		tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
-						bc->bidi_bio);
+		tret = bsg_complete_command_usercontext(bc);
 		if (!ret)
 			ret = tret;
 
@@ -555,8 +598,7 @@ __bsg_read(char __user *buf, size_t coun
 		 * after completing the request. so do that here,
 		 * bsg_complete_work() cannot do that for us
 		 */
-		ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
-					       bc->bidi_bio);
+		ret = bsg_complete_command_usercontext(bc);
 
 		if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr)))
 			ret = -EFAULT;
@@ -926,28 +968,29 @@ static long bsg_ioctl(struct file *file,
 		return scsi_cmd_ioctl(bd->queue, NULL, file->f_mode, cmd, uarg);
 	}
 	case SG_IO: {
+		struct bsg_command bc;
 		struct request *rq;
-		struct bio *bio, *bidi_bio = NULL;
-		struct sg_io_v4 hdr;
 		int at_head;
-		u8 sense[SCSI_SENSE_BUFFERSIZE];
 
-		if (copy_from_user(&hdr, uarg, sizeof(hdr)))
+		if (copy_from_user(&bc.hdr, uarg, sizeof(bc.hdr)))
 			return -EFAULT;
 
-		rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE, sense);
+		rq = bsg_map_hdr(bd, &bc.hdr, file->f_mode & FMODE_WRITE,
+				 bc.sense);
 		if (IS_ERR(rq))
 			return PTR_ERR(rq);
 
-		bio = rq->bio;
-		if (rq->next_rq)
-			bidi_bio = rq->next_rq->bio;
+		bc.bd = bd;
+		bc.bio = rq->bio;
+		bc.bidi_bio = (rq->next_rq) ? rq->next_rq->bio : NULL;
+		bc.err = 0;
 
-		at_head = (0 == (hdr.flags & BSG_FLAG_Q_AT_TAIL));
+		at_head = (0 == (bc.hdr.flags & BSG_FLAG_Q_AT_TAIL));
 		blk_execute_rq(bd->queue, NULL, rq, at_head);
-		ret = blk_complete_sgv4_hdr_rq(rq, &hdr, bio, bidi_bio);
+		bsg_complete_command_rq(&bc, rq, false);
+		ret = bsg_complete_command_usercontext(&bc);
 
-		if (copy_to_user(uarg, &hdr, sizeof(hdr)))
+		if (copy_to_user(uarg, &bc.hdr, sizeof(bc.hdr)))
 			return -EFAULT;
 
 		return ret;
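
P.S. for anyone who can help with the bidirectional testing mentioned
above: an untested sketch of what such a test could look like follows.
The device node, buffer sizes, and the CDB are placeholders (the CDB
must be a command the target actually implements bidirectionally, e.g.
a fully-filled-in XDWRITEREAD(10) on a disk that supports it), and the
request queue must support bidi commands or bsg will fail the setup.
bsg builds the bidi next_rq whenever both dout_* and din_* are set.

/* bidi-sgio-sketch.c: one bidirectional command via ioctl(SG_IO). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/bsg.h>
#include <scsi/sg.h>			/* SG_IO */

int main(void)
{
	unsigned char cdb[10] = { 0x53 };	/* placeholder: XDWRITEREAD(10) */
	unsigned char dout[512], din[512], sense[32];
	struct sg_io_v4 hdr;
	int fd;

	fd = open("/dev/bsg/0:0:0:0", O_RDWR);	/* placeholder node */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(dout, 0xaa, sizeof(dout));
	memset(&hdr, 0, sizeof(hdr));
	hdr.guard = 'Q';
	hdr.protocol = BSG_PROTOCOL_SCSI;
	hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD;
	hdr.request = (unsigned long)cdb;
	hdr.request_len = sizeof(cdb);
	hdr.response = (unsigned long)sense;
	hdr.max_response_len = sizeof(sense);
	/*
	 * Setting both directions makes bsg allocate the bidi next_rq,
	 * which exercises the bidi completion paths this patch touches.
	 */
	hdr.dout_xferp = (unsigned long)dout;
	hdr.dout_xfer_len = sizeof(dout);
	hdr.din_xferp = (unsigned long)din;
	hdr.din_xfer_len = sizeof(din);
	hdr.timeout = 30000;		/* ms */

	if (ioctl(fd, SG_IO, &hdr) < 0)
		perror("SG_IO");
	else
		printf("device_status 0x%x din_resid %d dout_resid %d\n",
		       hdr.device_status, hdr.din_resid, hdr.dout_resid);
	return 0;
}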