Return-Path: <linux-kernel-owner@vger.kernel.org>
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1752991AbbKNDMs (ORCPT );
	Fri, 13 Nov 2015 22:12:48 -0500
Received: from userp1040.oracle.com ([156.151.31.81]:42416 "EHLO
	userp1040.oracle.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1752959AbbKNDMp (ORCPT );
	Fri, 13 Nov 2015 22:12:45 -0500
From: Bob Liu <bob.liu@oracle.com>
To: xen-devel@lists.xen.org
Cc: linux-kernel@vger.kernel.org, roger.pau@citrix.com,
	konrad.wilk@oracle.com, felipe.franciosi@citrix.com, axboe@fb.com,
	avanzini.arianna@gmail.com, rafal.mielniczuk@citrix.com,
	jonathan.davies@citrix.com, david.vrabel@citrix.com,
	Bob Liu <bob.liu@oracle.com>
Subject: [PATCH v5 03/10] xen/blkfront: pseudo support for multi hardware queues/rings
Date: Sat, 14 Nov 2015 11:12:12 +0800
Message-Id: <1447470739-18136-4-git-send-email-bob.liu@oracle.com>
X-Mailer: git-send-email 1.7.10.4
In-Reply-To: <1447470739-18136-1-git-send-email-bob.liu@oracle.com>
References: <1447470739-18136-1-git-send-email-bob.liu@oracle.com>
X-Source-IP: aserv0021.oracle.com [141.146.126.233]
Sender: linux-kernel-owner@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 18551
Lines: 591

Preparatory patch for multiple hardware queues (rings). The number of
rings is unconditionally set to 1 for now; a larger number will be
enabled in the next patch ("xen/blkfront: negotiate number of
queues/rings to be used with backend"), so as to keep every single
patch small and readable.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
v2:
 * Fix memleak.
 * Address other comments from Konrad.
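For review convenience, here is a condensed sketch of the allocation
scheme this patch introduces (illustration only, not part of the diff
below; error handling and the remaining per-ring fields are elided, the
complete version is in the blkfront_probe() hunk):

	/*
	 * One blkfront_ring_info per hardware queue (ring), allocated
	 * at probe time. nr_rings stays hard-coded to 1 until the
	 * negotiation patch that follows in this series.
	 */
	info->nr_rings = 1;
	info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings,
			      GFP_KERNEL);
	if (!info->rinfo)
		return -ENOMEM;

	for (r_index = 0; r_index < info->nr_rings; r_index++) {
		struct blkfront_ring_info *rinfo = &info->rinfo[r_index];

		INIT_LIST_HEAD(&rinfo->indirect_pages);
		rinfo->dev_info = info;
		INIT_WORK(&rinfo->work, blkif_restart_queue);
	}

Every path that previously reached the single embedded ring through
&info->rinfo now indexes info->rinfo[i]; blkif_free() tears down each
ring via the new blkif_free_ring() helper and then frees the array.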
---
 drivers/block/xen-blkfront.c | 341 ++++++++++++++++++++++++------------------
 1 file changed, 195 insertions(+), 146 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0c3ad21..d73734f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -150,6 +150,7 @@ struct blkfront_info
 	int vdevice;
 	blkif_vdev_t handle;
 	enum blkif_state connected;
+	/* Number of pages per ring buffer. */
 	unsigned int nr_ring_pages;
 	struct request_queue *rq;
 	struct list_head grants;
@@ -164,7 +165,8 @@ struct blkfront_info
 	unsigned int max_indirect_segments;
 	int is_ready;
 	struct blk_mq_tag_set tag_set;
-	struct blkfront_ring_info rinfo;
+	struct blkfront_ring_info *rinfo;
+	unsigned int nr_rings;
 };
 
 static unsigned int nr_minors;
@@ -209,7 +211,7 @@ static DEFINE_SPINLOCK(minor_lock);
 #define GREFS(_psegs)	((_psegs) * GRANTS_PER_PSEG)
 
 static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
-static int blkfront_gather_backend_features(struct blkfront_info *info);
+static void blkfront_gather_backend_features(struct blkfront_info *info);
 
 static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
 {
@@ -338,8 +340,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
 		struct page *indirect_page;
 
 		/* Fetch a pre-allocated page to use for indirect grefs */
-		BUG_ON(list_empty(&info->rinfo.indirect_pages));
-		indirect_page = list_first_entry(&info->rinfo.indirect_pages,
+		BUG_ON(list_empty(&info->rinfo->indirect_pages));
+		indirect_page = list_first_entry(&info->rinfo->indirect_pages,
 						 struct page, lru);
 		list_del(&indirect_page->lru);
 		gnt_list_entry->page = indirect_page;
@@ -597,7 +599,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	 * existing persistent grants, or if we have to get new grants,
 	 * as there are not sufficiently many free.
 	 */
-	bool new_persistent_gnts;
 	struct scatterlist *sg;
 	int num_sg, max_grefs, num_grant;
@@ -609,12 +610,12 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	 */
 	max_grefs += INDIRECT_GREFS(max_grefs);
 
-	/* Check if we have enough grants to allocate a requests */
-	if (info->persistent_gnts_c < max_grefs) {
-		new_persistent_gnts = 1;
-		if (gnttab_alloc_grant_references(
-		    max_grefs - info->persistent_gnts_c,
-		    &setup.gref_head) < 0) {
+	/*
+	 * We have to reserve 'max_grefs' grants at first because persistent
+	 * grants are shared by all rings.
+	 */
+	if (max_grefs > 0)
+		if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
 			gnttab_request_free_callback(
 				&rinfo->callback,
 				blkif_restart_queue_callback,
@@ -622,8 +623,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 				max_grefs);
 			return 1;
 		}
-	} else
-		new_persistent_gnts = 0;
 
 	/* Fill out a communications ring structure. */
 	ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
@@ -712,7 +711,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	/* Keep a private copy so we can reissue requests when recovering. */
 	rinfo->shadow[id].req = *ring_req;
 
-	if (new_persistent_gnts)
+	if (max_grefs > 0)
 		gnttab_free_grant_references(setup.gref_head);
 
 	return 0;
@@ -791,7 +790,8 @@ static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 {
 	struct blkfront_info *info = (struct blkfront_info *)data;
 
-	hctx->driver_data = &info->rinfo;
+	BUG_ON(info->nr_rings <= index);
+	hctx->driver_data = &info->rinfo[index];
 	return 0;
 }
 
@@ -1050,8 +1050,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 
 static void xlvbd_release_gendisk(struct blkfront_info *info)
 {
-	unsigned int minor, nr_minors;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
+	unsigned int minor, nr_minors, i;
 
 	if (info->rq == NULL)
 		return;
@@ -1059,11 +1058,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
 	/* No more blkif_request(). */
 	blk_mq_stop_hw_queues(info->rq);
 
-	/* No more gnttab callback work. */
-	gnttab_cancel_free_callback(&rinfo->callback);
+	for (i = 0; i < info->nr_rings; i++) {
+		struct blkfront_ring_info *rinfo = &info->rinfo[i];
 
-	/* Flush gnttab callback work. Must be done with no locks held. */
-	flush_work(&rinfo->work);
+		/* No more gnttab callback work. */
+		gnttab_cancel_free_callback(&rinfo->callback);
+
+		/* Flush gnttab callback work. Must be done with no locks held. */
+		flush_work(&rinfo->work);
+	}
 
 	del_gendisk(info->gd);
 
@@ -1096,37 +1099,11 @@ static void blkif_restart_queue(struct work_struct *work)
 	spin_unlock_irq(&rinfo->dev_info->io_lock);
 }
 
-static void blkif_free(struct blkfront_info *info, int suspend)
+static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 {
 	struct grant *persistent_gnt;
-	struct grant *n;
+	struct blkfront_info *info = rinfo->dev_info;
 	int i, j, segs;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
-
-	/* Prevent new requests being issued until we fix things up. */
-	spin_lock_irq(&info->io_lock);
-	info->connected = suspend ?
-		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
-	/* No more blkif_request(). */
-	if (info->rq)
-		blk_mq_stop_hw_queues(info->rq);
-
-	/* Remove all persistent grants */
-	if (!list_empty(&info->grants)) {
-		list_for_each_entry_safe(persistent_gnt, n,
-					 &info->grants, node) {
-			list_del(&persistent_gnt->node);
-			if (persistent_gnt->gref != GRANT_INVALID_REF) {
-				gnttab_end_foreign_access(persistent_gnt->gref,
-							  0, 0UL);
-				info->persistent_gnts_c--;
-			}
-			if (info->feature_persistent)
-				__free_page(persistent_gnt->page);
-			kfree(persistent_gnt);
-		}
-	}
-	BUG_ON(info->persistent_gnts_c != 0);
 
 	/*
 	 * Remove indirect pages, this only happens when using indirect
@@ -1186,7 +1163,6 @@ free_shadow:
 
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&rinfo->callback);
-	spin_unlock_irq(&info->io_lock);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
 	flush_work(&rinfo->work);
@@ -1204,7 +1180,43 @@ free_shadow:
 	if (rinfo->irq)
 		unbind_from_irqhandler(rinfo->irq, rinfo);
 	rinfo->evtchn = rinfo->irq = 0;
+}
 
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+	struct grant *persistent_gnt, *n;
+	unsigned int i;
+
+	/* Prevent new requests being issued until we fix things up. */
+	spin_lock_irq(&info->io_lock);
+	info->connected = suspend ?
+		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+	/* No more blkif_request(). */
+	if (info->rq)
+		blk_mq_stop_hw_queues(info->rq);
+
+	/* Remove all persistent grants */
+	if (!list_empty(&info->grants)) {
+		list_for_each_entry_safe(persistent_gnt, n,
+					 &info->grants, node) {
+			list_del(&persistent_gnt->node);
+			if (persistent_gnt->gref != GRANT_INVALID_REF) {
+				gnttab_end_foreign_access(persistent_gnt->gref,
+							  0, 0UL);
+				info->persistent_gnts_c--;
+			}
+			if (info->feature_persistent)
+				__free_page(persistent_gnt->page);
+			kfree(persistent_gnt);
+		}
+	}
+	BUG_ON(info->persistent_gnts_c != 0);
+
+	for (i = 0; i < info->nr_rings; i++)
+		blkif_free_ring(&info->rinfo[i]);
+
+	kfree(info->rinfo);
+	info->nr_rings = 0;
+	spin_unlock_irq(&info->io_lock);
 }
 
 struct copy_from_grant {
@@ -1492,7 +1504,7 @@ static int talk_to_blkback(struct xenbus_device *dev,
 	int err, i;
 	unsigned int max_page_order = 0;
 	unsigned int ring_page_order = 0;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
+	struct blkfront_ring_info *rinfo;
 
 	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
 			   "max-ring-page-order", "%u", &max_page_order);
@@ -1503,10 +1515,13 @@ static int talk_to_blkback(struct xenbus_device *dev,
 		info->nr_ring_pages = 1 << ring_page_order;
 	}
 
-	/* Create shared ring, alloc event channel. */
-	err = setup_blkring(dev, rinfo);
-	if (err)
-		goto out;
+	for (i = 0; i < info->nr_rings; i++) {
+		rinfo = &info->rinfo[i];
+		/* Create shared ring, alloc event channel. */
+		err = setup_blkring(dev, rinfo);
+		if (err)
+			goto destroy_blkring;
+	}
 
 again:
 	err = xenbus_transaction_start(&xbt);
@@ -1515,37 +1530,43 @@ again:
 		goto destroy_blkring;
 	}
 
-	if (info->nr_ring_pages == 1) {
-		err = xenbus_printf(xbt, dev->nodename,
-				    "ring-ref", "%u", rinfo->ring_ref[0]);
-		if (err) {
-			message = "writing ring-ref";
-			goto abort_transaction;
-		}
-	} else {
-		err = xenbus_printf(xbt, dev->nodename,
-				    "ring-page-order", "%u", ring_page_order);
-		if (err) {
-			message = "writing ring-page-order";
-			goto abort_transaction;
-		}
-
-		for (i = 0; i < info->nr_ring_pages; i++) {
-			char ring_ref_name[RINGREF_NAME_LEN];
-
-			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-			err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
-					    "%u", rinfo->ring_ref[i]);
+	if (info->nr_rings == 1) {
+		rinfo = &info->rinfo[0];
+		if (info->nr_ring_pages == 1) {
+			err = xenbus_printf(xbt, dev->nodename,
+					    "ring-ref", "%u", rinfo->ring_ref[0]);
 			if (err) {
 				message = "writing ring-ref";
 				goto abort_transaction;
 			}
+		} else {
+			err = xenbus_printf(xbt, dev->nodename,
+					    "ring-page-order", "%u", ring_page_order);
+			if (err) {
+				message = "writing ring-page-order";
+				goto abort_transaction;
+			}
+
+			for (i = 0; i < info->nr_ring_pages; i++) {
+				char ring_ref_name[RINGREF_NAME_LEN];
+
+				snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+				err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
+						    "%u", rinfo->ring_ref[i]);
+				if (err) {
+					message = "writing ring-ref";
+					goto abort_transaction;
+				}
+			}
 		}
-	}
-	err = xenbus_printf(xbt, dev->nodename,
-			    "event-channel", "%u", rinfo->evtchn);
-	if (err) {
-		message = "writing event-channel";
+		err = xenbus_printf(xbt, dev->nodename,
+				    "event-channel", "%u", rinfo->evtchn);
+		if (err) {
+			message = "writing event-channel";
+			goto abort_transaction;
+		}
+	} else {
+		/* Not supported at this stage. */
 		goto abort_transaction;
 	}
 	err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
@@ -1568,9 +1589,14 @@ again:
 		goto destroy_blkring;
 	}
 
-	for (i = 0; i < BLK_RING_SIZE(info); i++)
-		rinfo->shadow[i].req.u.rw.id = i+1;
-	rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+	for (i = 0; i < info->nr_rings; i++) {
+		int j;
+		rinfo = &info->rinfo[i];
+
+		for (j = 0; j < BLK_RING_SIZE(info); j++)
+			rinfo->shadow[j].req.u.rw.id = j + 1;
+		rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+	}
 	xenbus_switch_state(dev, XenbusStateInitialised);
 
 	return 0;
@@ -1581,7 +1607,7 @@ again:
 	xenbus_dev_fatal(dev, err, "%s", message);
  destroy_blkring:
 	blkif_free(info, 0);
- out:
+
 	return err;
 }
 
@@ -1594,9 +1620,8 @@
 static int blkfront_probe(struct xenbus_device *dev,
 			  const struct xenbus_device_id *id)
 {
-	int err, vdevice;
+	int err, vdevice, r_index;
 	struct blkfront_info *info;
-	struct blkfront_ring_info *rinfo;
 
 	/* FIXME: Use dynamic device id if this is not set. */
 	err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1646,10 +1671,22 @@ static int blkfront_probe(struct xenbus_device *dev,
 		return -ENOMEM;
 	}
 
-	rinfo = &info->rinfo;
-	INIT_LIST_HEAD(&rinfo->indirect_pages);
-	rinfo->dev_info = info;
-	INIT_WORK(&rinfo->work, blkif_restart_queue);
+	info->nr_rings = 1;
+	info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
+	if (!info->rinfo) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
+		kfree(info);
+		return -ENOMEM;
+	}
+
+	for (r_index = 0; r_index < info->nr_rings; r_index++) {
+		struct blkfront_ring_info *rinfo;
+
+		rinfo = &info->rinfo[r_index];
+		INIT_LIST_HEAD(&rinfo->indirect_pages);
+		rinfo->dev_info = info;
+		INIT_WORK(&rinfo->work, blkif_restart_queue);
+	}
 
 	mutex_init(&info->mutex);
 	spin_lock_init(&info->io_lock);
@@ -1681,7 +1718,7 @@ static void split_bio_end(struct bio *bio)
 
 static int blkif_recover(struct blkfront_info *info)
 {
-	int i;
+	int i, r_index;
 	struct request *req, *n;
 	struct blk_shadow *copy;
 	int rc;
@@ -1691,57 +1728,62 @@ static int blkif_recover(struct blkfront_info *info)
 	int pending, size;
 	struct split_bio *split_bio;
 	struct list_head requests;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
-
-	/* Stage 1: Make a safe copy of the shadow state. */
-	copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
-		       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
-	if (!copy)
-		return -ENOMEM;
-
-	/* Stage 2: Set up free list. */
-	memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
-	for (i = 0; i < BLK_RING_SIZE(info); i++)
-		rinfo->shadow[i].req.u.rw.id = i+1;
-	rinfo->shadow_free = rinfo->ring.req_prod_pvt;
-	rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
-
-	rc = blkfront_gather_backend_features(info);
-	if (rc) {
-		kfree(copy);
-		return rc;
-	}
+	blkfront_gather_backend_features(info);
 
 	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
 	blk_queue_max_segments(info->rq, segs);
 	bio_list_init(&bio_list);
 	INIT_LIST_HEAD(&requests);
-	for (i = 0; i < BLK_RING_SIZE(info); i++) {
-		/* Not in use? */
-		if (!copy[i].request)
-			continue;
-
-		/*
-		 * Get the bios in the request so we can re-queue them.
-		 */
-		if (copy[i].request->cmd_flags &
-		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+
+	for (r_index = 0; r_index < info->nr_rings; r_index++) {
+		struct blkfront_ring_info *rinfo;
+
+		rinfo = &info->rinfo[r_index];
+		/* Stage 1: Make a safe copy of the shadow state. */
+		copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
+			       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
+		if (!copy)
+			return -ENOMEM;
+
+		/* Stage 2: Set up free list. */
+		memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
+		for (i = 0; i < BLK_RING_SIZE(info); i++)
+			rinfo->shadow[i].req.u.rw.id = i+1;
+		rinfo->shadow_free = rinfo->ring.req_prod_pvt;
+		rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+
+		rc = blkfront_setup_indirect(rinfo);
+		if (rc) {
+			kfree(copy);
+			return rc;
+		}
+
+		for (i = 0; i < BLK_RING_SIZE(info); i++) {
+			/* Not in use? */
+			if (!copy[i].request)
+				continue;
+
 			/*
-			 * Flush operations don't contain bios, so
-			 * we need to requeue the whole request
+			 * Get the bios in the request so we can re-queue them.
 			 */
-			list_add(&copy[i].request->queuelist, &requests);
-			continue;
+			if (copy[i].request->cmd_flags &
+			    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+				/*
+				 * Flush operations don't contain bios, so
+				 * we need to requeue the whole request
+				 */
+				list_add(&copy[i].request->queuelist, &requests);
+				continue;
+			}
+			merge_bio.head = copy[i].request->bio;
+			merge_bio.tail = copy[i].request->biotail;
+			bio_list_merge(&bio_list, &merge_bio);
+			copy[i].request->bio = NULL;
+			blk_end_request_all(copy[i].request, 0);
 		}
-		merge_bio.head = copy[i].request->bio;
-		merge_bio.tail = copy[i].request->biotail;
-		bio_list_merge(&bio_list, &merge_bio);
-		copy[i].request->bio = NULL;
-		blk_end_request_all(copy[i].request, 0);
-	}
-
-	kfree(copy);
+		kfree(copy);
+	}
 
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	spin_lock_irq(&info->io_lock);
@@ -1749,8 +1791,13 @@ static int blkif_recover(struct blkfront_info *info)
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
 
-	/* Kick any other new requests queued since we resumed */
-	kick_pending_request_queues(rinfo);
+	for (r_index = 0; r_index < info->nr_rings; r_index++) {
+		struct blkfront_ring_info *rinfo;
+
+		rinfo = &info->rinfo[r_index];
+		/* Kick any other new requests queued since we resumed */
+		kick_pending_request_queues(rinfo);
+	}
 
 	list_for_each_entry_safe(req, n, &requests, queuelist) {
 		/* Requeue pending requests (flush or discard) */
@@ -1961,7 +2008,7 @@ out_of_memory:
 /*
  * Gather all backend feature-*
  */
-static int blkfront_gather_backend_features(struct blkfront_info *info)
+static void blkfront_gather_backend_features(struct blkfront_info *info)
 {
 	int err;
 	int barrier, flush, discard, persistent;
@@ -2016,8 +2063,6 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
 	else
 		info->max_indirect_segments = min(indirect_segments,
 						  xen_blkif_max_segments);
-
-	return blkfront_setup_indirect(&info->rinfo);
 }
 
 /*
@@ -2030,8 +2075,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	unsigned long sector_size;
 	unsigned int physical_sector_size;
 	unsigned int binfo;
-	int err;
-	struct blkfront_ring_info *rinfo = &info->rinfo;
+	int err, i;
 
 	switch (info->connected) {
 	case BLKIF_STATE_CONNECTED:
@@ -2088,11 +2132,15 @@ static void blkfront_connect(struct blkfront_info *info)
 	if (err != 1)
 		physical_sector_size = sector_size;
 
-	err = blkfront_gather_backend_features(info);
-	if (err) {
-		xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
-				 info->xbdev->otherend);
-		return;
+	blkfront_gather_backend_features(info);
+	for (i = 0; i < info->nr_rings; i++) {
+		err = blkfront_setup_indirect(&info->rinfo[i]);
+		if (err) {
+			xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+					 info->xbdev->otherend);
+			blkif_free(info, 0);
+			break;
+		}
 	}
 
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
@@ -2108,7 +2156,8 @@ static void blkfront_connect(struct blkfront_info *info)
 	/* Kick pending requests. */
 	spin_lock_irq(&info->io_lock);
 	info->connected = BLKIF_STATE_CONNECTED;
-	kick_pending_request_queues(rinfo);
+	for (i = 0; i < info->nr_rings; i++)
+		kick_pending_request_queues(&info->rinfo[i]);
 	spin_unlock_irq(&info->io_lock);
 
 	add_disk(info->gd);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/