The right place for talk_to_blkback() to query backend features and transport
parameters is after the backend has entered XenbusStateInitWait. There is no problem
with this yet, but it is a violation of the design, and furthermore it would not
allow the frontend/backend to negotiate the 'multi-page' and 'multi-queue' features
which require this.
This patch moves talk_to_blkback() to blkback_changed(), after the backend has entered
XenbusStateInitWait, just as blkif.h defines:
See: xen/include/public/io/blkif.h
Front Back
================================= =====================================
XenbusStateInitialising XenbusStateInitialising
o Query virtual device o Query backend device identification
properties. data.
o Setup OS device instance. o Open and validate backend device.
o Publish backend features and
transport parameters.
|
|
V
XenbusStateInitWait
o Query backend features and
transport parameters.
o Allocate and initialize the
request ring.
o Publish transport parameters
that will be in effect during
this connection.
|
|
V
XenbusStateInitialised
o Query frontend transport parameters.
o Connect to the request ring and
event channel.
o Publish backend device properties.
|
|
V
XenbusStateConnected
o Query backend device properties.
o Finalize OS virtual device
instance.
|
|
V
XenbusStateConnected
Signed-off-by: Bob Liu <[email protected]>
---
drivers/block/xen-blkfront.c | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2c61cf8..88e23fd 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1430,13 +1430,6 @@ static int blkfront_probe(struct xenbus_device *dev,
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
dev_set_drvdata(&dev->dev, info);
- err = talk_to_blkback(dev, info);
- if (err) {
- kfree(info);
- dev_set_drvdata(&dev->dev, NULL);
- return err;
- }
-
return 0;
}
@@ -1906,8 +1899,13 @@ static void blkback_changed(struct xenbus_device *dev,
dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
switch (backend_state) {
- case XenbusStateInitialising:
case XenbusStateInitWait:
+ if (talk_to_blkback(dev, info)) {
+ kfree(info);
+ dev_set_drvdata(&dev->dev, NULL);
+ break;
+ }
+ case XenbusStateInitialising:
case XenbusStateInitialised:
case XenbusStateReconfiguring:
case XenbusStateReconfigured:
--
1.8.3.1
Extend xen/block to support multi-page ring, so that more requests can be issued
by using more than one page as the request ring between blkfront and backend.
As a result, performance improves significantly.
We got some impressive improvements on our high-end iSCSI storage cluster backend.
When using 64 pages as the ring, the IOPS increased about 15 times for the
throughput testing and more than doubled for the latency testing.
The reason is that the limit on outstanding requests is 32 when using only a one-page
ring, but in our case the iSCSI LUN was spread across about 100 physical drives,
and 32 was really not enough to keep them busy.
Changes in v2:
- Rebased to 4.0-rc6
- Added description on how this protocol works into io/blkif.h
Changes in v3:
- Follow the protocol defined in io/blkif.h on XEN tree
Signed-off-by: Bob Liu <[email protected]>
---
drivers/block/xen-blkback/blkback.c | 14 ++++-
drivers/block/xen-blkback/common.h | 4 +-
drivers/block/xen-blkback/xenbus.c | 83 ++++++++++++++++++++++-------
drivers/block/xen-blkfront.c | 102 +++++++++++++++++++++++++++---------
4 files changed, 156 insertions(+), 47 deletions(-)
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 713fc9f..f191083 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -84,6 +84,12 @@ MODULE_PARM_DESC(max_persistent_grants,
"Maximum number of grants to map persistently");
/*
+ * Maximum number of pages to be used as the ring between front and backend
+ */
+unsigned int xen_blkif_max_ring_pages = XENBUS_MAX_RING_PAGES;
+module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
+/*
* The LRU mechanism to clean the lists of persistent grants needs to
* be executed periodically. The time interval between consecutive executions
* of the purge mechanism is set in ms.
@@ -630,7 +636,7 @@ purge_gnt_list:
}
/* Shrink if we have more than xen_blkif_max_buffer_pages */
- shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
+ shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages * blkif->nr_ring_pages);
if (log_stats && time_after(jiffies, blkif->st_print))
print_stats(blkif);
@@ -1435,6 +1441,12 @@ static int __init xen_blkif_init(void)
{
int rc = 0;
+ if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES) {
+ pr_info("Invalid max_ring_pages (%d), will use default max: %d.\n",
+ xen_blkif_max_ring_pages, XENBUS_MAX_RING_PAGES);
+ xen_blkif_max_ring_pages = XENBUS_MAX_RING_PAGES;
+ }
+
if (!xen_domain())
return -ENODEV;
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index f620b5d..84a964c 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -44,6 +44,7 @@
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>
+extern unsigned int xen_blkif_max_ring_pages;
/*
* This is the maximum number of segments that would be allowed in indirect
* requests. This value will also be passed to the frontend.
@@ -248,7 +249,7 @@ struct backend_info;
#define PERSISTENT_GNT_WAS_ACTIVE 1
/* Number of requests that we can fit in a ring */
-#define XEN_BLKIF_REQS 32
+#define XEN_BLKIF_REQS (32 * XENBUS_MAX_RING_PAGES)
struct persistent_gnt {
struct page *page;
@@ -320,6 +321,7 @@ struct xen_blkif {
struct work_struct free_work;
/* Thread shutdown wait queue. */
wait_queue_head_t shutdown_wq;
+ int nr_ring_pages;
};
struct seg_buf {
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 6ab69ad..909babd 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -198,8 +198,8 @@ fail:
return ERR_PTR(-ENOMEM);
}
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
- unsigned int evtchn)
+static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+ unsigned int nr_grefs, unsigned int evtchn)
{
int err;
@@ -207,7 +207,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
if (blkif->irq)
return 0;
- err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
+ err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
&blkif->blk_ring);
if (err < 0)
return err;
@@ -217,21 +217,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
{
struct blkif_sring *sring;
sring = (struct blkif_sring *)blkif->blk_ring;
- BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
break;
}
case BLKIF_PROTOCOL_X86_32:
{
struct blkif_x86_32_sring *sring_x86_32;
sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
- BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
break;
}
case BLKIF_PROTOCOL_X86_64:
{
struct blkif_x86_64_sring *sring_x86_64;
sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
- BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
break;
}
default:
@@ -597,6 +597,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
if (err)
goto fail;
+ err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-pages", "%u",
+ xen_blkif_max_ring_pages);
+ if (err)
+ pr_warn("%s write out 'max-ring-pages' failed\n", __func__);
+
err = xenbus_switch_state(dev, XenbusStateInitWait);
if (err)
goto fail;
@@ -860,22 +865,62 @@ again:
static int connect_ring(struct backend_info *be)
{
struct xenbus_device *dev = be->dev;
- unsigned long ring_ref;
- unsigned int evtchn;
+ unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
+ unsigned int evtchn, nr_grefs;
unsigned int pers_grants;
char protocol[64] = "";
int err;
pr_debug("%s %s\n", __func__, dev->otherend);
- err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
- &ring_ref, "event-channel", "%u", &evtchn, NULL);
- if (err) {
- xenbus_dev_fatal(dev, err,
- "reading %s/ring-ref and event-channel",
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+ &evtchn);
+ if (err != 1) {
+ err = -EINVAL;
+ xenbus_dev_fatal(dev, err, "reading %s/event-channel",
dev->otherend);
return err;
}
+ pr_info("event-channel %u\n", evtchn);
+
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "num-ring-pages", "%u",
+ &nr_grefs);
+ if (err != 1) {
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
+ "%u", &ring_ref[0]);
+ if (err != 1) {
+ err = -EINVAL;
+ xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
+ dev->otherend);
+ return err;
+ }
+ nr_grefs = 1;
+ pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
+ ring_ref[0]);
+ } else {
+ unsigned int i;
+
+ if (nr_grefs > xen_blkif_max_ring_pages) {
+ err = -EINVAL;
+ xenbus_dev_fatal(dev, err, "%s/request %d ring pages exceed max:%d",
+ dev->otherend, nr_grefs, xen_blkif_max_ring_pages);
+ return err;
+ }
+ for (i = 0; i < nr_grefs; i++) {
+ char ring_ref_name[BLKBACK_NAME_LEN];
+
+ snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
+ err = xenbus_scanf(XBT_NIL, dev->otherend,
+ ring_ref_name, "%u", &ring_ref[i]);
+ if (err != 1) {
+ err = -EINVAL;
+ xenbus_dev_fatal(dev, err, "reading %s/%s",
+ dev->otherend, ring_ref_name);
+ return err;
+ }
+ pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
+ }
+ }
be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
@@ -900,16 +945,16 @@ static int connect_ring(struct backend_info *be)
be->blkif->vbd.feature_gnt_persistent = pers_grants;
be->blkif->vbd.overflow_max_grants = 0;
+ be->blkif->nr_ring_pages = nr_grefs;
- pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
- ring_ref, evtchn, be->blkif->blk_protocol, protocol,
- pers_grants ? "persistent grants" : "");
+ pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
+ nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
+ pers_grants ? "persistent grants" : "");
/* Map the shared frame, irq etc. */
- err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+ err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
if (err) {
- xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
- ring_ref, evtchn);
+ xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
return err;
}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 88e23fd..c4b427f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -98,7 +98,17 @@ static unsigned int xen_blkif_max_segments = 32;
module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+static unsigned int xen_blkif_max_ring_pages = 1;
+module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
+
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages)
+#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
+/*
+ * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 characters
+ * are enough. Define to 20 to keep consist with backend.
+ */
+#define RINGREF_NAME_LEN (20)
/*
* We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -114,13 +124,14 @@ struct blkfront_info
int vdevice;
blkif_vdev_t handle;
enum blkif_state connected;
- int ring_ref;
+ int ring_ref[XENBUS_MAX_RING_PAGES];
+ unsigned int nr_ring_pages;
struct blkif_front_ring ring;
unsigned int evtchn, irq;
struct request_queue *rq;
struct work_struct work;
struct gnttab_free_callback callback;
- struct blk_shadow shadow[BLK_RING_SIZE];
+ struct blk_shadow shadow[BLK_MAX_RING_SIZE];
struct list_head grants;
struct list_head indirect_pages;
unsigned int persistent_gnts_c;
@@ -139,8 +150,6 @@ static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);
-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
- (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
#define GRANT_INVALID_REF 0
#define PARTS_PER_DISK 16
@@ -1033,12 +1042,15 @@ free_shadow:
flush_work(&info->work);
/* Free resources associated with old device channel. */
- if (info->ring_ref != GRANT_INVALID_REF) {
- gnttab_end_foreign_access(info->ring_ref, 0,
- (unsigned long)info->ring.sring);
- info->ring_ref = GRANT_INVALID_REF;
- info->ring.sring = NULL;
+ for (i = 0; i < info->nr_ring_pages; i++) {
+ if (info->ring_ref[i] != GRANT_INVALID_REF) {
+ gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
+ info->ring_ref[i] = GRANT_INVALID_REF;
+ }
}
+ free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+ info->ring.sring = NULL;
+
if (info->irq)
unbind_from_irqhandler(info->irq, info);
info->evtchn = info->irq = 0;
@@ -1245,26 +1257,30 @@ static int setup_blkring(struct xenbus_device *dev,
struct blkfront_info *info)
{
struct blkif_sring *sring;
- grant_ref_t gref;
- int err;
+ int err, i;
+ unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
+ grant_ref_t gref[XENBUS_MAX_RING_PAGES];
- info->ring_ref = GRANT_INVALID_REF;
+ for (i = 0; i < info->nr_ring_pages; i++)
+ info->ring_ref[i] = GRANT_INVALID_REF;
- sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+ sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+ get_order(ring_size));
if (!sring) {
xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
return -ENOMEM;
}
SHARED_RING_INIT(sring);
- FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+ FRONT_RING_INIT(&info->ring, sring, ring_size);
- err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
+ err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
if (err < 0) {
free_page((unsigned long)sring);
info->ring.sring = NULL;
goto fail;
}
- info->ring_ref = gref;
+ for (i = 0; i < info->nr_ring_pages; i++)
+ info->ring_ref[i] = gref[i];
err = xenbus_alloc_evtchn(dev, &info->evtchn);
if (err)
@@ -1292,7 +1308,15 @@ static int talk_to_blkback(struct xenbus_device *dev,
{
const char *message = NULL;
struct xenbus_transaction xbt;
- int err;
+ int err, i;
+ unsigned int max_pages = 0;
+
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "max-ring-pages", "%u", &max_pages);
+ if (err != 1)
+ info->nr_ring_pages = 1;
+ else
+ info->nr_ring_pages = min(xen_blkif_max_ring_pages, max_pages);
/* Create shared ring, alloc event channel. */
err = setup_blkring(dev, info);
@@ -1306,11 +1330,31 @@ again:
goto destroy_blkring;
}
- err = xenbus_printf(xbt, dev->nodename,
- "ring-ref", "%u", info->ring_ref);
- if (err) {
- message = "writing ring-ref";
- goto abort_transaction;
+ if (info->nr_ring_pages == 1) {
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-ref", "%u", info->ring_ref[0]);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ } else {
+ err = xenbus_printf(xbt, dev->nodename,
+ "num-ring-pages", "%u", info->nr_ring_pages);
+ if (err) {
+ message = "writing num-ring-pages";
+ goto abort_transaction;
+ }
+
+ for (i = 0; i < info->nr_ring_pages; i++) {
+ char ring_ref_name[RINGREF_NAME_LEN];
+ snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
+ err = xenbus_printf(xbt, dev->nodename,
+ ring_ref_name, "%u", info->ring_ref[i]);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ }
}
err = xenbus_printf(xbt, dev->nodename,
"event-channel", "%u", info->evtchn);
@@ -1338,6 +1382,7 @@ again:
goto destroy_blkring;
}
+ info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
xenbus_switch_state(dev, XenbusStateInitialised);
return 0;
@@ -1422,9 +1467,8 @@ static int blkfront_probe(struct xenbus_device *dev,
info->connected = BLKIF_STATE_DISCONNECTED;
INIT_WORK(&info->work, blkif_restart_queue);
- for (i = 0; i < BLK_RING_SIZE; i++)
+ for (i = 0; i < BLK_MAX_RING_SIZE; i++)
info->shadow[i].req.u.rw.id = i+1;
- info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
/* Front end dir is a number, which is used as the id. */
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -1469,7 +1513,7 @@ static int blkif_recover(struct blkfront_info *info)
/* Stage 2: Set up free list. */
memset(&info->shadow, 0, sizeof(info->shadow));
- for (i = 0; i < BLK_RING_SIZE; i++)
+ for (i = 0; i < BLK_MAX_RING_SIZE; i++)
info->shadow[i].req.u.rw.id = i+1;
info->shadow_free = info->ring.req_prod_pvt;
info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
@@ -2086,6 +2130,12 @@ static int __init xlblk_init(void)
{
int ret;
+ if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES) {
+ pr_info("Invalid max_ring_pages (%d), will use default max: %d.\n",
+ xen_blkif_max_ring_pages, XENBUS_MAX_RING_PAGES);
+ xen_blkif_max_ring_pages = 1;
+ }
+
if (!xen_domain())
return -ENODEV;
--
1.8.3.1
El 12/05/15 a les 13.01, Bob Liu ha escrit:
> The right place for talk_to_blkback() to query backend features and transport
> parameters is after backend entered XenbusStateInitWait. There is no problem
talk_to_blkback doesn't gather any backend features, it just publishes
the features supported by the frontend, which AFAICT can be done at any
time provided that it's before switching to state XenbusStateInitWait.
Blkfront doesn't have to wait for the backend to switch to state
XenbusStateInitWait before publishing the features supported by the
frontend, which is what talk_to_blkback does.
Roger.
On 05/15/2015 06:01 PM, Roger Pau Monné wrote:
> El 12/05/15 a les 13.01, Bob Liu ha escrit:
>> The right place for talk_to_blkback() to query backend features and transport
>> parameters is after backend entered XenbusStateInitWait. There is no problem
>
> talk_to_blkback doesn't gather any backend features, it just publishes
> the features supported by the frontend, which AFAICT can be done at any
1) But talk_to_blkback() will also allocate and initialize the request ring, which
should be done after the backend has entered XenbusStateInitWait.
Please see the protocol defined in xen/include/public/io/blkif.h:
*****************************************************************************
* Startup *
*****************************************************************************
*
* Tool stack creates front and back nodes with state XenbusStateInitialising.
*
* Front Back
* ================================= =====================================
* XenbusStateInitialising XenbusStateInitialising
* o Query virtual device o Query backend device identification
* properties. data.
* o Setup OS device instance. o Open and validate backend device.
* o Publish backend features and
* transport parameters.
* |
* |
* V
* XenbusStateInitWait
*
* o Query backend features and
* transport parameters.
* o Allocate and initialize the
* request ring.
2) Another problem is that after the 'multi-page' ring feature is introduced, we have to know the max
ring pages supported by the backend in setup_blkring().
If the backend hasn't entered XenbusStateInitWait, we may not query the right value. E.g.
Frontend Backend
in .probe:
talk_to_blkback()
> setup_blkring()
> xenbus_scanf(max_ring_pages)
in .probe:
xenbus_printf(max_ring_pages)
^^^^ Too late to write the real value
xenbus_switch_state(dev, XenbusStateInitWait)
Thank you for reviewing these patches!
Regards,
-Bob
> time provided that it's before switching to state XenbusStateInitWait.
> Blkfront doesn't have to wait for the backend to switch to state
> XenbusStateInitWait before publishing the features supported by the
> frontend, which is what talk_to_blkback does.
>
> Roger.
>
El 12/05/15 a les 13.01, Bob Liu ha escrit:
> Extend xen/block to support multi-page ring, so that more requests can be issued
> by using more than one pages as the request ring between blkfront and backend.
> As a result, the performance can get improved significantly.
^ s/can get improved/improves/
>
> We got some impressive improvements on our highend iscsi storage cluster backend.
> If using 64 pages as the ring, the IOPS increased about 15 times for the
> throughput testing and above doubled for the latency testing.
>
> The reason was the limit on outstanding requests is 32 if use only one-page
> ring, but in our case the iscsi lun was spread across about 100 physical drives,
> 32 was really not enough to keep them busy.
>
> Changes in v2:
> - Rebased to 4.0-rc6
> - Added description on how this protocol works into io/blkif.h
I don't see any changes to io/blkif.h in this patch, is something missing?
Also you use XENBUS_MAX_RING_PAGES which AFAICT it's not defined anywhere.
>
> Changes in v3:
> - Follow the protocol defined in io/blkif.h on XEN tree
>
> Signed-off-by: Bob Liu <[email protected]>
> ---
> drivers/block/xen-blkback/blkback.c | 14 ++++-
> drivers/block/xen-blkback/common.h | 4 +-
> drivers/block/xen-blkback/xenbus.c | 83 ++++++++++++++++++++++-------
> drivers/block/xen-blkfront.c | 102 +++++++++++++++++++++++++++---------
> 4 files changed, 156 insertions(+), 47 deletions(-)
>
> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
> index 713fc9f..f191083 100644
> --- a/drivers/block/xen-blkback/blkback.c
> +++ b/drivers/block/xen-blkback/blkback.c
> @@ -84,6 +84,12 @@ MODULE_PARM_DESC(max_persistent_grants,
> "Maximum number of grants to map persistently");
>
> /*
> + * Maximum number of pages to be used as the ring between front and backend
> + */
> +unsigned int xen_blkif_max_ring_pages = XENBUS_MAX_RING_PAGES;
> +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, S_IRUGO);
> +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
> +/*
> * The LRU mechanism to clean the lists of persistent grants needs to
> * be executed periodically. The time interval between consecutive executions
> * of the purge mechanism is set in ms.
> @@ -630,7 +636,7 @@ purge_gnt_list:
> }
>
> /* Shrink if we have more than xen_blkif_max_buffer_pages */
> - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
> + shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages * blkif->nr_ring_pages);
You are greatly increasing the buffer of free (ballooned) pages.
Possibly making it 32 times bigger than it used to be, is this really
needed?
>
> if (log_stats && time_after(jiffies, blkif->st_print))
> print_stats(blkif);
> @@ -1435,6 +1441,12 @@ static int __init xen_blkif_init(void)
> {
> int rc = 0;
>
> + if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES) {
> + pr_info("Invalid max_ring_pages (%d), will use default max: %d.\n",
> + xen_blkif_max_ring_pages, XENBUS_MAX_RING_PAGES);
> + xen_blkif_max_ring_pages = XENBUS_MAX_RING_PAGES;
> + }
> +
> if (!xen_domain())
> return -ENODEV;
>
> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
> index f620b5d..84a964c 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -44,6 +44,7 @@
> #include <xen/interface/io/blkif.h>
> #include <xen/interface/io/protocols.h>
>
> +extern unsigned int xen_blkif_max_ring_pages;
> /*
> * This is the maximum number of segments that would be allowed in indirect
> * requests. This value will also be passed to the frontend.
> @@ -248,7 +249,7 @@ struct backend_info;
> #define PERSISTENT_GNT_WAS_ACTIVE 1
>
> /* Number of requests that we can fit in a ring */
> -#define XEN_BLKIF_REQS 32
> +#define XEN_BLKIF_REQS (32 * XENBUS_MAX_RING_PAGES)
This should be made a member of xen_blkif and set dynamically, or else
we are just wasting memory if the frontend doesn't support multipage
rings. We seem to be doing this for a bunch of features, but allocating
32 times the needed amount of requests (if the frontend doesn't support
multipage rings) seems too much IMHO.
>
> struct persistent_gnt {
> struct page *page;
> @@ -320,6 +321,7 @@ struct xen_blkif {
> struct work_struct free_work;
> /* Thread shutdown wait queue. */
> wait_queue_head_t shutdown_wq;
> + int nr_ring_pages;
> };
>
> struct seg_buf {
> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
> index 6ab69ad..909babd 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -198,8 +198,8 @@ fail:
> return ERR_PTR(-ENOMEM);
> }
>
> -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> - unsigned int evtchn)
> +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
> + unsigned int nr_grefs, unsigned int evtchn)
> {
> int err;
>
> @@ -207,7 +207,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> if (blkif->irq)
> return 0;
>
> - err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
> + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
> &blkif->blk_ring);
> if (err < 0)
> return err;
> @@ -217,21 +217,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> {
> struct blkif_sring *sring;
> sring = (struct blkif_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
> break;
> }
> case BLKIF_PROTOCOL_X86_32:
> {
> struct blkif_x86_32_sring *sring_x86_32;
> sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
> break;
> }
> case BLKIF_PROTOCOL_X86_64:
> {
> struct blkif_x86_64_sring *sring_x86_64;
> sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
> break;
> }
> default:
> @@ -597,6 +597,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
> if (err)
> goto fail;
>
> + err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-pages", "%u",
> + xen_blkif_max_ring_pages);
> + if (err)
> + pr_warn("%s write out 'max-ring-pages' failed\n", __func__);
> +
> err = xenbus_switch_state(dev, XenbusStateInitWait);
> if (err)
> goto fail;
> @@ -860,22 +865,62 @@ again:
> static int connect_ring(struct backend_info *be)
> {
> struct xenbus_device *dev = be->dev;
> - unsigned long ring_ref;
> - unsigned int evtchn;
> + unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
> + unsigned int evtchn, nr_grefs;
> unsigned int pers_grants;
> char protocol[64] = "";
> int err;
>
> pr_debug("%s %s\n", __func__, dev->otherend);
>
> - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
> - &ring_ref, "event-channel", "%u", &evtchn, NULL);
> - if (err) {
> - xenbus_dev_fatal(dev, err,
> - "reading %s/ring-ref and event-channel",
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
> + &evtchn);
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/event-channel",
> dev->otherend);
> return err;
> }
> + pr_info("event-channel %u\n", evtchn);
> +
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "num-ring-pages", "%u",
> + &nr_grefs);
> + if (err != 1) {
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
> + "%u", &ring_ref[0]);
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
> + dev->otherend);
> + return err;
> + }
> + nr_grefs = 1;
> + pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
> + ring_ref[0]);
> + } else {
> + unsigned int i;
> +
> + if (nr_grefs > xen_blkif_max_ring_pages) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "%s/request %d ring pages exceed max:%d",
> + dev->otherend, nr_grefs, xen_blkif_max_ring_pages);
> + return err;
> + }
> + for (i = 0; i < nr_grefs; i++) {
> + char ring_ref_name[BLKBACK_NAME_LEN];
I would add a new macro rather than reusing one which might change
(BLKBACK_RINGREF_LEN?).
> +
> + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
> + err = xenbus_scanf(XBT_NIL, dev->otherend,
> + ring_ref_name, "%u", &ring_ref[i]);
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/%s",
> + dev->otherend, ring_ref_name);
> + return err;
> + }
> + pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
> + }
> + }
>
> be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
> err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
> @@ -900,16 +945,16 @@ static int connect_ring(struct backend_info *be)
>
> be->blkif->vbd.feature_gnt_persistent = pers_grants;
> be->blkif->vbd.overflow_max_grants = 0;
> + be->blkif->nr_ring_pages = nr_grefs;
>
> - pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
> - ring_ref, evtchn, be->blkif->blk_protocol, protocol,
> - pers_grants ? "persistent grants" : "");
> + pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
> + nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
> + pers_grants ? "persistent grants" : "");
>
> /* Map the shared frame, irq etc. */
> - err = xen_blkif_map(be->blkif, ring_ref, evtchn);
> + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
> if (err) {
> - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
> - ring_ref, evtchn);
> + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
> return err;
> }
>
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 88e23fd..c4b427f 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -98,7 +98,17 @@ static unsigned int xen_blkif_max_segments = 32;
> module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
> MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
>
> -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
> +static unsigned int xen_blkif_max_ring_pages = 1;
> +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, S_IRUGO);
> +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
> +
> +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages)
I would prefer if this macro took a explicit info parameter instead on
realying that it is always info.
> +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
> +/*
> + * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 characters
> + * are enough. Define to 20 to keep consist with backend.
> + */
> +#define RINGREF_NAME_LEN (20)
>
> /*
> * We have one of these per vbd, whether ide, scsi or 'other'. They
> @@ -114,13 +124,14 @@ struct blkfront_info
> int vdevice;
> blkif_vdev_t handle;
> enum blkif_state connected;
> - int ring_ref;
> + int ring_ref[XENBUS_MAX_RING_PAGES];
> + unsigned int nr_ring_pages;
> struct blkif_front_ring ring;
> unsigned int evtchn, irq;
> struct request_queue *rq;
> struct work_struct work;
> struct gnttab_free_callback callback;
> - struct blk_shadow shadow[BLK_RING_SIZE];
> + struct blk_shadow shadow[BLK_MAX_RING_SIZE];
> struct list_head grants;
> struct list_head indirect_pages;
> unsigned int persistent_gnts_c;
> @@ -139,8 +150,6 @@ static unsigned int nr_minors;
> static unsigned long *minors;
> static DEFINE_SPINLOCK(minor_lock);
>
> -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
> - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
> #define GRANT_INVALID_REF 0
>
> #define PARTS_PER_DISK 16
> @@ -1033,12 +1042,15 @@ free_shadow:
> flush_work(&info->work);
>
> /* Free resources associated with old device channel. */
> - if (info->ring_ref != GRANT_INVALID_REF) {
> - gnttab_end_foreign_access(info->ring_ref, 0,
> - (unsigned long)info->ring.sring);
> - info->ring_ref = GRANT_INVALID_REF;
> - info->ring.sring = NULL;
> + for (i = 0; i < info->nr_ring_pages; i++) {
> + if (info->ring_ref[i] != GRANT_INVALID_REF) {
> + gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
> + info->ring_ref[i] = GRANT_INVALID_REF;
> + }
> }
> + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
> + info->ring.sring = NULL;
> +
> if (info->irq)
> unbind_from_irqhandler(info->irq, info);
> info->evtchn = info->irq = 0;
> @@ -1245,26 +1257,30 @@ static int setup_blkring(struct xenbus_device *dev,
> struct blkfront_info *info)
> {
> struct blkif_sring *sring;
> - grant_ref_t gref;
> - int err;
> + int err, i;
> + unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
> + grant_ref_t gref[XENBUS_MAX_RING_PAGES];
>
> - info->ring_ref = GRANT_INVALID_REF;
> + for (i = 0; i < info->nr_ring_pages; i++)
> + info->ring_ref[i] = GRANT_INVALID_REF;
>
> - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
> + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
> + get_order(ring_size));
> if (!sring) {
> xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
> return -ENOMEM;
> }
> SHARED_RING_INIT(sring);
> - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
> + FRONT_RING_INIT(&info->ring, sring, ring_size);
>
> - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
> + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
> if (err < 0) {
> free_page((unsigned long)sring);
> info->ring.sring = NULL;
> goto fail;
> }
> - info->ring_ref = gref;
> + for (i = 0; i < info->nr_ring_pages; i++)
> + info->ring_ref[i] = gref[i];
>
> err = xenbus_alloc_evtchn(dev, &info->evtchn);
> if (err)
> @@ -1292,7 +1308,15 @@ static int talk_to_blkback(struct xenbus_device *dev,
> {
> const char *message = NULL;
> struct xenbus_transaction xbt;
> - int err;
> + int err, i;
> + unsigned int max_pages = 0;
> +
> + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> + "max-ring-pages", "%u", &max_pages);
> + if (err != 1)
> + info->nr_ring_pages = 1;
> + else
> + info->nr_ring_pages = min(xen_blkif_max_ring_pages, max_pages);
>
> /* Create shared ring, alloc event channel. */
> err = setup_blkring(dev, info);
> @@ -1306,11 +1330,31 @@ again:
> goto destroy_blkring;
> }
>
> - err = xenbus_printf(xbt, dev->nodename,
> - "ring-ref", "%u", info->ring_ref);
> - if (err) {
> - message = "writing ring-ref";
> - goto abort_transaction;
> + if (info->nr_ring_pages == 1) {
> + err = xenbus_printf(xbt, dev->nodename,
> + "ring-ref", "%u", info->ring_ref[0]);
> + if (err) {
> + message = "writing ring-ref";
> + goto abort_transaction;
> + }
> + } else {
> + err = xenbus_printf(xbt, dev->nodename,
> + "num-ring-pages", "%u", info->nr_ring_pages);
> + if (err) {
> + message = "writing num-ring-pages";
> + goto abort_transaction;
> + }
> +
> + for (i = 0; i < info->nr_ring_pages; i++) {
> + char ring_ref_name[RINGREF_NAME_LEN];
> + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
> + err = xenbus_printf(xbt, dev->nodename,
> + ring_ref_name, "%u", info->ring_ref[i]);
> + if (err) {
> + message = "writing ring-ref";
> + goto abort_transaction;
> + }
> + }
> }
> err = xenbus_printf(xbt, dev->nodename,
> "event-channel", "%u", info->evtchn);
> @@ -1338,6 +1382,7 @@ again:
> goto destroy_blkring;
> }
Why is this moved to here...
> + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> xenbus_switch_state(dev, XenbusStateInitialised);
>
> return 0;
> @@ -1422,9 +1467,8 @@ static int blkfront_probe(struct xenbus_device *dev,
> info->connected = BLKIF_STATE_DISCONNECTED;
> INIT_WORK(&info->work, blkif_restart_queue);
>
> - for (i = 0; i < BLK_RING_SIZE; i++)
> + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
> info->shadow[i].req.u.rw.id = i+1;
> - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
from here. Isn't the previous location suitable anymore?
Also, if BLK_RING_SIZE < BLK_MAX_RING_SIZE you are overwriting the
previously written value in the above loop.
>
> /* Front end dir is a number, which is used as the id. */
> info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
> @@ -1469,7 +1513,7 @@ static int blkif_recover(struct blkfront_info *info)
>
> /* Stage 2: Set up free list. */
> memset(&info->shadow, 0, sizeof(info->shadow));
> - for (i = 0; i < BLK_RING_SIZE; i++)
> + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
> info->shadow[i].req.u.rw.id = i+1;
> info->shadow_free = info->ring.req_prod_pvt;
> info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> @@ -2086,6 +2130,12 @@ static int __init xlblk_init(void)
> {
> int ret;
>
> + if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES) {
> + pr_info("Invalid max_ring_pages (%d), will use default max: %d.\n",
> + xen_blkif_max_ring_pages, XENBUS_MAX_RING_PAGES);
> + xen_blkif_max_ring_pages = 1;
> + }
> +
> if (!xen_domain())
> return -ENODEV;
>
>
El 15/05/15 a les 13.03, Bob Liu ha escrit:
>
> On 05/15/2015 06:01 PM, Roger Pau Monné wrote:
>> El 12/05/15 a les 13.01, Bob Liu ha escrit:
>>> The right place for talk_to_blkback() to query backend features and transport
>>> parameters is after backend entered XenbusStateInitWait. There is no problem
>>
>> talk_to_blkback doesn't gather any backend features, it just publishes
>> the features supported by the frontend, which AFAICT can be done at any
>
> 1) But talk_to_blkback will also allocate and initialize the request ring which
> should be done after backend entered XenbusStateInitWait.
Maybe setup_blkring should be moved to a more suitable location instead
of moving the whole function?
Roger.
On 05/15/2015 07:14 PM, Roger Pau Monné wrote:
> El 15/05/15 a les 13.03, Bob Liu ha escrit:
>>
>> On 05/15/2015 06:01 PM, Roger Pau Monné wrote:
>>> El 12/05/15 a les 13.01, Bob Liu ha escrit:
>>>> The right place for talk_to_blkback() to query backend features and transport
>>>> parameters is after backend entered XenbusStateInitWait. There is no problem
>>>
>>> talk_to_blkback doesn't gather any backend features, it just publishes
>>> the features supported by the frontend, which AFAICT can be done at any
>>
>> 1) But talk_to_blkback will also allocate and initialize the request ring which
>> should be done after backend entered XenbusStateInitWait.
>
> Maybe setup_blkring should be moved to a more suitable location instead
> of moving the whole function?
>
Most of other parts in talk_to_blkback() depends on setup_blkring() like write
out ring-ref and event-channel.
Only notifying 'feature-persistent' and 'protocol:XEN_IO_PROTO_ABI_NATIVE' can be left in blkfront_probe().
Then the patch would like this:
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2c61cf8..6b918e0 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1318,17 +1318,6 @@ again:
message = "writing event-channel";
goto abort_transaction;
}
- err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
- XEN_IO_PROTO_ABI_NATIVE);
- if (err) {
- message = "writing protocol";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, dev->nodename,
- "feature-persistent", "%u", 1);
- if (err)
- dev_warn(&dev->dev,
- "writing persistent grants feature to xenbus");
err = xenbus_transaction_end(xbt, 0);
if (err) {
@@ -1430,13 +1419,17 @@ static int blkfront_probe(struct xenbus_device *dev,
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
dev_set_drvdata(&dev->dev, info);
- err = talk_to_blkback(dev, info);
+ err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
+ XEN_IO_PROTO_ABI_NATIVE);
if (err) {
- kfree(info);
- dev_set_drvdata(&dev->dev, NULL);
- return err;
+ message = "writing protocol";
+ goto abort_transaction;
}
-
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-persistent", "%u", 1);
+ if (err)
+ dev_warn(&dev->dev,
+ "writing persistent grants feature to xenbus");
return 0;
}
@@ -1906,8 +1899,13 @@ static void blkback_changed(struct xenbus_device *dev,
dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
switch (backend_state) {
- case XenbusStateInitialising:
case XenbusStateInitWait:
+ if (talk_to_blkback(dev, info)) {
+ kfree(info);
+ dev_set_drvdata(&dev->dev, NULL);
+ break;
+ }
+ case XenbusStateInitialising:
case XenbusStateInitialised:
case XenbusStateReconfiguring:
case XenbusStateReconfigured:
El 15/05/15 a les 13.35, Bob Liu ha escrit:
>
> On 05/15/2015 07:14 PM, Roger Pau Monné wrote:
>> El 15/05/15 a les 13.03, Bob Liu ha escrit:
>>>
>>> On 05/15/2015 06:01 PM, Roger Pau Monné wrote:
>>>> El 12/05/15 a les 13.01, Bob Liu ha escrit:
>>>>> The right place for talk_to_blkback() to query backend features and transport
>>>>> parameters is after backend entered XenbusStateInitWait. There is no problem
>>>>
>>>> talk_to_blkback doesn't gather any backend features, it just publishes
>>>> the features supported by the frontend, which AFAICT can be done at any
>>>
>>> 1) But talk_to_blkback will also allocate and initialize the request ring which
>>> should be done after backend entered XenbusStateInitWait.
>>
>> Maybe setup_blkring should be moved to a more suitable location instead
>> of moving the whole function?
>>
>
> Most of other parts in talk_to_blkback() depends on setup_blkring() like write
> out ring-ref and event-channel.
My bad. Yes, I see that talk_to_blkback writes the ring info stuff, so
yes, moving it as a whole to a more suitable place seems OK to me, but
please re-write the commit message to be more clear.
Roger.
On 05/15/2015 07:13 PM, Roger Pau Monné wrote:
> El 12/05/15 a les 13.01, Bob Liu ha escrit:
>> Extend xen/block to support multi-page ring, so that more requests can be issued
>> by using more than one pages as the request ring between blkfront and backend.
>> As a result, the performance can get improved significantly.
> ^ s/can get improved/improves/
>
>>
>> We got some impressive improvements on our highend iscsi storage cluster backend.
>> If using 64 pages as the ring, the IOPS increased about 15 times for the
>> throughput testing and above doubled for the latency testing.
>>
>> The reason was the limit on outstanding requests is 32 if use only one-page
>> ring, but in our case the iscsi lun was spread across about 100 physical drives,
>> 32 was really not enough to keep them busy.
>>
>> Changes in v2:
>> - Rebased to 4.0-rc6
>> - Added description on how this protocol works into io/blkif.h
>
> I don't see any changes to io/blkif.h in this patch, is something missing?
>
Sorry, I should have mentioned in v3 that these changes were removed because I followed the protocol
already defined in the XEN git tree: xen/include/public/io/blkif.h
> Also you use XENBUS_MAX_RING_PAGES which AFAICT it's not defined anywhere.
>
It was defined in include/xen/xenbus.h.
>>
>> Changes in v3:
>> - Follow the protocol defined in io/blkif.h on XEN tree
>>
>> Signed-off-by: Bob Liu <[email protected]>
>> ---
>> drivers/block/xen-blkback/blkback.c | 14 ++++-
>> drivers/block/xen-blkback/common.h | 4 +-
>> drivers/block/xen-blkback/xenbus.c | 83 ++++++++++++++++++++++-------
>> drivers/block/xen-blkfront.c | 102 +++++++++++++++++++++++++++---------
>> 4 files changed, 156 insertions(+), 47 deletions(-)
>>
>> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
>> index 713fc9f..f191083 100644
>> --- a/drivers/block/xen-blkback/blkback.c
>> +++ b/drivers/block/xen-blkback/blkback.c
>> @@ -84,6 +84,12 @@ MODULE_PARM_DESC(max_persistent_grants,
>> "Maximum number of grants to map persistently");
>>
>> /*
>> + * Maximum number of pages to be used as the ring between front and backend
>> + */
>> +unsigned int xen_blkif_max_ring_pages = XENBUS_MAX_RING_PAGES;
>> +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, S_IRUGO);
>> +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
>> +/*
>> * The LRU mechanism to clean the lists of persistent grants needs to
>> * be executed periodically. The time interval between consecutive executions
>> * of the purge mechanism is set in ms.
>> @@ -630,7 +636,7 @@ purge_gnt_list:
>> }
>>
>> /* Shrink if we have more than xen_blkif_max_buffer_pages */
>> - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
>> + shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages * blkif->nr_ring_pages);
>
> You are greatly increasing the buffer of free (ballooned) pages.
> Possibly making it 32 times bigger than it used to be, is this really
> needed?
>
Hmm.. it's a bit aggressive.
How about (xen_blkif_max_buffer_pages * blkif->nr_ring_pages) / 2?
>>
>> if (log_stats && time_after(jiffies, blkif->st_print))
>> print_stats(blkif);
>> @@ -1435,6 +1441,12 @@ static int __init xen_blkif_init(void)
>> {
>> int rc = 0;
>>
>> + if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES) {
>> + pr_info("Invalid max_ring_pages (%d), will use default max: %d.\n",
>> + xen_blkif_max_ring_pages, XENBUS_MAX_RING_PAGES);
>> + xen_blkif_max_ring_pages = XENBUS_MAX_RING_PAGES;
>> + }
>> +
>> if (!xen_domain())
>> return -ENODEV;
>>
>> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
>> index f620b5d..84a964c 100644
>> --- a/drivers/block/xen-blkback/common.h
>> +++ b/drivers/block/xen-blkback/common.h
>> @@ -44,6 +44,7 @@
>> #include <xen/interface/io/blkif.h>
>> #include <xen/interface/io/protocols.h>
>>
>> +extern unsigned int xen_blkif_max_ring_pages;
>> /*
>> * This is the maximum number of segments that would be allowed in indirect
>> * requests. This value will also be passed to the frontend.
>> @@ -248,7 +249,7 @@ struct backend_info;
>> #define PERSISTENT_GNT_WAS_ACTIVE 1
>>
>> /* Number of requests that we can fit in a ring */
>> -#define XEN_BLKIF_REQS 32
>> +#define XEN_BLKIF_REQS (32 * XENBUS_MAX_RING_PAGES)
>
> This should be made a member of xen_blkif and set dynamically, or else
> we are just wasting memory if the frontend doesn't support multipage
> rings. We seem to be doing this for a bunch of features, but allocating
> 32 times the needed amount of requests (if the frontend doesn't support
> multipage rings) seems too much IMHO.
>
Right, the reason I use static is to simplify the code.
Else we should:
1) Move xen_blkif_alloc() from backend_probe() to some place until we read the
negotiated 'num-ring-pages' written by front.
2) Deal with memory alloc/free when domU migrated and 'num-ring-pages' changed.
I'll try to find a suitable place in frontend_changed(), but maybe in a new patch later.
>>
>> struct persistent_gnt {
>> struct page *page;
>> @@ -320,6 +321,7 @@ struct xen_blkif {
>> struct work_struct free_work;
>> /* Thread shutdown wait queue. */
>> wait_queue_head_t shutdown_wq;
>> + int nr_ring_pages;
>> };
>>
>> struct seg_buf {
>> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
>> index 6ab69ad..909babd 100644
>> --- a/drivers/block/xen-blkback/xenbus.c
>> +++ b/drivers/block/xen-blkback/xenbus.c
>> @@ -198,8 +198,8 @@ fail:
>> return ERR_PTR(-ENOMEM);
>> }
>>
>> -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
>> - unsigned int evtchn)
>> +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
>> + unsigned int nr_grefs, unsigned int evtchn)
>> {
>> int err;
>>
>> @@ -207,7 +207,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
>> if (blkif->irq)
>> return 0;
>>
>> - err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
>> + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
>> &blkif->blk_ring);
>> if (err < 0)
>> return err;
>> @@ -217,21 +217,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
>> {
>> struct blkif_sring *sring;
>> sring = (struct blkif_sring *)blkif->blk_ring;
>> - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
>> + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
>> break;
>> }
>> case BLKIF_PROTOCOL_X86_32:
>> {
>> struct blkif_x86_32_sring *sring_x86_32;
>> sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
>> - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
>> + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
>> break;
>> }
>> case BLKIF_PROTOCOL_X86_64:
>> {
>> struct blkif_x86_64_sring *sring_x86_64;
>> sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
>> - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
>> + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
>> break;
>> }
>> default:
>> @@ -597,6 +597,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
>> if (err)
>> goto fail;
>>
>> + err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-pages", "%u",
>> + xen_blkif_max_ring_pages);
>> + if (err)
>> + pr_warn("%s write out 'max-ring-pages' failed\n", __func__);
>> +
>> err = xenbus_switch_state(dev, XenbusStateInitWait);
>> if (err)
>> goto fail;
>> @@ -860,22 +865,62 @@ again:
>> static int connect_ring(struct backend_info *be)
>> {
>> struct xenbus_device *dev = be->dev;
>> - unsigned long ring_ref;
>> - unsigned int evtchn;
>> + unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
>> + unsigned int evtchn, nr_grefs;
>> unsigned int pers_grants;
>> char protocol[64] = "";
>> int err;
>>
>> pr_debug("%s %s\n", __func__, dev->otherend);
>>
>> - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
>> - &ring_ref, "event-channel", "%u", &evtchn, NULL);
>> - if (err) {
>> - xenbus_dev_fatal(dev, err,
>> - "reading %s/ring-ref and event-channel",
>> + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
>> + &evtchn);
>> + if (err != 1) {
>> + err = -EINVAL;
>> + xenbus_dev_fatal(dev, err, "reading %s/event-channel",
>> dev->otherend);
>> return err;
>> }
>> + pr_info("event-channel %u\n", evtchn);
>> +
>> + err = xenbus_scanf(XBT_NIL, dev->otherend, "num-ring-pages", "%u",
>> + &nr_grefs);
>> + if (err != 1) {
>> + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
>> + "%u", &ring_ref[0]);
>> + if (err != 1) {
>> + err = -EINVAL;
>> + xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
>> + dev->otherend);
>> + return err;
>> + }
>> + nr_grefs = 1;
>> + pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
>> + ring_ref[0]);
>> + } else {
>> + unsigned int i;
>> +
>> + if (nr_grefs > xen_blkif_max_ring_pages) {
>> + err = -EINVAL;
>> + xenbus_dev_fatal(dev, err, "%s/request %d ring pages exceed max:%d",
>> + dev->otherend, nr_grefs, xen_blkif_max_ring_pages);
>> + return err;
>> + }
>> + for (i = 0; i < nr_grefs; i++) {
>> + char ring_ref_name[BLKBACK_NAME_LEN];
>
> I would add a new macro rather than reusing one which might change
> (BLKBACK_RINGREF_LEN?).
>
Sure.
>> +
>> + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
>> + err = xenbus_scanf(XBT_NIL, dev->otherend,
>> + ring_ref_name, "%u", &ring_ref[i]);
>> + if (err != 1) {
>> + err = -EINVAL;
>> + xenbus_dev_fatal(dev, err, "reading %s/%s",
>> + dev->otherend, ring_ref_name);
>> + return err;
>> + }
>> + pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
>> + }
>> + }
>>
>> be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
>> err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
>> @@ -900,16 +945,16 @@ static int connect_ring(struct backend_info *be)
>>
>> be->blkif->vbd.feature_gnt_persistent = pers_grants;
>> be->blkif->vbd.overflow_max_grants = 0;
>> + be->blkif->nr_ring_pages = nr_grefs;
>>
>> - pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
>> - ring_ref, evtchn, be->blkif->blk_protocol, protocol,
>> - pers_grants ? "persistent grants" : "");
>> + pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
>> + nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
>> + pers_grants ? "persistent grants" : "");
>>
>> /* Map the shared frame, irq etc. */
>> - err = xen_blkif_map(be->blkif, ring_ref, evtchn);
>> + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
>> if (err) {
>> - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
>> - ring_ref, evtchn);
>> + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
>> return err;
>> }
>>
>> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
>> index 88e23fd..c4b427f 100644
>> --- a/drivers/block/xen-blkfront.c
>> +++ b/drivers/block/xen-blkfront.c
>> @@ -98,7 +98,17 @@ static unsigned int xen_blkif_max_segments = 32;
>> module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
>> MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
>>
>> -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
>> +static unsigned int xen_blkif_max_ring_pages = 1;
>> +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, S_IRUGO);
>> +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
>> +
>> +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages)
>
> I would prefer if this macro took a explicit info parameter instead on
> realying that it is always info.
>
Sorry, I missed your idea here.
>> +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
>> +/*
>> + * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 characters
>> + * are enough. Define to 20 to keep consist with backend.
>> + */
>> +#define RINGREF_NAME_LEN (20)
>>
>> /*
>> * We have one of these per vbd, whether ide, scsi or 'other'. They
>> @@ -114,13 +124,14 @@ struct blkfront_info
>> int vdevice;
>> blkif_vdev_t handle;
>> enum blkif_state connected;
>> - int ring_ref;
>> + int ring_ref[XENBUS_MAX_RING_PAGES];
>> + unsigned int nr_ring_pages;
>> struct blkif_front_ring ring;
>> unsigned int evtchn, irq;
>> struct request_queue *rq;
>> struct work_struct work;
>> struct gnttab_free_callback callback;
>> - struct blk_shadow shadow[BLK_RING_SIZE];
>> + struct blk_shadow shadow[BLK_MAX_RING_SIZE];
>> struct list_head grants;
>> struct list_head indirect_pages;
>> unsigned int persistent_gnts_c;
>> @@ -139,8 +150,6 @@ static unsigned int nr_minors;
>> static unsigned long *minors;
>> static DEFINE_SPINLOCK(minor_lock);
>>
>> -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
>> - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
>> #define GRANT_INVALID_REF 0
>>
>> #define PARTS_PER_DISK 16
>> @@ -1033,12 +1042,15 @@ free_shadow:
>> flush_work(&info->work);
>>
>> /* Free resources associated with old device channel. */
>> - if (info->ring_ref != GRANT_INVALID_REF) {
>> - gnttab_end_foreign_access(info->ring_ref, 0,
>> - (unsigned long)info->ring.sring);
>> - info->ring_ref = GRANT_INVALID_REF;
>> - info->ring.sring = NULL;
>> + for (i = 0; i < info->nr_ring_pages; i++) {
>> + if (info->ring_ref[i] != GRANT_INVALID_REF) {
>> + gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
>> + info->ring_ref[i] = GRANT_INVALID_REF;
>> + }
>> }
>> + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
>> + info->ring.sring = NULL;
>> +
>> if (info->irq)
>> unbind_from_irqhandler(info->irq, info);
>> info->evtchn = info->irq = 0;
>> @@ -1245,26 +1257,30 @@ static int setup_blkring(struct xenbus_device *dev,
>> struct blkfront_info *info)
>> {
>> struct blkif_sring *sring;
>> - grant_ref_t gref;
>> - int err;
>> + int err, i;
>> + unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
>> + grant_ref_t gref[XENBUS_MAX_RING_PAGES];
>>
>> - info->ring_ref = GRANT_INVALID_REF;
>> + for (i = 0; i < info->nr_ring_pages; i++)
>> + info->ring_ref[i] = GRANT_INVALID_REF;
>>
>> - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
>> + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
>> + get_order(ring_size));
>> if (!sring) {
>> xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
>> return -ENOMEM;
>> }
>> SHARED_RING_INIT(sring);
>> - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
>> + FRONT_RING_INIT(&info->ring, sring, ring_size);
>>
>> - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
>> + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
>> if (err < 0) {
>> free_page((unsigned long)sring);
>> info->ring.sring = NULL;
>> goto fail;
>> }
>> - info->ring_ref = gref;
>> + for (i = 0; i < info->nr_ring_pages; i++)
>> + info->ring_ref[i] = gref[i];
>>
>> err = xenbus_alloc_evtchn(dev, &info->evtchn);
>> if (err)
>> @@ -1292,7 +1308,15 @@ static int talk_to_blkback(struct xenbus_device *dev,
>> {
>> const char *message = NULL;
>> struct xenbus_transaction xbt;
>> - int err;
>> + int err, i;
>> + unsigned int max_pages = 0;
>> +
>> + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
>> + "max-ring-pages", "%u", &max_pages);
>> + if (err != 1)
>> + info->nr_ring_pages = 1;
>> + else
>> + info->nr_ring_pages = min(xen_blkif_max_ring_pages, max_pages);
>>
>> /* Create shared ring, alloc event channel. */
>> err = setup_blkring(dev, info);
>> @@ -1306,11 +1330,31 @@ again:
>> goto destroy_blkring;
>> }
>>
>> - err = xenbus_printf(xbt, dev->nodename,
>> - "ring-ref", "%u", info->ring_ref);
>> - if (err) {
>> - message = "writing ring-ref";
>> - goto abort_transaction;
>> + if (info->nr_ring_pages == 1) {
>> + err = xenbus_printf(xbt, dev->nodename,
>> + "ring-ref", "%u", info->ring_ref[0]);
>> + if (err) {
>> + message = "writing ring-ref";
>> + goto abort_transaction;
>> + }
>> + } else {
>> + err = xenbus_printf(xbt, dev->nodename,
>> + "num-ring-pages", "%u", info->nr_ring_pages);
>> + if (err) {
>> + message = "writing num-ring-pages";
>> + goto abort_transaction;
>> + }
>> +
>> + for (i = 0; i < info->nr_ring_pages; i++) {
>> + char ring_ref_name[RINGREF_NAME_LEN];
>> + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
>> + err = xenbus_printf(xbt, dev->nodename,
>> + ring_ref_name, "%u", info->ring_ref[i]);
>> + if (err) {
>> + message = "writing ring-ref";
>> + goto abort_transaction;
>> + }
>> + }
>> }
>> err = xenbus_printf(xbt, dev->nodename,
>> "event-channel", "%u", info->evtchn);
>> @@ -1338,6 +1382,7 @@ again:
>> goto destroy_blkring;
>> }
>
> Why is this moved to here...
>
Else the kernel will panic.
>> + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>> xenbus_switch_state(dev, XenbusStateInitialised);
>>
>> return 0;
>> @@ -1422,9 +1467,8 @@ static int blkfront_probe(struct xenbus_device *dev,
>> info->connected = BLKIF_STATE_DISCONNECTED;
>> INIT_WORK(&info->work, blkif_restart_queue);
>>
>> - for (i = 0; i < BLK_RING_SIZE; i++)
>> + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
>> info->shadow[i].req.u.rw.id = i+1;
>> - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>
> from here. Isn't the previous location suitable anymore?
>
See:
#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages)
Because we haven't gotten info->nr_ring_pages yet here (in probe()); info->nr_ring_pages
is read out in talk_to_blkback(), which I already moved to blkback_changed().
Thanks for your review.
-Bob
El 15/05/15 a les 14.06, Bob Liu ha escrit:
>
> On 05/15/2015 07:13 PM, Roger Pau Monn? wrote:
>> El 12/05/15 a les 13.01, Bob Liu ha escrit:
>>> Extend xen/block to support multi-page ring, so that more requests can be issued
>>> by using more than one pages as the request ring between blkfront and backend.
>>> As a result, the performance can get improved significantly.
>> ^ s/can get improved/improves/
>>
>>>
>>> We got some impressive improvements on our highend iscsi storage cluster backend.
>>> If using 64 pages as the ring, the IOPS increased about 15 times for the
>>> throughput testing and above doubled for the latency testing.
>>>
>>> The reason was the limit on outstanding requests is 32 if use only one-page
>>> ring, but in our case the iscsi lun was spread across about 100 physical drives,
>>> 32 was really not enough to keep them busy.
>>>
>>> Changes in v2:
>>> - Rebased to 4.0-rc6
>>> - Added description on how this protocol works into io/blkif.h
>>
>> I don't see any changes to io/blkif.h in this patch, is something missing?
>>
>
> Sorry, I should have mentioned in v3 that these changes were removed because I followed the protocol
> already defined in the XEN git tree: xen/include/public/io/blkif.h
So since you were reusing the old protocol specification there was no
need to add anything to blkif.h?
AFAICT from the conversation in the other thread (the one about updating
blkif.h in Xen tree) if you want to reuse an existing protocol you need
to make sure it's compatible with the previous implementation.
>> Also you use XENBUS_MAX_RING_PAGES which AFAICT it's not defined anywhere.
>>
>
> It was defined in include/xen/xenbus.h.
>
>>>
>>> Changes in v3:
>>> - Follow the protocol defined in io/blkif.h on XEN tree
>>>
>>> Signed-off-by: Bob Liu <[email protected]>
>>> ---
>>> drivers/block/xen-blkback/blkback.c | 14 ++++-
>>> drivers/block/xen-blkback/common.h | 4 +-
>>> drivers/block/xen-blkback/xenbus.c | 83 ++++++++++++++++++++++-------
>>> drivers/block/xen-blkfront.c | 102 +++++++++++++++++++++++++++---------
>>> 4 files changed, 156 insertions(+), 47 deletions(-)
>>>
>>> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
>>> index 713fc9f..f191083 100644
>>> --- a/drivers/block/xen-blkback/blkback.c
>>> +++ b/drivers/block/xen-blkback/blkback.c
>>> @@ -84,6 +84,12 @@ MODULE_PARM_DESC(max_persistent_grants,
>>> "Maximum number of grants to map persistently");
>>>
>>> /*
>>> + * Maximum number of pages to be used as the ring between front and backend
>>> + */
>>> +unsigned int xen_blkif_max_ring_pages = XENBUS_MAX_RING_PAGES;
>>> +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, S_IRUGO);
>>> +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
>>> +/*
>>> * The LRU mechanism to clean the lists of persistent grants needs to
>>> * be executed periodically. The time interval between consecutive executions
>>> * of the purge mechanism is set in ms.
>>> @@ -630,7 +636,7 @@ purge_gnt_list:
>>> }
>>>
>>> /* Shrink if we have more than xen_blkif_max_buffer_pages */
>>> - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
>>> + shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages * blkif->nr_ring_pages);
>>
>> You are greatly increasing the buffer of free (ballooned) pages.
>> Possibly making it 32 times bigger than it used to be, is this really
>> needed?
>>
>
> Hmm.. it's a bit aggressive.
> How about (xen_blkif_max_buffer_pages * blkif->nr_ring_pages) / 2?
IMHO I'm not sure about the best solution here.
xen_blkif_max_buffer_pages is set by the user and should represent the
buffer pages used by blkback. Modifying it this way means that we are
effectively lifting the limit without the user consent, which looks wrong.
I think we should change the meaning, so that it's better suited for
your proposes.
>>>
>>> if (log_stats && time_after(jiffies, blkif->st_print))
>>> print_stats(blkif);
>>> @@ -1435,6 +1441,12 @@ static int __init xen_blkif_init(void)
>>> {
>>> int rc = 0;
>>>
>>> + if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES) {
>>> + pr_info("Invalid max_ring_pages (%d), will use default max: %d.\n",
>>> + xen_blkif_max_ring_pages, XENBUS_MAX_RING_PAGES);
>>> + xen_blkif_max_ring_pages = XENBUS_MAX_RING_PAGES;
>>> + }
>>> +
>>> if (!xen_domain())
>>> return -ENODEV;
>>>
>>> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
>>> index f620b5d..84a964c 100644
>>> --- a/drivers/block/xen-blkback/common.h
>>> +++ b/drivers/block/xen-blkback/common.h
>>> @@ -44,6 +44,7 @@
>>> #include <xen/interface/io/blkif.h>
>>> #include <xen/interface/io/protocols.h>
>>>
>>> +extern unsigned int xen_blkif_max_ring_pages;
>>> /*
>>> * This is the maximum number of segments that would be allowed in indirect
>>> * requests. This value will also be passed to the frontend.
>>> @@ -248,7 +249,7 @@ struct backend_info;
>>> #define PERSISTENT_GNT_WAS_ACTIVE 1
>>>
>>> /* Number of requests that we can fit in a ring */
>>> -#define XEN_BLKIF_REQS 32
>>> +#define XEN_BLKIF_REQS (32 * XENBUS_MAX_RING_PAGES)
>>
>> This should be made a member of xen_blkif and set dynamically, or else
>> we are just wasting memory if the frontend doesn't support multipage
>> rings. We seem to be doing this for a bunch of features, but allocating
>> 32 times the needed amount of requests (if the frontend doesn't support
>> multipage rings) seems too much IMHO.
>>
>
> Right, the reason I use static is to simplify the code.
> Else we should:
> 1) Move xen_blkif_alloc() from backend_probe() to some place until we read the
> negotiated 'num-ring-pages' written by front.
> 2) Deal with memory alloc/free when domU migrated and 'num-ring-pages' changed.
>
> I'll try to find a suitable place in frontend_changed(), but maybe in a new patch later.
Ack. As said, we have been doing this for a long time. When I added
indirect descriptors I've also allocated everything before knowing if
indirect descriptors will be used or not.
Maybe it's time to change that and provide a way to allocate how many
requests we need, and which fields should be allocated based on the
supported features.
[...]
>>
>> Why is this moved to here...
>>
>
> Else the kernel will panic.
>
>>> + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>>> xenbus_switch_state(dev, XenbusStateInitialised);
>>>
>>> return 0;
>>> @@ -1422,9 +1467,8 @@ static int blkfront_probe(struct xenbus_device *dev,
>>> info->connected = BLKIF_STATE_DISCONNECTED;
>>> INIT_WORK(&info->work, blkif_restart_queue);
>>>
>>> - for (i = 0; i < BLK_RING_SIZE; i++)
>>> + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
>>> info->shadow[i].req.u.rw.id = i+1;
>>> - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>>
>> from here. Isn't the previous location suitable anymore?
>>
>
> See:
> #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages)
>
> Because we haven't gotten info->nr_ring_pages yet here (in probe()); info->nr_ring_pages
> is read out in talk_to_blkback(), which I already moved to blkback_changed().
Maybe it would be clearer if the initialization for loop was also moved
to the place above were info->shadow[BLK_RING_SIZE-1]... is being set?
Roger.