2018-07-06 17:43:23

by Matias Bjørling

Subject: [PATCH 0/2] null_blk: zone support

This series adds support for exposing a zone block device using the
null_blk device driver.

The first patch moves core null_blk data structures to a shared header
file. The second implements the actual zone support. The patchset adds
two new options. One to enable the zone interface, and another to
define the size of the zones to expose.

Thanks,
Matias

Matias Bjørling (2):
null_blk: move shared definitions to header file
null_blk: add zone support

Documentation/block/null_blk.txt | 7 ++
drivers/block/Makefile | 5 +-
drivers/block/null_blk.c | 124 ++++++++++++--------------------
drivers/block/null_blk.h | 108 ++++++++++++++++++++++++++++
drivers/block/null_blk_zoned.c | 149 +++++++++++++++++++++++++++++++++++++++
5 files changed, 315 insertions(+), 78 deletions(-)
create mode 100644 drivers/block/null_blk.h
create mode 100644 drivers/block/null_blk_zoned.c

--
2.11.0



2018-07-06 17:42:27

by Matias Bjørling

Subject: [PATCH 2/2] null_blk: add zone support

From: Matias Bjørling <[email protected]>

Adds support for exposing a null_blk device through the zone device
interface.

The interface is managed with the parameters zoned and zone_size.
If zoned is set, the null_blk instance registers as a zoned block
device. The zone_size parameter defines how big each zone will be.

Signed-off-by: Matias Bjørling <[email protected]>
Signed-off-by: Bart Van Assche <[email protected]>
Signed-off-by: Damien Le Moal <[email protected]>
---
Documentation/block/null_blk.txt | 7 ++
drivers/block/Makefile | 5 +-
drivers/block/null_blk.c | 48 ++++++++++++-
drivers/block/null_blk.h | 28 ++++++++
drivers/block/null_blk_zoned.c | 149 +++++++++++++++++++++++++++++++++++++++
5 files changed, 234 insertions(+), 3 deletions(-)
create mode 100644 drivers/block/null_blk_zoned.c

diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 07f147381f32..ea2dafe49ae8 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0
0: Tag set is not shared.
1: Tag set shared between devices for blk-mq. Only makes sense with
nr_devices > 1, otherwise there's no tag set to share.
+
+zoned=[0/1]: Default: 0
+ 0: Block device is exposed as a random-access block device.
+ 1: Block device is exposed as a host-managed zoned block device.
+
+zone_size=[MB]: Default: 256
+ Per zone size when exposed as a zoned block device. Must be a power of two.
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index dc061158b403..a0d88aa0c05d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/

obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
-obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
obj-$(CONFIG_ZRAM) += zram/

+obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk_mod.o
+null_blk_mod-objs := null_blk.o
+null_blk_mod-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o
+
skd-y := skd_main.o
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cd4b0849d3b4..99b6bfe7abd1 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -180,6 +180,14 @@ static bool g_use_per_node_hctx;
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");

+static bool g_zoned;
+module_param_named(zoned, g_zoned, bool, S_IRUGO);
+MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
+
+static unsigned long g_zone_size = 256;
+module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+
static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
@@ -283,6 +291,8 @@ NULLB_DEVICE_ATTR(memory_backed, bool);
NULLB_DEVICE_ATTR(discard, bool);
NULLB_DEVICE_ATTR(mbps, uint);
NULLB_DEVICE_ATTR(cache_size, ulong);
+NULLB_DEVICE_ATTR(zoned, bool);
+NULLB_DEVICE_ATTR(zone_size, ulong);

static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
@@ -394,6 +404,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_mbps,
&nullb_device_attr_cache_size,
&nullb_device_attr_badblocks,
+ &nullb_device_attr_zoned,
+ &nullb_device_attr_zone_size,
NULL,
};

@@ -446,7 +458,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)

static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
- return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n");
+ return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n");
}

CONFIGFS_ATTR_RO(memb_group_, features);
@@ -505,6 +517,8 @@ static struct nullb_device *null_alloc_dev(void)
dev->hw_queue_depth = g_hw_queue_depth;
dev->blocking = g_blocking;
dev->use_per_node_hctx = g_use_per_node_hctx;
+ dev->zoned = g_zoned;
+ dev->zone_size = g_zone_size;
return dev;
}

@@ -513,6 +527,7 @@ static void null_free_dev(struct nullb_device *dev)
if (!dev)
return;

+ null_zone_exit(dev);
badblocks_exit(&dev->badblocks);
kfree(dev);
}
@@ -1145,6 +1160,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
struct nullb *nullb = dev->nullb;
int err = 0;

+ if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) {
+ cmd->error = null_zone_report(nullb, cmd);
+ goto out;
+ }
+
if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
struct request *rq = cmd->rq;

@@ -1209,6 +1229,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
}
}
cmd->error = errno_to_blk_status(err);
+
+ if (!cmd->error && dev->zoned) {
+ if (req_op(cmd->rq) == REQ_OP_WRITE)
+ null_zone_write(cmd);
+ else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET)
+ null_zone_reset(cmd);
+ }
out:
/* Complete IO by inline, softirq or timer */
switch (dev->irqmode) {
@@ -1736,6 +1763,15 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_flush_queueable(nullb->q, true);
}

+ if (dev->zoned) {
+ rv = null_zone_init(dev);
+ if (rv)
+ goto out_cleanup_blk_queue;
+
+ blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
+ nullb->q->limits.zoned = BLK_ZONED_HM;
+ }
+
nullb->q->queuedata = nullb;
blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
@@ -1754,13 +1790,16 @@ static int null_add_dev(struct nullb_device *dev)

rv = null_gendisk_register(nullb);
if (rv)
- goto out_cleanup_blk_queue;
+ goto out_cleanup_zone;

mutex_lock(&lock);
list_add_tail(&nullb->list, &nullb_list);
mutex_unlock(&lock);

return 0;
+out_cleanup_zone:
+ if (dev->zoned)
+ null_zone_exit(dev);
out_cleanup_blk_queue:
blk_cleanup_queue(nullb->q);
out_cleanup_tags:
@@ -1787,6 +1826,11 @@ static int __init null_init(void)
g_bs = PAGE_SIZE;
}

+ if (!is_power_of_2(g_zone_size)) {
+ pr_err("null_blk: zone_size must be power-of-two\n");
+ return -EINVAL;
+ }
+
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) {
pr_warn("null_blk: submit_queues param is set to %u.\n",
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index d82c5501806d..d81781f22dba 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -41,9 +41,14 @@ struct nullb_device {
unsigned int curr_cache;
struct badblocks badblocks;

+ unsigned int nr_zones;
+ struct blk_zone *zones;
+ sector_t zone_size_sects;
+
unsigned long size; /* device size in MB */
unsigned long completion_nsec; /* time in ns to complete a request */
unsigned long cache_size; /* disk cache size in MB */
+ unsigned long zone_size; /* zone size in MB if device is zoned */
unsigned int submit_queues; /* number of submission queues */
unsigned int home_node; /* home node for the device */
unsigned int queue_mode; /* block interface */
@@ -57,6 +62,7 @@ struct nullb_device {
bool power; /* power on/off the device */
bool memory_backed; /* if data is stored in memory */
bool discard; /* if support discard */
+ bool zoned; /* if device is zoned */
};

struct nullb {
@@ -77,4 +83,26 @@ struct nullb {
unsigned int nr_queues;
char disk_name[DISK_NAME_LEN];
};
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int null_zone_init(struct nullb_device *dev);
+void null_zone_exit(struct nullb_device *dev);
+blk_status_t null_zone_report(struct nullb *nullb,
+ struct nullb_cmd *cmd);
+void null_zone_write(struct nullb_cmd *cmd);
+void null_zone_reset(struct nullb_cmd *cmd);
+#else
+static inline int null_zone_init(struct nullb_device *dev)
+{
+ return -EINVAL;
+}
+static inline void null_zone_exit(struct nullb_device *dev) {}
+static inline blk_status_t null_zone_report(struct nullb *nullb,
+ struct nullb_cmd *cmd)
+{
+ return BLK_STS_NOTSUPP;
+}
+static inline void null_zone_write(struct nullb_cmd *cmd) {}
+static inline void null_zone_reset(struct nullb_cmd *cmd) {}
+#endif /* CONFIG_BLK_DEV_ZONED */
#endif /* __NULL_BLK_H */
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
new file mode 100644
index 000000000000..a979ca00d7be
--- /dev/null
+++ b/drivers/block/null_blk_zoned.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/vmalloc.h>
+#include "null_blk.h"
+
+/* zone_size in MBs to sectors. */
+#define ZONE_SIZE_SHIFT 11
+
+static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
+{
+ return sect >> ilog2(dev->zone_size_sects);
+}
+
+int null_zone_init(struct nullb_device *dev)
+{
+ sector_t dev_size = (sector_t)dev->size * 1024 * 1024;
+ sector_t sector = 0;
+ unsigned int i;
+
+ if (!is_power_of_2(dev->zone_size)) {
+ pr_err("null_blk: zone_size must be power-of-two\n");
+ return -EINVAL;
+ }
+
+ dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
+ dev->nr_zones = dev_size >>
+ (SECTOR_SHIFT + ilog2(dev->zone_size_sects));
+ dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!dev->zones)
+ return -ENOMEM;
+
+ for (i = 0; i < dev->nr_zones; i++) {
+ struct blk_zone *zone = &dev->zones[i];
+
+ zone->start = zone->wp = sector;
+ zone->len = dev->zone_size_sects;
+ zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+ zone->cond = BLK_ZONE_COND_EMPTY;
+
+ sector += dev->zone_size_sects;
+ }
+
+ return 0;
+}
+
+void null_zone_exit(struct nullb_device *dev)
+{
+ kvfree(dev->zones);
+}
+
+static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq,
+ unsigned int zno, unsigned int nr_zones)
+{
+ struct blk_zone_report_hdr *hdr = NULL;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ void *addr;
+ unsigned int zones_to_cpy;
+
+ bio_for_each_segment(bvec, rq->bio, iter) {
+ addr = kmap_atomic(bvec.bv_page);
+
+ zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone);
+
+ if (!hdr) {
+ hdr = (struct blk_zone_report_hdr *)addr;
+ hdr->nr_zones = nr_zones;
+ zones_to_cpy--;
+ addr += sizeof(struct blk_zone_report_hdr);
+ }
+
+ zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones);
+
+ memcpy(addr, &dev->zones[zno],
+ zones_to_cpy * sizeof(struct blk_zone));
+
+ kunmap_atomic(addr);
+
+ nr_zones -= zones_to_cpy;
+ zno += zones_to_cpy;
+
+ if (!nr_zones)
+ break;
+ }
+}
+
+blk_status_t null_zone_report(struct nullb *nullb,
+ struct nullb_cmd *cmd)
+{
+ struct nullb_device *dev = nullb->dev;
+ struct request *rq = cmd->rq;
+ unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
+ unsigned int nr_zones = dev->nr_zones - zno;
+ unsigned int max_zones = (blk_rq_bytes(rq) /
+ sizeof(struct blk_zone)) - 1;
+
+ nr_zones = min_t(unsigned int, nr_zones, max_zones);
+
+ null_zone_fill_rq(nullb->dev, rq, zno, nr_zones);
+
+ return BLK_STS_OK;
+}
+
+void null_zone_write(struct nullb_cmd *cmd)
+{
+ struct nullb_device *dev = cmd->nq->dev;
+ struct request *rq = cmd->rq;
+ sector_t sector = blk_rq_pos(rq);
+ unsigned int rq_sectors = blk_rq_sectors(rq);
+ unsigned int zno = null_zone_no(dev, sector);
+ struct blk_zone *zone = &dev->zones[zno];
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_FULL:
+ /* Cannot write to a full zone */
+ cmd->error = BLK_STS_IOERR;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_IMP_OPEN:
+ /* Writes must be at the write pointer position */
+ if (blk_rq_pos(rq) != zone->wp) {
+ cmd->error = BLK_STS_IOERR;
+ break;
+ }
+
+ if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ zone->wp += rq_sectors;
+ if (zone->wp == zone->start + zone->len)
+ zone->cond = BLK_ZONE_COND_FULL;
+ break;
+ default:
+ /* Invalid zone condition */
+ cmd->error = BLK_STS_IOERR;
+ break;
+ }
+}
+
+void null_zone_reset(struct nullb_cmd *cmd)
+{
+ struct nullb_device *dev = cmd->nq->dev;
+ struct request *rq = cmd->rq;
+ unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
+ struct blk_zone *zone = &dev->zones[zno];
+
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+}
--
2.11.0


2018-07-06 17:42:53

by Matias Bjørling

Subject: [PATCH 1/2] null_blk: move shared definitions to header file

From: Matias Bjørling <[email protected]>

Split the null_blk device driver, such that it can prepare for
zoned block interface support.

Signed-off-by: Matias Bjørling <[email protected]>
---
drivers/block/null_blk.c | 76 +--------------------------------------------
drivers/block/null_blk.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 81 insertions(+), 75 deletions(-)
create mode 100644 drivers/block/null_blk.h

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 042c778e5a4e..cd4b0849d3b4 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -7,14 +7,8 @@
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
-#include <linux/blkdev.h>
#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/blk-mq.h>
-#include <linux/hrtimer.h>
-#include <linux/configfs.h>
-#include <linux/badblocks.h>
-#include <linux/fault-inject.h>
+#include "null_blk.h"

#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
@@ -35,28 +29,6 @@ static inline u64 mb_per_tick(int mbps)
return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
}

-struct nullb_cmd {
- struct list_head list;
- struct llist_node ll_list;
- struct __call_single_data csd;
- struct request *rq;
- struct bio *bio;
- unsigned int tag;
- blk_status_t error;
- struct nullb_queue *nq;
- struct hrtimer timer;
-};
-
-struct nullb_queue {
- unsigned long *tag_map;
- wait_queue_head_t wait;
- unsigned int queue_depth;
- struct nullb_device *dev;
- unsigned int requeue_selection;
-
- struct nullb_cmd *cmds;
-};
-
/*
* Status flags for nullb_device.
*
@@ -92,52 +64,6 @@ struct nullb_page {
#define NULLB_PAGE_LOCK (MAP_SZ - 1)
#define NULLB_PAGE_FREE (MAP_SZ - 2)

-struct nullb_device {
- struct nullb *nullb;
- struct config_item item;
- struct radix_tree_root data; /* data stored in the disk */
- struct radix_tree_root cache; /* disk cache data */
- unsigned long flags; /* device flags */
- unsigned int curr_cache;
- struct badblocks badblocks;
-
- unsigned long size; /* device size in MB */
- unsigned long completion_nsec; /* time in ns to complete a request */
- unsigned long cache_size; /* disk cache size in MB */
- unsigned int submit_queues; /* number of submission queues */
- unsigned int home_node; /* home node for the device */
- unsigned int queue_mode; /* block interface */
- unsigned int blocksize; /* block size */
- unsigned int irqmode; /* IRQ completion handler */
- unsigned int hw_queue_depth; /* queue depth */
- unsigned int index; /* index of the disk, only valid with a disk */
- unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
- bool blocking; /* blocking blk-mq device */
- bool use_per_node_hctx; /* use per-node allocation for hardware context */
- bool power; /* power on/off the device */
- bool memory_backed; /* if data is stored in memory */
- bool discard; /* if support discard */
-};
-
-struct nullb {
- struct nullb_device *dev;
- struct list_head list;
- unsigned int index;
- struct request_queue *q;
- struct gendisk *disk;
- struct blk_mq_tag_set *tag_set;
- struct blk_mq_tag_set __tag_set;
- unsigned int queue_depth;
- atomic_long_t cur_bytes;
- struct hrtimer bw_timer;
- unsigned long cache_flush_pos;
- spinlock_t lock;
-
- struct nullb_queue *queues;
- unsigned int nr_queues;
- char disk_name[DISK_NAME_LEN];
-};
-
static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
new file mode 100644
index 000000000000..d82c5501806d
--- /dev/null
+++ b/drivers/block/null_blk.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BLK_NULL_BLK_H
+#define __BLK_NULL_BLK_H
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/hrtimer.h>
+#include <linux/configfs.h>
+#include <linux/badblocks.h>
+#include <linux/fault-inject.h>
+
+struct nullb_cmd {
+ struct list_head list;
+ struct llist_node ll_list;
+ struct __call_single_data csd;
+ struct request *rq;
+ struct bio *bio;
+ unsigned int tag;
+ blk_status_t error;
+ struct nullb_queue *nq;
+ struct hrtimer timer;
+};
+
+struct nullb_queue {
+ unsigned long *tag_map;
+ wait_queue_head_t wait;
+ unsigned int queue_depth;
+ struct nullb_device *dev;
+ unsigned int requeue_selection;
+
+ struct nullb_cmd *cmds;
+};
+
+struct nullb_device {
+ struct nullb *nullb;
+ struct config_item item;
+ struct radix_tree_root data; /* data stored in the disk */
+ struct radix_tree_root cache; /* disk cache data */
+ unsigned long flags; /* device flags */
+ unsigned int curr_cache;
+ struct badblocks badblocks;
+
+ unsigned long size; /* device size in MB */
+ unsigned long completion_nsec; /* time in ns to complete a request */
+ unsigned long cache_size; /* disk cache size in MB */
+ unsigned int submit_queues; /* number of submission queues */
+ unsigned int home_node; /* home node for the device */
+ unsigned int queue_mode; /* block interface */
+ unsigned int blocksize; /* block size */
+ unsigned int irqmode; /* IRQ completion handler */
+ unsigned int hw_queue_depth; /* queue depth */
+ unsigned int index; /* index of the disk, only valid with a disk */
+ unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
+ bool blocking; /* blocking blk-mq device */
+ bool use_per_node_hctx; /* use per-node allocation for hardware context */
+ bool power; /* power on/off the device */
+ bool memory_backed; /* if data is stored in memory */
+ bool discard; /* if support discard */
+};
+
+struct nullb {
+ struct nullb_device *dev;
+ struct list_head list;
+ unsigned int index;
+ struct request_queue *q;
+ struct gendisk *disk;
+ struct blk_mq_tag_set *tag_set;
+ struct blk_mq_tag_set __tag_set;
+ unsigned int queue_depth;
+ atomic_long_t cur_bytes;
+ struct hrtimer bw_timer;
+ unsigned long cache_flush_pos;
+ spinlock_t lock;
+
+ struct nullb_queue *queues;
+ unsigned int nr_queues;
+ char disk_name[DISK_NAME_LEN];
+};
+#endif /* __NULL_BLK_H */
--
2.11.0


2018-07-06 17:47:28

by Laurence Oberman

Subject: Re: [PATCH 0/2] null_blk: zone support

On Fri, 2018-07-06 at 19:38 +0200, Matias Bjørling wrote:
> This series adds support for exposing a zone block device using the
> null_blk device driver.
>
> The first patch moves core null_blk data structures to a shared
> header
> file. The second implements the actual zone support. The patchset
> adds
> two new options. One to enable the zone interface, and another to
> define the size of the zones to expose.
>
> Thanks,
> Matias
>
> Matias Bjørling (2):
>   null_blk: move shared definitions to header file
>   null_blk: add zone support
>
>  Documentation/block/null_blk.txt |   7 ++
>  drivers/block/Makefile           |   5 +-
>  drivers/block/null_blk.c         | 124 ++++++++++++-----------------
> ---
>  drivers/block/null_blk.h         | 108 ++++++++++++++++++++++++++++
>  drivers/block/null_blk_zoned.c   | 149
> +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 315 insertions(+), 78 deletions(-)
>  create mode 100644 drivers/block/null_blk.h
>  create mode 100644 drivers/block/null_blk_zoned.c
>

Thank you for this, it will be very useful.
I will test it to add value.
I don't know the code well enough to review it.
Regards,
Laurence

2018-07-07 02:55:33

by Jens Axboe

Subject: Re: [PATCH 0/2] null_blk: zone support

On 7/6/18 11:38 AM, Matias Bjørling wrote:
> This series adds support for exposing a zone block device using the
> null_blk device driver.
>
> The first patch moves core null_blk data structures to a shared header
> file. The second implements the actual zone support. The patchset adds
> two new options. One to enable the zone interface, and another to
> define the size of the zones to expose.

Thanks, should be useful for verifying write pointer violations.
Applied for 4.19.

--
Jens Axboe


2018-07-09 07:55:57

by Matias Bjørling

Subject: Re: [PATCH 0/2] null_blk: zone support

On 07/06/2018 07:45 PM, Laurence Oberman wrote:
> On Fri, 2018-07-06 at 19:38 +0200, Matias Bjørling wrote:
>> This series adds support for exposing a zone block device using the
>> null_blk device driver.
>>
>> The first patch moves core null_blk data structures to a shared
>> header
>> file. The second implements the actual zone support. The patchset
>> adds
>> two new options. One to enable the zone interface, and another to
>> define the size of the zones to expose.
>>
>> Thanks,
>> Matias
>>
>> Matias Bjørling (2):
>>   null_blk: move shared definitions to header file
>>   null_blk: add zone support
>>
>>  Documentation/block/null_blk.txt |   7 ++
>>  drivers/block/Makefile           |   5 +-
>>  drivers/block/null_blk.c         | 124 ++++++++++++-----------------
>> ---
>>  drivers/block/null_blk.h         | 108 ++++++++++++++++++++++++++++
>>  drivers/block/null_blk_zoned.c   | 149
>> +++++++++++++++++++++++++++++++++++++++
>>  5 files changed, 315 insertions(+), 78 deletions(-)
>>  create mode 100644 drivers/block/null_blk.h
>>  create mode 100644 drivers/block/null_blk_zoned.c
>>
>
> Thank you for this, it will be very useful.
> I will test it to add value.
> I don't know the code well enough to review it.
> Regards,
> Laurence
>

Great.

For fio, you can use the zone support here:

https://github.com/bvanassche/fio

It is in the process of being upstreamed.

Also, you'll need the latest libzbc/util-linux (blkzone) if you want to
report zones. A bug around drive detection was recently fixed.

https://github.com/hgst/libzbc
https://github.com/karelzak/util-linux


2018-07-09 16:35:46

by Jens Axboe

Subject: Re: [PATCH 0/2] null_blk: zone support

On 7/9/18 1:54 AM, Matias Bjørling wrote:
> For fio, you can use the zone support here:
>
> https://github.com/bvanassche/fio
>
> It is in the process of being upstreamed.

In the spirit of making some progress on this, I just don't like how
it's done. For example, it should not be necessary to adjust what
comes out of the block generator, instead the block generator should
be told to do what we need on zbc. This is a key concept. The workload
should be defined as such that it works for zoned devices.

The trim as zone resets seems odd. What happens if you end up with a
zoned flash device in the future?

The support code looks fine.

--
Jens Axboe


2018-07-10 00:06:55

by Bart Van Assche

Subject: Re: [PATCH 0/2] null_blk: zone support

On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote:
> On 7/9/18 1:54 AM, Matias Bjørling wrote:
> > For fio, you can use the zone support here:
> >
> > https://github.com/bvanassche/fio
> >
> > It is in the process of being upstreamed.
>
> In the spirit of making some progress on this, I just don't like how
> it's done. For example, it should not be necessary to adjust what
> comes out of the block generator, instead the block generator should
> be told to do what we need on zbc. This is a key concept. The workload
> should be defined as such that it works for zoned devices.

Hello Jens,

How would you like to see block generation work? I don't see an alternative
for random I/O other than starting from the output of a random generator and
translating that output into something that is appropriate for a zoned block
device. Random reads must happen below the zone pointer if fio is configured
to read below the zone pointer. Random writes must happen at the write
pointer. The only way I see to implement such an I/O pattern is to start
from the output of a random generator and to adjust the output of that
random generator. However, I don't have a strong opinion whether adjusting
the output of a random generator should happen by the caller of
get_next_buflen() or inside get_next_buflen(). Or is your concern perhaps
that the current approach interferes with fio job options like bs_unaligned?

> The trim as zone resets seems odd. What happens if you end up with a
> zoned flash device in the future?

We can leave out the code that translates trim into zone resets from fio
and discuss later how to handle this. One possibility is to modify the sd
driver such that it translates REQ_OP_DISCARD into zone resets when
appropriate. There may be better alternatives.

Bart.




2018-07-10 14:50:58

by Jens Axboe

Subject: Re: [PATCH 0/2] null_blk: zone support

On 7/9/18 6:05 PM, Bart Van Assche wrote:
> On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote:
>> On 7/9/18 1:54 AM, Matias Bjørling wrote:
>>> For fio, you can use the zone support here:
>>>
>>> https://github.com/bvanassche/fio
>>>
>>> It is in the process of being upstreamed.
>>
>> In the spirit of making some progress on this, I just don't like how
>> it's done. For example, it should not be necessary to adjust what
>> comes out of the block generator, instead the block generator should
>> be told to do what we need on zbc. This is a key concept. The workload
>> should be defined as such that it works for zoned devices.
>
> Hello Jens,
>
> How would you like to see block generation work? I don't see an
> alternative for random I/O other than starting from the output of a random
> generator and translating that output into something that is
> appropriate for a zoned block device. Random reads must happen below
> the zone pointer if fio is configured to read below the zone pointer.
> Random writes must happen at the write pointer. The only way I see to
> implement such an I/O pattern is to start from the output of a random
> generator and to adjust the output of that random generator. However,
> I don't have a strong opinion whether adjusting the output of a random
> generator should happen by the caller of get_next_buflen() or inside
> get_next_buflen(). Or is your concern perhaps that the current
> approach interferes with fio job options like bs_unaligned?

The main issue I have with that approach is that the core of fio is
generating the IO patterns, and then you are just changing them as you
see fit. This means that the workload definition and the resulting IO
operations are no longer matched up, since they now also depend on what
you are running on. If I take one workload and run it on a zoned drive,
and then run it on a non-zoned drive, I can't compare the results at
all. This is a showstopper.

There should be no adjusting of the output, rather it should be possible
to write zoned friendly job definitions. It should be possible to run
the same job on a non-zoned drive, and vice versa, and the resulting IO
patterns must be the same.

Fio already has some notion of zones. Maybe that could be extended to
hard zones, and some control of open zones, and patterns within those
zones?

--
Jens Axboe


2018-07-10 17:02:24

by Bart Van Assche

Subject: Re: [PATCH 0/2] null_blk: zone support

On Tue, 2018-07-10 at 08:46 -0600, Jens Axboe wrote:
> On 7/9/18 6:05 PM, Bart Van Assche wrote:
> > On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote:
> > > In the spirit of making some progress on this, I just don't like how
> > > it's done. For example, it should not be necessary to adjust what
> > > comes out of the block generator, instead the block generator should
> > > be told to do what we need on zbc. This is a key concept. The workload
> > > should be defined as such that it works for zoned devices.
> >
> > How would you like to see block generation work? I don't see an
> > alternative for random I/O other starting from the output of a random
> > generator and translating that output into something that is
> > appropriate for a zoned block device. Random reads must happen below
> > the zone pointer if fio is configured to read below the zone pointer.
> > Random writes must happen at the write pointer. The only way I see to
> > implement such an I/O pattern is to start from the output of a random
> > generator and to adjust the output of that random generator. However,
> > I don't have a strong opinion whether adjusting the output of a random
> > generator should happen by the caller of get_next_buflen() or inside
> > get_next_buflen(). Or is your concern perhaps that the current
> > approach interferes with fio job options like bs_unaligned?
>
> The main issue I have with that approach is that the core of fio is
> generating the IO patterns, and then you are just changing them as you
> see fit. This means that the workload definition and the resulting IO
> operations are no longer matched up, since they now also depend on what
> you are running on. If I take one workload and run it on a zoned drive,
> and then run it on a non-zoned drive, I can't compare the results at
> all. This is a showstopper.
>
> There should be no adjusting of the output, rather it should be possible
> to write zoned friendly job definitions. It should be possible to run
> the same job on a non-zoned drive, and vice versa, and the resulting IO
> patterns must be the same.
>
> Fio already has some notion of zones. Maybe that could be extended to
> hard zones, and some control of open zones, and patterns within those
> zones?

Hello Jens,

How about adding a job option that makes it possible to use the zoned
block device (ZBD) I/O pattern on non-ZBD devices, requiring that the
zone size is set explicitly for non-ZBD devices and maintaining a write
pointer not only when performing I/O to a ZBD device but also if a
ZBD-style I/O pattern is applied to a non-ZBD disk? This should allow to
apply exactly the same workload to a non-ZBD disk as to a ZBD disk.

What I derived from the fio source code is as follows (please correct me
if I got anything wrong):
* The purpose of the zonesize, zonerange and zoneskip job options is to
limit the I/O range to a single zone with size "zonesize". The I/O
pattern for zoned block devices is different: I/O happens in multiple
zones simultaneously. The number of zones to which I/O happens is
called the number of open zones.
* The purpose of the random_distribution=zoned{_abs} job option is to
allow the user to skew a uniform random distribution. This is a different
workload pattern from the typical pattern for ZBD drives; a rough sketch of
both option families follows below.
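
The values below are made up, the /dev/nullb0 path is only an example, and the
comments reflect the descriptions above rather than the fio documentation:

; strided I/O using the long-standing zone options:
; 256 MB zones, 64 MB of sequential writes per zone, then skip to the next zone
[strided-zones]
filename=/dev/nullb0
direct=1
rw=write
bs=64k
zonerange=256m
zonesize=64m
zoneskip=192m

; skew a uniform random read distribution: 60% of accesses in the first 10%
; of the range, 30% in the next 20%, 10% in the remaining 70%
[skewed-random]
; run after the first job completes
stonewall
filename=/dev/nullb0
direct=1
rw=randread
bs=4k
random_distribution=zoned:60/10:30/20:10/70

(Later fio versions name the first behavior zonemode=strided.)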

Thanks,

Bart.





2018-07-10 18:47:16

by Jens Axboe

Subject: Re: [PATCH 0/2] null_blk: zone support

On 7/10/18 10:47 AM, Bart Van Assche wrote:
> On Tue, 2018-07-10 at 08:46 -0600, Jens Axboe wrote:
>> On 7/9/18 6:05 PM, Bart Van Assche wrote:
>>> On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote:
>>>> In the spirit of making some progress on this, I just don't like how
>>>> it's done. For example, it should not be necessary to adjust what
>>>> comes out of the block generator, instead the block generator should
>>>> be told to do what we need on zbc. This is a key concept. The workload
>>>> should be defined as such that it works for zoned devices.
>>>
>>> How would you like to see block generation work? I don't see an
>>> alternative for random I/O other than starting from the output of a random
>>> generator and translating that output into something that is
>>> appropriate for a zoned block device. Random reads must happen below
>>> the zone pointer if fio is configured to read below the zone pointer.
>>> Random writes must happen at the write pointer. The only way I see to
>>> implement such an I/O pattern is to start from the output of a random
>>> generator and to adjust the output of that random generator. However,
>>> I don't have a strong opinion whether adjusting the output of a random
>>> generator should happen by the caller of get_next_buflen() or inside
>>> get_next_buflen(). Or is your concern perhaps that the current
>>> approach interferes with fio job options like bs_unaligned?
>>
>> The main issue I have with that approach is that the core of fio is
>> generating the IO patterns, and then you are just changing them as you
>> see fit. This means that the workload definition and the resulting IO
>> operations are no longer matched up, since they now also depend on what
>> you are running on. If I take one workload and run it on a zoned drive,
>> and then run it on a non-zoned drive, I can't compare the results at
>> all. This is a showstopper.
>>
>> There should be no adjusting of the output, rather it should be possible
>> to write zoned friendly job definitions. It should be possible to run
>> the same job on a non-zoned drive, and vice versa, and the resulting IO
>> patterns must be the same.
>>
>> Fio already has some notion of zones. Maybe that could be extended to
>> hard zones, and some control of open zones, and patterns within those
>> zones?
>
> Hello Jens,
>
> How about adding a job option that makes it possible to use the zoned
> block device (ZBD) I/O pattern on non-ZBD devices, requiring that the
> zone size is set explicitly for non-ZBD devices and maintaining a write
> pointer not only when performing I/O to a ZBD device but also if a
> ZBD-style I/O pattern is applied to a non-ZBD disk? This should allow to
> apply exactly the same workload to a non-ZBD disk as to a ZBD disk.

It just doesn't make any sense to me. The source of truth is the
generator of the IO, which does exactly what it is told by the job
definition. You're proposing to mangle that somehow, to fit some
restrictions that the underlying device has. That very concept is
foreign, and adding an option to be able to do the same on some other
device is misleading. The difference between the job file and the
workload run can be huge. Consider something really basic:

[randwrites]
bs=4k
rw=randwrite

which would be 100% random 4k writes. If I run this on a zoned device,
then that'd turn into 100% sequential writes. That makes no sense at
all. And if I run it on a different devices, I'd get 100% random writes.
Except if I set some magic option. Sorry, but that concept is just too
ugly to live, it makes zero sense. Put down your zoned hat for a bit and
think about it.
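
To make the contrast concrete (an illustrative sketch only, assuming 256 MB
zones that start out empty and a /dev/nullb0 test device): a job that is
sequential within zones by construction uses nothing but long-standing
options, so it runs the same way on a zoned and on a conventional drive.

[global]
filename=/dev/nullb0
direct=1
rw=write
bs=64k
size=256m

; each job writes exactly one zone, sequentially from its start;
; the two jobs running concurrently correspond to two open zones
[zone0]
offset=0

[zone1]
offset=256m

Adding more [zoneN] sections, or deliberately skipping offsets, then becomes
part of the workload definition itself rather than a device-dependent
adjustment.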

> What I derived from the fio source code is as follows (please correct me
> if I got anything wrong):
> * The purpose of the zonesize, zonerange and zoneskip job options is to
> limit the I/O range to a single zone with size "zonesize". The I/O
> pattern for zoned block devices is different: I/O happens in multiple
> zones simultaneously. The number of zones to which I/O happens is
> called the number of open zones.

The only difference is that fio currently only has one zone active. When
it finishes one, it goes to the next. See my above suggestion on adding
the notion of open zones, which would extend this to more than 1.

> * The purpose of the random_distribution=zoned{_abs} job option is to
> allow the user to skew a uniform random distribution. This is a different
> workload pattern from the typical pattern for ZBD drives.

Fio's zones were never intended to be for zoned devices. Don't get hung
up on current use cases, think about what kind of definitions would make
sense for zoned devices.

--
Jens Axboe


2018-07-10 18:50:27

by Bart Van Assche

Subject: Re: [PATCH 0/2] null_blk: zone support

On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
> The difference between the job file and the
> workload run can be huge. Consider something really basic:
>
> [randwrites]
> bs=4k
> rw=randwrite
>
> which would be 100% random 4k writes. If I run this on a zoned device,
> then that'd turn into 100% sequential writes.

That's not correct. The ZBD code in the github pull request serializes writes
per zone, not globally.

> > What I derived from the fio source code is as follows (please correct me
> > if I got anything wrong):
> > * The purpose of the zonesize, zonerange and zoneskip job options is to
> > limit the I/O range to a single zone with size "zonesize". The I/O
> > pattern for zoned block devices is different: I/O happens in multiple
> > zones simultaneously. The number of zones to which I/O happens is
> > called the number of open zones.
>
> The only difference is that fio currently only has one zone active. When
> it finishes one, it goes to the next. See my above suggestion on adding
> the notion of open zones, which would extend this to more than 1.

Thanks, I will look into this.

Bart.





2018-07-10 18:53:30

by Jens Axboe

Subject: Re: [PATCH 0/2] null_blk: zone support

On 7/10/18 12:49 PM, Bart Van Assche wrote:
> On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
>> The difference between the job file and the
>> workload run can be huge. Consider something really basic:
>>
>> [randwrites]
>> bs=4k
>> rw=randwrite
>>
>> which would be 100% random 4k writes. If I run this on a zoned device,
>> then that'd turn into 100% sequential writes.
>
> That's not correct. The ZBD code in the github pull request serializes writes
> per zone, not globally.

That's a totally minor detail. If all my random writes fall within a single
zone, then they'd be 100% sequential. For N open zones, you'd be 100%
sequential within the zone. The point is that the workload as defined and
the workload as run are two totally different things, and THAT is the
problem.

--
Jens Axboe


2018-08-09 20:53:18

by Bart Van Assche

Subject: Re: Zoned block device support for fio (was: [PATCH 0/2] null_blk: zone support)

On Tue, 2018-07-10 at 12:51 -0600, Jens Axboe wrote:
> On 7/10/18 12:49 PM, Bart Van Assche wrote:
> > On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
> > > The difference between the job file and the
> > > workload run can be huge. Consider something really basic:
> > >
> > > [randwrites]
> > > bs=4k
> > > rw=randwrite
> > >
> > > which would be 100% random 4k writes. If I run this on a zoned device,
> > > then that'd turn into 100% sequential writes.
> >
> > That's not correct. The ZBD code in the github pull request serializes writes
> > per zone, not globally.
>
> That's a totally minor detail. If all my random writes fall within a single
> zone, then they'd be 100% sequential. For N open zones, you'd be 100%
> sequential within the zone. The point is that the workload as defined and
> the workload as run are two totally different things, and THAT is the
> problem.

Hello Jens,

What you proposed in a previous e-mail, namely to use the existing fio zone
concept for zoned block devices, sounds interesting to me. This is something I
will definitely look further into. This will help make a given workload that
is suited for zoned block devices behave (almost) identically when run
against a regular block device. Since fio users expect that no zones are
reset before e.g. a random write test is started, there will always be a small
difference in behavior between a workload run against a zoned block device and
a workload run against a regular device if some zones already contain data.

It's not clear to me how close you want the behavior of fio to be for zoned
and regular block devices. Do you e.g. want me to introduce a new I/O pattern
(--rw=...) that causes fio to write sequentially inside a zone and for which
zones are selected randomly? I don't see any other approach that would ensure
that the same workload definition behaves identically against zoned and
regular block devices.

Thanks,

Bart.


2018-08-09 21:04:26

by Jens Axboe

Subject: Re: Zoned block device support for fio

On 8/9/18 2:51 PM, Bart Van Assche wrote:
> On Tue, 2018-07-10 at 12:51 -0600, Jens Axboe wrote:
>> On 7/10/18 12:49 PM, Bart Van Assche wrote:
>>> On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
>>>> The difference between the job file and the
>>>> workload run can be huge. Consider something really basic:
>>>>
>>>> [randwrites]
>>>> bs=4k
>>>> rw=randwrite
>>>>
>>>> which would be 100% random 4k writes. If I run this on a zoned device,
>>>> then that'd turn into 100% sequential writes.
>>>
>>> That's not correct. The ZBD code in the github pull request serializes writes
>>> per zone, not globally.
>>
>> That's a totally minor detail. If all my random writes fall within a single
>> zone, then they'd be 100% sequential. For N open zones, you'd be 100%
>> sequential within the zone. The point is that the workload as defined and
>> the workload as run are two totally different things, and THAT is the
>> problem.
>
> Hello Jens,
>
> What you proposed in a previous e-mail, namely to use the existing fio zone
> concept for zoned block devices, sounds interesting to me. This is something I
> will definitely look further into. This will help make a given workload that
> is suited for zoned block devices behave (almost) identically when run
> against a regular block device. Since fio users expect that no zones are
> reset before e.g. a random write test is started, there will always be a small
> difference in behavior between a workload run against a zoned block device and
> a workload run against a regular device if some zones already contain data.
>
> It's not clear to me how close you want the behavior of fio to be for zoned
> and regular block devices. Do you e.g. want me to introduce a new I/O pattern
> (--rw=...) that causes fio to write sequentially inside a zone and for which
> zones are selected randomly? I don't see any other approach that would ensure
> that the same workload definition behaves identically against zoned
> and regular block devices.

Yes exactly. Basically come up with something that just maps fio zones on
top of SMR zones. Then come up with something that allows you to have
sequential write IO within a zone, and something that allows you to decide
how many zones to use, when to skip between zones, etc.

--
Jens Axboe


2018-08-15 18:09:03

by Bart Van Assche

Subject: Re: Zoned block device support for fio

On Thu, 2018-08-09 at 15:03 -0600, Jens Axboe wrote:
> Yes exactly. Basically come up with something that just maps fio zones on
> top of SMR zones. Then come up with something that allows you to have
> sequential write IO within a zone, and something that allows you to decide
> how many zones to use, when to skip between zones, etc.

Hello Jens,

SMR support for fio has been reworked. The reworked patches are available at
https://github.com/axboe/fio/pull/585. The following two test scripts make it
possible to verify whether the fio behavior is identical for SMR and non-SMR
storage media:
* t/zbd/run-tests-against-regular-nullb
* t/zbd/run-tests-against-zoned-nullb

Thanks,

Bart.
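
For completeness, a sketch of the kind of job the reworked support is aimed
at, assuming the option names from the pull request (zonemode=zbd,
max_open_zones) are the ones that landed in upstream fio; the device path and
values are illustrative only:

[zbd-randwrite]
filename=/dev/nullb0
direct=1
zonemode=zbd
rw=randwrite
bs=64k
max_open_zones=4

With zonemode=zbd the random writes are issued at the write pointer of the
zones they fall into, i.e. the sequential-within-a-zone, randomly-selected-zone
pattern discussed earlier in the thread.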