From: Andreas Hindborg <[email protected]>
Hi All,
This patch set adds zoned storage support to `ublk`. The first two patches do
some house cleaning in preparation for the last two patches. The third patch
adds support for report_zones and the following operations:
- REQ_OP_ZONE_OPEN
- REQ_OP_ZONE_CLOSE
- REQ_OP_ZONE_FINISH
- REQ_OP_ZONE_RESET
The last patch adds support for REQ_OP_ZONE_APPEND.
v3 [2] -> v4 changes:
- Split up v3 patches
- Add zone append support
- Change order of variables in `ublk_report_zones`
Read/write and zone operations are tested with zenfs [3].
The zone append path is tested with fio -> zonefs -> ublk -> null_blk.
The implementation of zone append requires the ublk user copy feature, and therefore
the series is based on branch for-next (6afa337a3789) of [4].
[1] https://github.com/metaspace/ubdsrv/commit/7de0d901c329fde7dc5a2e998952dd88bf5e668b
[2] https://lore.kernel.org/linux-block/[email protected]/
[3] https://github.com/westerndigitalcorporation/zenfs
[4] https://git.kernel.dk/linux.git
Andreas Hindborg (4):
ublk: change ublk IO command defines to enum
ublk: move types to shared header file
ublk: enable zoned storage support
ublk: add zone append
MAINTAINERS | 2 +
drivers/block/Kconfig | 4 +
drivers/block/Makefile | 4 +-
drivers/block/ublk_drv-zoned.c | 155 +++++++++++++++++++++++++++++++++
drivers/block/ublk_drv.c | 150 +++++++++++++++++++------------
drivers/block/ublk_drv.h | 71 +++++++++++++++
include/uapi/linux/ublk_cmd.h | 38 ++++++--
7 files changed, 363 insertions(+), 61 deletions(-)
create mode 100644 drivers/block/ublk_drv-zoned.c
create mode 100644 drivers/block/ublk_drv.h
base-commit: 3261ea42710e9665c9151006049411bd23b5411f
--
2.41.0
From: Andreas Hindborg <[email protected]>
Add zone append feature to ublk. This feature uses the `addr` field of `struct
ublksrv_io_cmd`. Therefore ublk must be used with the user copy
feature (UBLK_F_USER_COPY) for zone append to be available. Without this
feature, ublk will fail zone append requests.
Suggested-by: Ming Lei <[email protected]>
Signed-off-by: Andreas Hindborg <[email protected]>
---
drivers/block/ublk_drv-zoned.c | 11 +++++++++
drivers/block/ublk_drv.c | 43 ++++++++++++++++++++++++++++++----
drivers/block/ublk_drv.h | 1 +
include/uapi/linux/ublk_cmd.h | 3 ++-
4 files changed, 52 insertions(+), 6 deletions(-)
diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
index ea86bf4b3681..007e8fc7ff25 100644
--- a/drivers/block/ublk_drv-zoned.c
+++ b/drivers/block/ublk_drv-zoned.c
@@ -16,6 +16,16 @@ void ublk_set_nr_zones(struct ublk_device *ub)
ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
}
+int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
+{
+ const struct ublk_param_zoned *p = &ub->params.zoned;
+
+ if (! p->max_zone_append_sectors)
+ return -EINVAL;
+
+ return 0;
+}
+
void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
const struct ublk_param_zoned *p = &ub->params.zoned;
@@ -23,6 +33,7 @@ void ublk_dev_param_zoned_apply(struct ublk_device *ub)
if (ub->dev_info.flags & UBLK_F_ZONED) {
disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
+ blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
}
}
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 88fa39853c61..6a949669b47e 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -107,6 +107,11 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+/*
+ * Set when IO is Zone Append
+ */
+#define UBLK_IO_FLAG_ZONE_APPEND 0x10
+
struct ublk_io {
/* userspace buffer address from io cmd */
__u64 addr;
@@ -230,6 +235,8 @@ static void ublk_dev_param_discard_apply(struct ublk_device *ub)
static int ublk_validate_params(const struct ublk_device *ub)
{
+ int ret = 0;
+
/* basic param is the only one which must be set */
if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
const struct ublk_param_basic *p = &ub->params.basic;
@@ -260,6 +267,13 @@ static int ublk_validate_params(const struct ublk_device *ub)
if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
return -EINVAL;
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ (ub->params.types & UBLK_PARAM_TYPE_ZONED)) {
+ ret = ublk_dev_param_zoned_validate(ub);
+ if (ret)
+ return ret;
+ }
+
return 0;
}
@@ -690,6 +704,11 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
return BLK_STS_IOERR;
}
case REQ_OP_ZONE_APPEND:
+ if (!(ubq->dev->dev_info.flags & UBLK_F_USER_COPY))
+ return BLK_STS_IOERR;
+ ublk_op = UBLK_IO_OP_ZONE_APPEND;
+ io->flags |= UBLK_IO_FLAG_ZONE_APPEND;
+ break;
case REQ_OP_ZONE_RESET_ALL:
case REQ_OP_DRV_OUT:
/* We do not support zone append or reset_all yet */
@@ -1112,6 +1131,12 @@ static void ublk_commit_completion(struct ublk_device *ub,
/* find the io request and complete */
req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+ if (io->flags & UBLK_IO_FLAG_ZONE_APPEND)
+ req->__sector = ub_cmd->addr;
+ io->flags &= ~UBLK_IO_FLAG_ZONE_APPEND;
+ }
+
if (req && likely(!blk_should_fake_timeout(req->q)))
ublk_put_req_ref(ubq, req);
}
@@ -1411,7 +1436,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
- if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
+ if (ublk_support_user_copy(ubq) &&
+ !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
ret = -EINVAL;
goto out;
}
@@ -1534,11 +1560,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
int ubuf_dir)
{
/* copy ubuf to request pages */
- if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
+ if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
+ ubuf_dir == ITER_SOURCE)
return true;
/* copy request pages to ubuf */
- if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
+ if ((req_op(req) == REQ_OP_WRITE ||
+ req_op(req) == REQ_OP_ZONE_APPEND) &&
+ ubuf_dir == ITER_DEST)
return true;
return false;
@@ -1867,6 +1896,12 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
ub->dev_info.ublksrv_pid = ublksrv_pid;
ub->ub_disk = disk;
+ ub->dev_info.state = UBLK_S_DEV_LIVE;
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ub->dev_info.flags & UBLK_F_ZONED) {
+ disk_set_zoned(disk, BLK_ZONED_HM);
+ }
+
ret = ublk_apply_params(ub);
if (ret)
goto out_put_disk;
@@ -1877,7 +1912,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
ub->dev_info.flags & UBLK_F_ZONED) {
- disk_set_zoned(disk, BLK_ZONED_HM);
blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
ret = ublk_revalidate_disk_zones(disk);
if (ret)
@@ -1885,7 +1919,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
}
get_device(&ub->cdev_dev);
- ub->dev_info.state = UBLK_S_DEV_LIVE;
ret = add_disk(disk);
if (ret) {
/*
diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
index 7242430fd6b9..f55e1c25531d 100644
--- a/drivers/block/ublk_drv.h
+++ b/drivers/block/ublk_drv.h
@@ -56,6 +56,7 @@ struct ublk_rq_data {
};
void ublk_set_nr_zones(struct ublk_device *ub);
+int ublk_dev_param_zoned_validate(const struct ublk_device *ub);
void ublk_dev_param_zoned_apply(struct ublk_device *ub);
int ublk_revalidate_disk_zones(struct gendisk *disk);
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 436525afffe8..4b6ad0b28598 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -351,7 +351,8 @@ struct ublk_param_devt {
struct ublk_param_zoned {
__u32 max_open_zones;
__u32 max_active_zones;
- __u8 reserved[24];
+ __u32 max_zone_append_sectors;
+ __u8 reserved[20];
};
struct ublk_params {
--
2.41.0
From: Andreas Hindborg <[email protected]>
Add zoned storage support to ublk: report_zones and operations:
- REQ_OP_ZONE_OPEN
- REQ_OP_ZONE_CLOSE
- REQ_OP_ZONE_FINISH
- REQ_OP_ZONE_RESET
Note: This commit changes the ublk kernel module name from `ublk_drv.ko` to
`ublk.ko` in order to link multiple translation units into the module.
Signed-off-by: Andreas Hindborg <[email protected]>
---
MAINTAINERS | 1 +
drivers/block/Kconfig | 4 +
drivers/block/Makefile | 4 +-
drivers/block/ublk_drv-zoned.c | 144 +++++++++++++++++++++++++++++++++
drivers/block/ublk_drv.c | 64 +++++++++++++--
drivers/block/ublk_drv.h | 15 ++++
include/uapi/linux/ublk_cmd.h | 14 ++++
7 files changed, 239 insertions(+), 7 deletions(-)
create mode 100644 drivers/block/ublk_drv-zoned.c
diff --git a/MAINTAINERS b/MAINTAINERS
index ace71c90751c..db8a8deb5926 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21555,6 +21555,7 @@ S: Maintained
F: Documentation/block/ublk.rst
F: drivers/block/ublk_drv.c
F: drivers/block/ublk_drv.h
+F: drivers/block/ublk_drv-zoned.c
F: include/uapi/linux/ublk_cmd.h
UCLINUX (M68KNOMMU AND COLDFIRE)
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 5b9d4aaebb81..c58dfd035557 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
suggested to enable N if your application(ublk server) switches to
ioctl command encoding.
+config BLK_DEV_UBLK_ZONED
+ def_bool y
+ depends on BLK_DEV_UBLK && BLK_DEV_ZONED
+
source "drivers/block/rnbd/Kconfig"
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 101612cba303..bc1649e20ec2 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,6 +37,8 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
-obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
+obj-$(CONFIG_BLK_DEV_UBLK) += ublk.o
+ublk-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
+ublk-$(CONFIG_BLK_DEV_UBLK_ZONED) += ublk_drv-zoned.o
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
new file mode 100644
index 000000000000..ea86bf4b3681
--- /dev/null
+++ b/drivers/block/ublk_drv-zoned.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2023 Andreas Hindborg <[email protected]>
+ */
+#include <linux/blkzoned.h>
+#include <linux/ublk_cmd.h>
+#include <linux/vmalloc.h>
+
+#include "ublk_drv.h"
+
+void ublk_set_nr_zones(struct ublk_device *ub)
+{
+ const struct ublk_param_basic *p = &ub->params.basic;
+
+ if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
+ ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
+}
+
+void ublk_dev_param_zoned_apply(struct ublk_device *ub)
+{
+ const struct ublk_param_zoned *p = &ub->params.zoned;
+
+ if (ub->dev_info.flags & UBLK_F_ZONED) {
+ disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
+ disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
+ }
+}
+
+int ublk_revalidate_disk_zones(struct gendisk *disk)
+{
+ return blk_revalidate_disk_zones(disk, NULL);
+}
+
+/* Based on virtblk_alloc_report_buffer */
+static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
+ unsigned int nr_zones,
+ unsigned int zone_sectors, size_t *buflen)
+{
+ struct request_queue *q = ublk->ub_disk->queue;
+ size_t bufsize;
+ void *buf;
+
+ nr_zones = min_t(unsigned int, nr_zones,
+ get_capacity(ublk->ub_disk) >> ilog2(zone_sectors));
+
+ bufsize = nr_zones * sizeof(struct blk_zone);
+ bufsize =
+ min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
+ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+
+ while (bufsize >= sizeof(struct blk_zone)) {
+ buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
+ if (buf) {
+ *buflen = bufsize;
+ return buf;
+ }
+ bufsize >>= 1;
+ }
+
+ bufsize = 0;
+ return NULL;
+}
+
+int ublk_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct ublk_device *ub = disk->private_data;
+ unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
+ unsigned int first_zone = sector >> ilog2(zone_size_sectors);
+ unsigned int done_zones = 0;
+ unsigned int max_zones_per_request;
+ struct blk_zone *buffer;
+ size_t buffer_length;
+
+ if (!(ub->dev_info.flags & UBLK_F_ZONED))
+ return -EOPNOTSUPP;
+
+ nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
+ nr_zones);
+
+ buffer = ublk_alloc_report_buffer(ub, nr_zones, zone_size_sectors,
+ &buffer_length);
+ if (!buffer)
+ return -ENOMEM;
+
+ max_zones_per_request = buffer_length / sizeof(struct blk_zone);
+
+ while (done_zones < nr_zones) {
+ unsigned int remaining_zones = nr_zones - done_zones;
+ unsigned int zones_in_request = min_t(
+ unsigned int, remaining_zones, max_zones_per_request);
+ int err = 0;
+ struct request *req;
+ struct ublk_rq_data *pdu;
+ blk_status_t status;
+
+ memset(buffer, 0, buffer_length);
+
+ req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ pdu = blk_mq_rq_to_pdu(req);
+ pdu->operation = UBLK_IO_OP_REPORT_ZONES;
+ pdu->sector = sector;
+ pdu->nr_sectors = remaining_zones * zone_size_sectors;
+
+ err = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
+ GFP_KERNEL);
+ if (err) {
+ blk_mq_free_request(req);
+ kvfree(buffer);
+ return err;
+ }
+
+ status = blk_execute_rq(req, 0);
+ err = blk_status_to_errno(status);
+ blk_mq_free_request(req);
+ if (err) {
+ kvfree(buffer);
+ return err;
+ }
+
+ for (unsigned int i = 0; i < zones_in_request; i++) {
+ struct blk_zone *zone = buffer + i;
+
+ err = cb(zone, i, data);
+ if (err)
+ return err;
+
+ done_zones++;
+ sector += zone_size_sectors;
+
+ /* A zero length zone means don't ask for more zones */
+ if (!zone->len) {
+ kvfree(buffer);
+ return done_zones;
+ }
+ }
+ }
+
+ kvfree(buffer);
+ return done_zones;
+}
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index e519dc0d9fe7..88fa39853c61 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -57,12 +57,13 @@
| UBLK_F_USER_RECOVERY_REISSUE \
| UBLK_F_UNPRIVILEGED_DEV \
| UBLK_F_CMD_IOCTL_ENCODE \
- | UBLK_F_USER_COPY)
+ | UBLK_F_USER_COPY \
+ | UBLK_F_ZONED)
/* All UBLK_PARAM_TYPE_* should be included here */
-#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
- UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
-
+#define UBLK_PARAM_TYPE_ALL \
+ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
+ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
struct ublk_uring_cmd_pdu {
struct ublk_queue *ubq;
@@ -209,6 +210,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
set_disk_ro(ub->ub_disk, true);
set_capacity(ub->ub_disk, p->dev_sectors);
+
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+ ublk_set_nr_zones(ub);
}
static void ublk_dev_param_discard_apply(struct ublk_device *ub)
@@ -269,6 +273,9 @@ static int ublk_apply_params(struct ublk_device *ub)
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
ublk_dev_param_discard_apply(ub);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
+ ublk_dev_param_zoned_apply(ub);
+
return 0;
}
@@ -439,6 +446,7 @@ static const struct block_device_operations ub_fops = {
.owner = THIS_MODULE,
.open = ublk_open,
.free_disk = ublk_free_disk,
+ .report_zones = ublk_report_zones,
};
#define UBLK_MAX_PIN_PAGES 32
@@ -553,7 +561,8 @@ static inline bool ublk_need_map_req(const struct request *req)
static inline bool ublk_need_unmap_req(const struct request *req)
{
- return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
+ return ublk_rq_has_data(req) &&
+ (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
@@ -637,6 +646,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_io *io = &ubq->ios[req->tag];
+ struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
u32 ublk_op;
switch (req_op(req)) {
@@ -655,6 +665,35 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
case REQ_OP_WRITE_ZEROES:
ublk_op = UBLK_IO_OP_WRITE_ZEROES;
break;
+ case REQ_OP_ZONE_OPEN:
+ ublk_op = UBLK_IO_OP_ZONE_OPEN;
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ ublk_op = UBLK_IO_OP_ZONE_CLOSE;
+ break;
+ case REQ_OP_ZONE_FINISH:
+ ublk_op = UBLK_IO_OP_ZONE_FINISH;
+ break;
+ case REQ_OP_ZONE_RESET:
+ ublk_op = UBLK_IO_OP_ZONE_RESET;
+ break;
+ case REQ_OP_DRV_IN:
+ ublk_op = pdu->operation;
+ switch (ublk_op) {
+ case UBLK_IO_OP_REPORT_ZONES:
+ iod->op_flags = ublk_op | ublk_req_build_flags(req);
+ iod->nr_sectors = pdu->nr_sectors;
+ iod->start_sector = pdu->sector;
+ iod->addr = io->addr;
+ return BLK_STS_OK;
+ default:
+ return BLK_STS_IOERR;
+ }
+ case REQ_OP_ZONE_APPEND:
+ case REQ_OP_ZONE_RESET_ALL:
+ case REQ_OP_DRV_OUT:
+ /* We do not support zone append or reset_all yet */
+ fallthrough;
default:
return BLK_STS_IOERR;
}
@@ -708,7 +747,8 @@ static inline void __ublk_complete_rq(struct request *req)
*
* Both the two needn't unmap.
*/
- if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
+ if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
+ req_op(req) != REQ_OP_DRV_IN)
goto exit;
/* for READ request, writing data in iod->addr to rq buffers */
@@ -1835,6 +1875,15 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
if (ub->nr_privileged_daemon != ub->nr_queues_ready)
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ub->dev_info.flags & UBLK_F_ZONED) {
+ disk_set_zoned(disk, BLK_ZONED_HM);
+ blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
+ ret = ublk_revalidate_disk_zones(disk);
+ if (ret)
+ goto out_put_disk;
+ }
+
get_device(&ub->cdev_dev);
ub->dev_info.state = UBLK_S_DEV_LIVE;
ret = add_disk(disk);
@@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
if (ub->dev_info.flags & UBLK_F_USER_COPY)
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+ ub->dev_info.flags &= ~UBLK_F_ZONED;
+
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
index f81e62256456..7242430fd6b9 100644
--- a/drivers/block/ublk_drv.h
+++ b/drivers/block/ublk_drv.h
@@ -50,6 +50,21 @@ struct ublk_rq_data {
struct llist_node node;
struct kref ref;
+ enum ublk_op operation;
+ __u64 sector;
+ __u32 nr_sectors;
};
+void ublk_set_nr_zones(struct ublk_device *ub);
+void ublk_dev_param_zoned_apply(struct ublk_device *ub);
+int ublk_revalidate_disk_zones(struct gendisk *disk);
+
+#ifdef CONFIG_BLK_DEV_UBLK_ZONED
+int ublk_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb,
+ void *data);
+#else
+#define ublk_report_zones NULL
+#endif
+
#endif
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 471b3b983045..436525afffe8 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -176,6 +176,11 @@
/* Copy between request and user buffer by pread()/pwrite() */
#define UBLK_F_USER_COPY (1UL << 7)
+/*
+ * Enable zoned device support
+ */
+#define UBLK_F_ZONED (1ULL << 8)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@@ -242,6 +247,7 @@ enum ublk_op {
UBLK_IO_OP_ZONE_APPEND = 13,
UBLK_IO_OP_ZONE_RESET = 15,
__UBLK_IO_OP_DRV_IN_START = 32,
+ UBLK_IO_OP_REPORT_ZONES = __UBLK_IO_OP_DRV_IN_START,
__UBLK_IO_OP_DRV_IN_END = 96,
__UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
__UBLK_IO_OP_DRV_OUT_END = 160,
@@ -342,6 +348,12 @@ struct ublk_param_devt {
__u32 disk_minor;
};
+struct ublk_param_zoned {
+ __u32 max_open_zones;
+ __u32 max_active_zones;
+ __u8 reserved[24];
+};
+
struct ublk_params {
/*
* Total length of parameters, userspace has to set 'len' for both
@@ -353,11 +365,13 @@ struct ublk_params {
#define UBLK_PARAM_TYPE_BASIC (1 << 0)
#define UBLK_PARAM_TYPE_DISCARD (1 << 1)
#define UBLK_PARAM_TYPE_DEVT (1 << 2)
+#define UBLK_PARAM_TYPE_ZONED (1 << 3)
__u32 types; /* types of parameter included */
struct ublk_param_basic basic;
struct ublk_param_discard discard;
struct ublk_param_devt devt;
+ struct ublk_param_zoned zoned;
};
#endif
--
2.41.0
From: Andreas Hindborg <[email protected]>
This change is in preparation for zoned storage support.
Signed-off-by: Andreas Hindborg <[email protected]>
---
include/uapi/linux/ublk_cmd.h | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 4b8558db90e1..471b3b983045 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -229,12 +229,23 @@ struct ublksrv_ctrl_dev_info {
__u64 reserved2;
};
-#define UBLK_IO_OP_READ 0
-#define UBLK_IO_OP_WRITE 1
-#define UBLK_IO_OP_FLUSH 2
-#define UBLK_IO_OP_DISCARD 3
-#define UBLK_IO_OP_WRITE_SAME 4
-#define UBLK_IO_OP_WRITE_ZEROES 5
+enum ublk_op {
+ UBLK_IO_OP_READ = 0,
+ UBLK_IO_OP_WRITE = 1,
+ UBLK_IO_OP_FLUSH = 2,
+ UBLK_IO_OP_DISCARD = 3,
+ UBLK_IO_OP_WRITE_SAME = 4,
+ UBLK_IO_OP_WRITE_ZEROES = 5,
+ UBLK_IO_OP_ZONE_OPEN = 10,
+ UBLK_IO_OP_ZONE_CLOSE = 11,
+ UBLK_IO_OP_ZONE_FINISH = 12,
+ UBLK_IO_OP_ZONE_APPEND = 13,
+ UBLK_IO_OP_ZONE_RESET = 15,
+ __UBLK_IO_OP_DRV_IN_START = 32,
+ __UBLK_IO_OP_DRV_IN_END = 96,
+ __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
+ __UBLK_IO_OP_DRV_OUT_END = 160,
+};
#define UBLK_IO_F_FAILFAST_DEV (1U << 8)
#define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9)
--
2.41.0
From: Andreas Hindborg <[email protected]>
This change is in preparation for ublk zoned storage support.
Signed-off-by: Andreas Hindborg <[email protected]>
---
MAINTAINERS | 1 +
drivers/block/ublk_drv.c | 45 +-------------------------------
drivers/block/ublk_drv.h | 55 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 57 insertions(+), 44 deletions(-)
create mode 100644 drivers/block/ublk_drv.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 27ef11624748..ace71c90751c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21554,6 +21554,7 @@ L: [email protected]
S: Maintained
F: Documentation/block/ublk.rst
F: drivers/block/ublk_drv.c
+F: drivers/block/ublk_drv.h
F: include/uapi/linux/ublk_cmd.h
UCLINUX (M68KNOMMU AND COLDFIRE)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 1c823750c95a..e519dc0d9fe7 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -45,6 +45,7 @@
#include <linux/namei.h>
#include <linux/kref.h>
#include <uapi/linux/ublk_cmd.h>
+#include "ublk_drv.h"
#define UBLK_MINORS (1U << MINORBITS)
@@ -62,11 +63,6 @@
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
-struct ublk_rq_data {
- struct llist_node node;
-
- struct kref ref;
-};
struct ublk_uring_cmd_pdu {
struct ublk_queue *ubq;
@@ -140,45 +136,6 @@ struct ublk_queue {
#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ)
-struct ublk_device {
- struct gendisk *ub_disk;
-
- char *__queues;
-
- unsigned int queue_size;
- struct ublksrv_ctrl_dev_info dev_info;
-
- struct blk_mq_tag_set tag_set;
-
- struct cdev cdev;
- struct device cdev_dev;
-
-#define UB_STATE_OPEN 0
-#define UB_STATE_USED 1
-#define UB_STATE_DELETED 2
- unsigned long state;
- int ub_number;
-
- struct mutex mutex;
-
- spinlock_t mm_lock;
- struct mm_struct *mm;
-
- struct ublk_params params;
-
- struct completion completion;
- unsigned int nr_queues_ready;
- unsigned int nr_privileged_daemon;
-
- /*
- * Our ubq->daemon may be killed without any notification, so
- * monitor each queue's daemon periodically
- */
- struct delayed_work monitor_work;
- struct work_struct quiesce_work;
- struct work_struct stop_work;
-};
-
/* header of ublk_params */
struct ublk_params_header {
__u32 len;
diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
new file mode 100644
index 000000000000..f81e62256456
--- /dev/null
+++ b/drivers/block/ublk_drv.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _UBLK_DRV_H
+#define _UBLK_DRV_H
+
+#include <uapi/linux/ublk_cmd.h>
+#include <linux/blk-mq.h>
+#include <linux/cdev.h>
+
+struct ublk_device {
+ struct gendisk *ub_disk;
+
+ char *__queues;
+
+ unsigned int queue_size;
+ struct ublksrv_ctrl_dev_info dev_info;
+
+ struct blk_mq_tag_set tag_set;
+
+ struct cdev cdev;
+ struct device cdev_dev;
+
+#define UB_STATE_OPEN 0
+#define UB_STATE_USED 1
+#define UB_STATE_DELETED 2
+ unsigned long state;
+ int ub_number;
+
+ struct mutex mutex;
+
+ spinlock_t mm_lock;
+ struct mm_struct *mm;
+
+ struct ublk_params params;
+
+ struct completion completion;
+ unsigned int nr_queues_ready;
+ unsigned int nr_privileged_daemon;
+
+ /*
+ * Our ubq->daemon may be killed without any notification, so
+ * monitor each queue's daemon periodically
+ */
+ struct delayed_work monitor_work;
+ struct work_struct quiesce_work;
+ struct work_struct stop_work;
+};
+
+struct ublk_rq_data {
+ struct llist_node node;
+
+ struct kref ref;
+};
+
+#endif
--
2.41.0
On 6/29/23 04:06, Andreas Hindborg wrote:
> From: Andreas Hindborg <[email protected]>
>
> This change is in preparation for ublk zoned storage support.
>
> Signed-off-by: Andreas Hindborg <[email protected]>
Looks OK to me.
Reviewed-by: Damien Le Moal <[email protected]>
--
Damien Le Moal
Western Digital Research
On 6/29/23 04:06, Andreas Hindborg wrote:
> From: Andreas Hindborg <[email protected]>
>
> This change is in preparation for zoned storage support.
>
> Signed-off-by: Andreas Hindborg <[email protected]>
> ---
> include/uapi/linux/ublk_cmd.h | 23 +++++++++++++++++------
> 1 file changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index 4b8558db90e1..471b3b983045 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -229,12 +229,23 @@ struct ublksrv_ctrl_dev_info {
> __u64 reserved2;
> };
>
> -#define UBLK_IO_OP_READ 0
> -#define UBLK_IO_OP_WRITE 1
> -#define UBLK_IO_OP_FLUSH 2
> -#define UBLK_IO_OP_DISCARD 3
> -#define UBLK_IO_OP_WRITE_SAME 4
> -#define UBLK_IO_OP_WRITE_ZEROES 5
> +enum ublk_op {
> + UBLK_IO_OP_READ = 0,
> + UBLK_IO_OP_WRITE = 1,
> + UBLK_IO_OP_FLUSH = 2,
> + UBLK_IO_OP_DISCARD = 3,
> + UBLK_IO_OP_WRITE_SAME = 4,
> + UBLK_IO_OP_WRITE_ZEROES = 5,
> + UBLK_IO_OP_ZONE_OPEN = 10,
> + UBLK_IO_OP_ZONE_CLOSE = 11,
> + UBLK_IO_OP_ZONE_FINISH = 12,
> + UBLK_IO_OP_ZONE_APPEND = 13,
> + UBLK_IO_OP_ZONE_RESET = 15,
> + __UBLK_IO_OP_DRV_IN_START = 32,
> + __UBLK_IO_OP_DRV_IN_END = 96,
> + __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
> + __UBLK_IO_OP_DRV_OUT_END = 160,
> +};
This patch does not do what the title says. You are also introducing the zone
operations, and the very obscure __UBLK_IO_OP_DRV_XXX operations without an
explanation. Also, why the "__" prefix for these ? I do not see the point...
Given that this is a uapi, a comment to explain the less obvious commands would
be nice.
So I think the change to an enum for the existing ops can be done either in
patch 2 or as a separate patch and the introduction of the zone operations done
in patch 3 or as a separate patch.
>
> #define UBLK_IO_F_FAILFAST_DEV (1U << 8)
> #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9)
--
Damien Le Moal
Western Digital Research
On 6/29/23 04:06, Andreas Hindborg wrote:
> From: Andreas Hindborg <[email protected]>
>
> Add zoned storage support to ublk: report_zones and operations:
> - REQ_OP_ZONE_OPEN
> - REQ_OP_ZONE_CLOSE
> - REQ_OP_ZONE_FINISH
> - REQ_OP_ZONE_RESET
>
> Note: This commit changes the ublk kernel module name from `ublk_drv.ko` to
> `ublk.ko` in order to link multiple translation units into the module.
You probably could rename ublk_drv.c to ublk.c to avoid that and keep the module
name as it was, ublk_drv.ko.
>
> Signed-off-by: Andreas Hindborg <[email protected]>
> ---
> MAINTAINERS | 1 +
> drivers/block/Kconfig | 4 +
> drivers/block/Makefile | 4 +-
> drivers/block/ublk_drv-zoned.c | 144 +++++++++++++++++++++++++++++++++
> drivers/block/ublk_drv.c | 64 +++++++++++++--
> drivers/block/ublk_drv.h | 15 ++++
> include/uapi/linux/ublk_cmd.h | 14 ++++
> 7 files changed, 239 insertions(+), 7 deletions(-)
> create mode 100644 drivers/block/ublk_drv-zoned.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ace71c90751c..db8a8deb5926 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -21555,6 +21555,7 @@ S: Maintained
> F: Documentation/block/ublk.rst
> F: drivers/block/ublk_drv.c
> F: drivers/block/ublk_drv.h
> +F: drivers/block/ublk_drv-zoned.c
> F: include/uapi/linux/ublk_cmd.h
>
> UCLINUX (M68KNOMMU AND COLDFIRE)
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 5b9d4aaebb81..c58dfd035557 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
> suggested to enable N if your application(ublk server) switches to
> ioctl command encoding.
>
> +config BLK_DEV_UBLK_ZONED
> + def_bool y
This can be "bool" only.
> + depends on BLK_DEV_UBLK && BLK_DEV_ZONED
> +
> source "drivers/block/rnbd/Kconfig"
>
> endif # BLK_DEV
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 101612cba303..bc1649e20ec2 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -37,6 +37,8 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
>
> obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
>
> -obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
> +obj-$(CONFIG_BLK_DEV_UBLK) += ublk.o
> +ublk-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
> +ublk-$(CONFIG_BLK_DEV_UBLK_ZONED) += ublk_drv-zoned.o
>
> swim_mod-y := swim.o swim_asm.o
> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
> new file mode 100644
> index 000000000000..ea86bf4b3681
> --- /dev/null
> +++ b/drivers/block/ublk_drv-zoned.c
> @@ -0,0 +1,144 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2023 Andreas Hindborg <[email protected]>
> + */
> +#include <linux/blkzoned.h>
> +#include <linux/ublk_cmd.h>
> +#include <linux/vmalloc.h>
> +
> +#include "ublk_drv.h"
> +
> +void ublk_set_nr_zones(struct ublk_device *ub)
> +{
> + const struct ublk_param_basic *p = &ub->params.basic;
> +
> + if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
If UBLK_F_ZONED is set but chunk_sectors is not, that is a bug on the user
driver side, no ? So an error message about that would be nice instead of
silently ignoring the zoned flag.
> + ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> +}
> +
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> +{
> + const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> + if (ub->dev_info.flags & UBLK_F_ZONED) {
> + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
You do not need to check if the max_active_zones and max_open_zones values are
sensible ? E.g. what if they are larger than the number of zones ?
> + }
> +}
> +
> +int ublk_revalidate_disk_zones(struct gendisk *disk)
> +{
> + return blk_revalidate_disk_zones(disk, NULL);
> +}
I do not think this helper is needed at all (see below comment on the call site).
> +
> +/* Based on virtblk_alloc_report_buffer */
> +static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
> + unsigned int nr_zones,
> + unsigned int zone_sectors, size_t *buflen)
> +{
> + struct request_queue *q = ublk->ub_disk->queue;
> + size_t bufsize;
> + void *buf;
> +
> + nr_zones = min_t(unsigned int, nr_zones,
> + get_capacity(ublk->ub_disk) >> ilog2(zone_sectors));
> +
> + bufsize = nr_zones * sizeof(struct blk_zone);
> + bufsize =
> + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
> + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
> +
> + while (bufsize >= sizeof(struct blk_zone)) {
> + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> + if (buf) {
> + *buflen = bufsize;
> + return buf;
> + }
> + bufsize >>= 1;
> + }
> +
> + bufsize = 0;
This is not needed. This should rather be "*buflen = 0;".
> + return NULL;
> +}
> +
> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb, void *data)
> +{
> + struct ublk_device *ub = disk->private_data;
> + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
> + unsigned int first_zone = sector >> ilog2(zone_size_sectors);
> + unsigned int done_zones = 0;
> + unsigned int max_zones_per_request;
> + struct blk_zone *buffer;
> + size_t buffer_length;
> +
> + if (!(ub->dev_info.flags & UBLK_F_ZONED))
This is repeated a lot. So a small inline helper ublk_dev_is_zoned() would be nice.
> + return -EOPNOTSUPP;
> +
> + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
> + nr_zones);
> +
> + buffer = ublk_alloc_report_buffer(ub, nr_zones, zone_size_sectors,
> + &buffer_length);
> + if (!buffer)
> + return -ENOMEM;
> +
> + max_zones_per_request = buffer_length / sizeof(struct blk_zone);
> +
> + while (done_zones < nr_zones) {
> + unsigned int remaining_zones = nr_zones - done_zones;
> + unsigned int zones_in_request = min_t(
> + unsigned int, remaining_zones, max_zones_per_request);
> + int err = 0;
> + struct request *req;
> + struct ublk_rq_data *pdu;
> + blk_status_t status;
> +
> + memset(buffer, 0, buffer_length);
> +
> + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> + if (IS_ERR(req))
> + return PTR_ERR(req);
You are leaking buffer.
> +
> + pdu = blk_mq_rq_to_pdu(req);
> + pdu->operation = UBLK_IO_OP_REPORT_ZONES;
> + pdu->sector = sector;
> + pdu->nr_sectors = remaining_zones * zone_size_sectors;
> +
> + err = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
> + GFP_KERNEL);
> + if (err) {
> + blk_mq_free_request(req);
> + kvfree(buffer);
> + return err;
> + }
> +
> + status = blk_execute_rq(req, 0);
> + err = blk_status_to_errno(status);
> + blk_mq_free_request(req);
> + if (err) {
> + kvfree(buffer);
> + return err;
You are repeating this a lot. Use a goto to cleanup on error.
> + }
> +
> + for (unsigned int i = 0; i < zones_in_request; i++) {
> + struct blk_zone *zone = buffer + i;
> +
> + err = cb(zone, i, data);
> + if (err)
> + return err;
> +
> + done_zones++;
> + sector += zone_size_sectors;
> +
> + /* A zero length zone means don't ask for more zones */
> + if (!zone->len) {
> + kvfree(buffer);
> + return done_zones;
> + }
> + }
> + }
> +
> + kvfree(buffer);
> + return done_zones;
> +}
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index e519dc0d9fe7..88fa39853c61 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -57,12 +57,13 @@
> | UBLK_F_USER_RECOVERY_REISSUE \
> | UBLK_F_UNPRIVILEGED_DEV \
> | UBLK_F_CMD_IOCTL_ENCODE \
> - | UBLK_F_USER_COPY)
> + | UBLK_F_USER_COPY \
> + | UBLK_F_ZONED)
>
> /* All UBLK_PARAM_TYPE_* should be included here */
> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
> - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
> -
> +#define UBLK_PARAM_TYPE_ALL \
> + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
>
> struct ublk_uring_cmd_pdu {
> struct ublk_queue *ubq;
> @@ -209,6 +210,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
> set_disk_ro(ub->ub_disk, true);
>
> set_capacity(ub->ub_disk, p->dev_sectors);
> +
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> + ublk_set_nr_zones(ub);
So if the user is attempting to set up a zoned drive but the kernel does not have
CONFIG_BLK_DEV_ZONED=y, the user's setup will be silently ignored. Not exactly
nice I think, unless I am missing something.
Also, repeating that "if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))" for all zone
related functions is very verbose. Stub the functions in ublk_drv.h. That will
make the main C code lighter.
> }
>
> static void ublk_dev_param_discard_apply(struct ublk_device *ub)
> @@ -269,6 +273,9 @@ static int ublk_apply_params(struct ublk_device *ub)
> if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
> ublk_dev_param_discard_apply(ub);
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
> + ublk_dev_param_zoned_apply(ub);
Similar comment as above. If the user tries to apply zoned parameters to a
non-zoned drive, no error is reported...
> +
> return 0;
> }
>
> @@ -439,6 +446,7 @@ static const struct block_device_operations ub_fops = {
> .owner = THIS_MODULE,
> .open = ublk_open,
> .free_disk = ublk_free_disk,
> + .report_zones = ublk_report_zones,
> };
>
> #define UBLK_MAX_PIN_PAGES 32
> @@ -553,7 +561,8 @@ static inline bool ublk_need_map_req(const struct request *req)
>
> static inline bool ublk_need_unmap_req(const struct request *req)
> {
> - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
> + return ublk_rq_has_data(req) &&
> + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
> }
>
> static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
> @@ -637,6 +646,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
> {
> struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> struct ublk_io *io = &ubq->ios[req->tag];
> + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
> u32 ublk_op;
>
> switch (req_op(req)) {
> @@ -655,6 +665,35 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
> case REQ_OP_WRITE_ZEROES:
> ublk_op = UBLK_IO_OP_WRITE_ZEROES;
> break;
> + case REQ_OP_ZONE_OPEN:
> + ublk_op = UBLK_IO_OP_ZONE_OPEN;
> + break;
> + case REQ_OP_ZONE_CLOSE:
> + ublk_op = UBLK_IO_OP_ZONE_CLOSE;
> + break;
> + case REQ_OP_ZONE_FINISH:
> + ublk_op = UBLK_IO_OP_ZONE_FINISH;
> + break;
> + case REQ_OP_ZONE_RESET:
> + ublk_op = UBLK_IO_OP_ZONE_RESET;
> + break;
> + case REQ_OP_DRV_IN:
> + ublk_op = pdu->operation;
> + switch (ublk_op) {
> + case UBLK_IO_OP_REPORT_ZONES:
> + iod->op_flags = ublk_op | ublk_req_build_flags(req);
> + iod->nr_sectors = pdu->nr_sectors;
> + iod->start_sector = pdu->sector;
> + iod->addr = io->addr;
> + return BLK_STS_OK;
> + default:
> + return BLK_STS_IOERR;
> + }
> + case REQ_OP_ZONE_APPEND:
> + case REQ_OP_ZONE_RESET_ALL:
> + case REQ_OP_DRV_OUT:
> + /* We do not support zone append or reset_all yet */
> + fallthrough;
Not OK ! zone append is mandatory for zoned block devices. So zone append
support needs to come with this patch. reset all can be a different patch as
that is optional.
> default:
> return BLK_STS_IOERR;
> }
> @@ -708,7 +747,8 @@ static inline void __ublk_complete_rq(struct request *req)
> *
> * Both the two needn't unmap.
> */
> - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
> + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
> + req_op(req) != REQ_OP_DRV_IN)
> goto exit;
>
> /* for READ request, writing data in iod->addr to rq buffers */
> @@ -1835,6 +1875,15 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
> if (ub->nr_privileged_daemon != ub->nr_queues_ready)
> set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + ub->dev_info.flags & UBLK_F_ZONED) {
Same comment as above about error return instead of silently ignoring the zoned
flag for the CONFIG_BLK_DEV_ZONED=n case.
> + disk_set_zoned(disk, BLK_ZONED_HM);
> + blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
> + ret = ublk_revalidate_disk_zones(disk);
> + if (ret)
> + goto out_put_disk;
This should all go into a helper, ublk_set_zoned() or something.
> + }
> +
> get_device(&ub->cdev_dev);
> ub->dev_info.state = UBLK_S_DEV_LIVE;
> ret = add_disk(disk);
> @@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
> if (ub->dev_info.flags & UBLK_F_USER_COPY)
> ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
>
> + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> + ub->dev_info.flags &= ~UBLK_F_ZONED;
Arg, no. The user should be notified with an error that he/she is attempting to
create a zoned device that cannot be supported.
> +
> /* We are not ready to support zero copy */
> ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>
> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
> index f81e62256456..7242430fd6b9 100644
> --- a/drivers/block/ublk_drv.h
> +++ b/drivers/block/ublk_drv.h
> @@ -50,6 +50,21 @@ struct ublk_rq_data {
> struct llist_node node;
>
> struct kref ref;
> + enum ublk_op operation;
> + __u64 sector;
> + __u32 nr_sectors;
> };
>
> +void ublk_set_nr_zones(struct ublk_device *ub);
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> +int ublk_revalidate_disk_zones(struct gendisk *disk);
> +
> +#ifdef CONFIG_BLK_DEV_UBLK_ZONED
> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb,
> + void *data);
> +#else
> +#define ublk_report_zones NULL
> +#endif
> +
> #endif
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index 471b3b983045..436525afffe8 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -176,6 +176,11 @@
> /* Copy between request and user buffer by pread()/pwrite() */
> #define UBLK_F_USER_COPY (1UL << 7)
>
> +/*
> + * Enable zoned device support
Isn't this for "Indicate that the device is zoned" ?
> + */
> +#define UBLK_F_ZONED (1ULL << 8)
> +
> /* device state */
> #define UBLK_S_DEV_DEAD 0
> #define UBLK_S_DEV_LIVE 1
> @@ -242,6 +247,7 @@ enum ublk_op {
> UBLK_IO_OP_ZONE_APPEND = 13,
> UBLK_IO_OP_ZONE_RESET = 15,
> __UBLK_IO_OP_DRV_IN_START = 32,
> + UBLK_IO_OP_REPORT_ZONES = __UBLK_IO_OP_DRV_IN_START,
> __UBLK_IO_OP_DRV_IN_END = 96,
> __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
> __UBLK_IO_OP_DRV_OUT_END = 160,
> @@ -342,6 +348,12 @@ struct ublk_param_devt {
> __u32 disk_minor;
> };
>
> +struct ublk_param_zoned {
> + __u32 max_open_zones;
> + __u32 max_active_zones;
> + __u8 reserved[24];
> +};
> +
> struct ublk_params {
> /*
> * Total length of parameters, userspace has to set 'len' for both
> @@ -353,11 +365,13 @@ struct ublk_params {
> #define UBLK_PARAM_TYPE_BASIC (1 << 0)
> #define UBLK_PARAM_TYPE_DISCARD (1 << 1)
> #define UBLK_PARAM_TYPE_DEVT (1 << 2)
> +#define UBLK_PARAM_TYPE_ZONED (1 << 3)
> __u32 types; /* types of parameter included */
>
> struct ublk_param_basic basic;
> struct ublk_param_discard discard;
> struct ublk_param_devt devt;
> + struct ublk_param_zoned zoned;
> };
>
> #endif
--
Damien Le Moal
Western Digital Research
On 6/29/23 04:06, Andreas Hindborg wrote:
> From: Andreas Hindborg <[email protected]>
>
> Add zone append feature to ublk. This feature uses the `addr` field of `struct
> ublksrv_io_cmd`. Therefore ublk must be used with the user copy
> feature (UBLK_F_USER_COPY) for zone append to be available. Without this
> feature, ublk will fail zone append requests.
>
> Suggested-by: Ming Lei <[email protected]>
> Signed-off-by: Andreas Hindborg <[email protected]>
This needs to be squashed into patch 3.
> ---
> drivers/block/ublk_drv-zoned.c | 11 +++++++++
> drivers/block/ublk_drv.c | 43 ++++++++++++++++++++++++++++++----
> drivers/block/ublk_drv.h | 1 +
> include/uapi/linux/ublk_cmd.h | 3 ++-
> 4 files changed, 52 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
> index ea86bf4b3681..007e8fc7ff25 100644
> --- a/drivers/block/ublk_drv-zoned.c
> +++ b/drivers/block/ublk_drv-zoned.c
> @@ -16,6 +16,16 @@ void ublk_set_nr_zones(struct ublk_device *ub)
> ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> }
>
> +int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
> +{
> + const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> + if (! p->max_zone_append_sectors)
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> {
> const struct ublk_param_zoned *p = &ub->params.zoned;
> @@ -23,6 +33,7 @@ void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> if (ub->dev_info.flags & UBLK_F_ZONED) {
> disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
> }
> }
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 88fa39853c61..6a949669b47e 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -107,6 +107,11 @@ struct ublk_uring_cmd_pdu {
> */
> #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
>
> +/*
> + * Set when IO is Zone Append
> + */
> +#define UBLK_IO_FLAG_ZONE_APPEND 0x10
> +
> struct ublk_io {
> /* userspace buffer address from io cmd */
> __u64 addr;
> @@ -230,6 +235,8 @@ static void ublk_dev_param_discard_apply(struct ublk_device *ub)
>
> static int ublk_validate_params(const struct ublk_device *ub)
> {
> + int ret = 0;
> +
> /* basic param is the only one which must be set */
> if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
> const struct ublk_param_basic *p = &ub->params.basic;
> @@ -260,6 +267,13 @@ static int ublk_validate_params(const struct ublk_device *ub)
> if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
> return -EINVAL;
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + (ub->params.types & UBLK_PARAM_TYPE_ZONED)) {
> + ret = ublk_dev_param_zoned_validate(ub);
> + if (ret)
> + return ret;
> + }
> +
> return 0;
> }
>
> @@ -690,6 +704,11 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
> return BLK_STS_IOERR;
> }
> case REQ_OP_ZONE_APPEND:
> + if (!(ubq->dev->dev_info.flags & UBLK_F_USER_COPY))
> + return BLK_STS_IOERR;
> + ublk_op = UBLK_IO_OP_ZONE_APPEND;
> + io->flags |= UBLK_IO_FLAG_ZONE_APPEND;
> + break;
> case REQ_OP_ZONE_RESET_ALL:
> case REQ_OP_DRV_OUT:
> /* We do not support zone append or reset_all yet */
> @@ -1112,6 +1131,12 @@ static void ublk_commit_completion(struct ublk_device *ub,
> /* find the io request and complete */
> req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
> + if (io->flags & UBLK_IO_FLAG_ZONE_APPEND)
> + req->__sector = ub_cmd->addr;
> + io->flags &= ~UBLK_IO_FLAG_ZONE_APPEND;
> + }
> +
> if (req && likely(!blk_should_fake_timeout(req->q)))
> ublk_put_req_ref(ubq, req);
> }
> @@ -1411,7 +1436,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
> goto out;
>
> - if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
> + if (ublk_support_user_copy(ubq) &&
> + !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
> ret = -EINVAL;
> goto out;
> }
> @@ -1534,11 +1560,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
> int ubuf_dir)
> {
> /* copy ubuf to request pages */
> - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
> + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
> + ubuf_dir == ITER_SOURCE)
> return true;
>
> /* copy request pages to ubuf */
> - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
> + if ((req_op(req) == REQ_OP_WRITE ||
> + req_op(req) == REQ_OP_ZONE_APPEND) &&
> + ubuf_dir == ITER_DEST)
> return true;
>
> return false;
> @@ -1867,6 +1896,12 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
> ub->dev_info.ublksrv_pid = ublksrv_pid;
> ub->ub_disk = disk;
>
> + ub->dev_info.state = UBLK_S_DEV_LIVE;
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + ub->dev_info.flags & UBLK_F_ZONED) {
> + disk_set_zoned(disk, BLK_ZONED_HM);
> + }
> +
> ret = ublk_apply_params(ub);
> if (ret)
> goto out_put_disk;
> @@ -1877,7 +1912,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
>
> if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> ub->dev_info.flags & UBLK_F_ZONED) {
> - disk_set_zoned(disk, BLK_ZONED_HM);
> blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
> ret = ublk_revalidate_disk_zones(disk);
> if (ret)
> @@ -1885,7 +1919,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
> }
>
> get_device(&ub->cdev_dev);
> - ub->dev_info.state = UBLK_S_DEV_LIVE;
> ret = add_disk(disk);
> if (ret) {
> /*
> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
> index 7242430fd6b9..f55e1c25531d 100644
> --- a/drivers/block/ublk_drv.h
> +++ b/drivers/block/ublk_drv.h
> @@ -56,6 +56,7 @@ struct ublk_rq_data {
> };
>
> void ublk_set_nr_zones(struct ublk_device *ub);
> +int ublk_dev_param_zoned_validate(const struct ublk_device *ub);
> void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> int ublk_revalidate_disk_zones(struct gendisk *disk);
>
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index 436525afffe8..4b6ad0b28598 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -351,7 +351,8 @@ struct ublk_param_devt {
> struct ublk_param_zoned {
> __u32 max_open_zones;
> __u32 max_active_zones;
> - __u8 reserved[24];
> + __u32 max_zone_append_sectors;
> + __u8 reserved[20];
> };
>
> struct ublk_params {
--
Damien Le Moal
Western Digital Research
On Thu, Jun 29, 2023 at 07:47:47AM +0900, Damien Le Moal wrote:
> On 6/29/23 04:06, Andreas Hindborg wrote:
> > From: Andreas Hindborg <[email protected]>
> >
> > This change is in preparation for zoned storage support.
> >
> > Signed-off-by: Andreas Hindborg <[email protected]>
> > ---
> > include/uapi/linux/ublk_cmd.h | 23 +++++++++++++++++------
> > 1 file changed, 17 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> > index 4b8558db90e1..471b3b983045 100644
> > --- a/include/uapi/linux/ublk_cmd.h
> > +++ b/include/uapi/linux/ublk_cmd.h
> > @@ -229,12 +229,23 @@ struct ublksrv_ctrl_dev_info {
> > __u64 reserved2;
> > };
> >
> > -#define UBLK_IO_OP_READ 0
> > -#define UBLK_IO_OP_WRITE 1
> > -#define UBLK_IO_OP_FLUSH 2
> > -#define UBLK_IO_OP_DISCARD 3
> > -#define UBLK_IO_OP_WRITE_SAME 4
> > -#define UBLK_IO_OP_WRITE_ZEROES 5
> > +enum ublk_op {
> > + UBLK_IO_OP_READ = 0,
> > + UBLK_IO_OP_WRITE = 1,
> > + UBLK_IO_OP_FLUSH = 2,
> > + UBLK_IO_OP_DISCARD = 3,
> > + UBLK_IO_OP_WRITE_SAME = 4,
> > + UBLK_IO_OP_WRITE_ZEROES = 5,
> > + UBLK_IO_OP_ZONE_OPEN = 10,
> > + UBLK_IO_OP_ZONE_CLOSE = 11,
> > + UBLK_IO_OP_ZONE_FINISH = 12,
> > + UBLK_IO_OP_ZONE_APPEND = 13,
> > + UBLK_IO_OP_ZONE_RESET = 15,
> > + __UBLK_IO_OP_DRV_IN_START = 32,
> > + __UBLK_IO_OP_DRV_IN_END = 96,
> > + __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
> > + __UBLK_IO_OP_DRV_OUT_END = 160,
> > +};
>
> This patch does not do what the title says. You are also introducing the zone
> operations, and the very obscure __UBLK_IO_OP_DRV_XXX operations without an
> explanation. Also, why the "__" prefix for these ? I do not see the point...
That should be there to reserve space for ublk passthrough OPs.
> Given that this is a uapi, a comment to explain the less obvious commands would
> be nice.
>
> So I think the change to an enum for the existing ops can be done either in
> patch 2 or as a separate patch and the introduction of the zone operations done
> in patch 3 or as a separate patch.
Also, changing the existing definitions from macros to an enum might break
userspace, because userspace may check for something with '#ifdef UBLK_IO_OP_*'.
So it is probably better to keep these OPs as macros, or at least keep the
existing definitions as macros.
Thanks,
Ming
On 6/29/23 09:38, Ming Lei wrote:
> On Thu, Jun 29, 2023 at 07:47:47AM +0900, Damien Le Moal wrote:
>> On 6/29/23 04:06, Andreas Hindborg wrote:
>>> From: Andreas Hindborg <[email protected]>
>>>
>>> This change is in preparation for zoned storage support.
>>>
>>> Signed-off-by: Andreas Hindborg <[email protected]>
>>> ---
>>> include/uapi/linux/ublk_cmd.h | 23 +++++++++++++++++------
>>> 1 file changed, 17 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>>> index 4b8558db90e1..471b3b983045 100644
>>> --- a/include/uapi/linux/ublk_cmd.h
>>> +++ b/include/uapi/linux/ublk_cmd.h
>>> @@ -229,12 +229,23 @@ struct ublksrv_ctrl_dev_info {
>>> __u64 reserved2;
>>> };
>>>
>>> -#define UBLK_IO_OP_READ 0
>>> -#define UBLK_IO_OP_WRITE 1
>>> -#define UBLK_IO_OP_FLUSH 2
>>> -#define UBLK_IO_OP_DISCARD 3
>>> -#define UBLK_IO_OP_WRITE_SAME 4
>>> -#define UBLK_IO_OP_WRITE_ZEROES 5
>>> +enum ublk_op {
>>> + UBLK_IO_OP_READ = 0,
>>> + UBLK_IO_OP_WRITE = 1,
>>> + UBLK_IO_OP_FLUSH = 2,
>>> + UBLK_IO_OP_DISCARD = 3,
>>> + UBLK_IO_OP_WRITE_SAME = 4,
>>> + UBLK_IO_OP_WRITE_ZEROES = 5,
>>> + UBLK_IO_OP_ZONE_OPEN = 10,
>>> + UBLK_IO_OP_ZONE_CLOSE = 11,
>>> + UBLK_IO_OP_ZONE_FINISH = 12,
>>> + UBLK_IO_OP_ZONE_APPEND = 13,
>>> + UBLK_IO_OP_ZONE_RESET = 15,
>>> + __UBLK_IO_OP_DRV_IN_START = 32,
>>> + __UBLK_IO_OP_DRV_IN_END = 96,
>>> + __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
>>> + __UBLK_IO_OP_DRV_OUT_END = 160,
>>> +};
>>
>> This patch does not do what the title says. You are also introducing the zone
>> operations, and the very obscure __UBLK_IO_OP_DRV_XXX operations without an
>> explanation. Also, why the "__" prefix for these ? I do not see the point...
>
> It should be to reserve space for ublk passthrough OP.
A comment about that would be nice.
>
>> Given that this is a uapi, a comment to explain the less obvious commands would
>> be nice.
>>
>> So I think the change to an enum for the existing ops can be done either in
>> patch 2 or as a separate patch and the introduction of the zone operations done
>> in patch 3 or as a separate patch.
>
> Also it might break userspace by changing to enum from macro for existed
> definition, cause userspace may check something by '#ifdef UBLK_IO_OP_*',
> so probably it is better to keep these OPs as enum, or at least keep
> existed definition as macro.
Then let's keep defining things with #define instead of an enum.
>
> Thanks,
> Ming
>
--
Damien Le Moal
Western Digital Research
On Wed, Jun 28, 2023 at 09:06:49PM +0200, Andreas Hindborg wrote:
> From: Andreas Hindborg <[email protected]>
>
> Add zone append feature to ublk. This feature uses the `addr` field of `struct
> ublksrv_io_cmd`. Therefore ublk must be used with the user copy
> feature (UBLK_F_USER_COPY) for zone append to be available. Without this
> feature, ublk will fail zone append requests.
Given that zone append is a must, please fail adding the device in the case of
zoned and !user_copy; then we can keep the fast IO code path clean.
>
> Suggested-by: Ming Lei <[email protected]>
> Signed-off-by: Andreas Hindborg <[email protected]>
> ---
> drivers/block/ublk_drv-zoned.c | 11 +++++++++
> drivers/block/ublk_drv.c | 43 ++++++++++++++++++++++++++++++----
> drivers/block/ublk_drv.h | 1 +
> include/uapi/linux/ublk_cmd.h | 3 ++-
> 4 files changed, 52 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
> index ea86bf4b3681..007e8fc7ff25 100644
> --- a/drivers/block/ublk_drv-zoned.c
> +++ b/drivers/block/ublk_drv-zoned.c
> @@ -16,6 +16,16 @@ void ublk_set_nr_zones(struct ublk_device *ub)
> ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> }
>
> +int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
> +{
> + const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> + if (! p->max_zone_append_sectors)
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> {
> const struct ublk_param_zoned *p = &ub->params.zoned;
> @@ -23,6 +33,7 @@ void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> if (ub->dev_info.flags & UBLK_F_ZONED) {
> disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
> }
> }
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 88fa39853c61..6a949669b47e 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -107,6 +107,11 @@ struct ublk_uring_cmd_pdu {
> */
> #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
>
> +/*
> + * Set when IO is Zone Append
> + */
> +#define UBLK_IO_FLAG_ZONE_APPEND 0x10
> +
> struct ublk_io {
> /* userspace buffer address from io cmd */
> __u64 addr;
> @@ -230,6 +235,8 @@ static void ublk_dev_param_discard_apply(struct ublk_device *ub)
>
> static int ublk_validate_params(const struct ublk_device *ub)
> {
> + int ret = 0;
> +
> /* basic param is the only one which must be set */
> if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
> const struct ublk_param_basic *p = &ub->params.basic;
> @@ -260,6 +267,13 @@ static int ublk_validate_params(const struct ublk_device *ub)
> if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
> return -EINVAL;
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + (ub->params.types & UBLK_PARAM_TYPE_ZONED)) {
> + ret = ublk_dev_param_zoned_validate(ub);
> + if (ret)
> + return ret;
> + }
> +
> return 0;
> }
>
> @@ -690,6 +704,11 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
> return BLK_STS_IOERR;
> }
> case REQ_OP_ZONE_APPEND:
> + if (!(ubq->dev->dev_info.flags & UBLK_F_USER_COPY))
> + return BLK_STS_IOERR;
> + ublk_op = UBLK_IO_OP_ZONE_APPEND;
> + io->flags |= UBLK_IO_FLAG_ZONE_APPEND;
> + break;
> case REQ_OP_ZONE_RESET_ALL:
> case REQ_OP_DRV_OUT:
> /* We do not support zone append or reset_all yet */
> @@ -1112,6 +1131,12 @@ static void ublk_commit_completion(struct ublk_device *ub,
> /* find the io request and complete */
> req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
> + if (io->flags & UBLK_IO_FLAG_ZONE_APPEND)
> + req->__sector = ub_cmd->addr;
> + io->flags &= ~UBLK_IO_FLAG_ZONE_APPEND;
> + }
> +
> if (req && likely(!blk_should_fake_timeout(req->q)))
> ublk_put_req_ref(ubq, req);
> }
> @@ -1411,7 +1436,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
> ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
> goto out;
>
> - if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
> + if (ublk_support_user_copy(ubq) &&
> + !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
> ret = -EINVAL;
> goto out;
> }
> @@ -1534,11 +1560,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
> int ubuf_dir)
> {
> /* copy ubuf to request pages */
> - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
> + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
> + ubuf_dir == ITER_SOURCE)
> return true;
>
> /* copy request pages to ubuf */
> - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
> + if ((req_op(req) == REQ_OP_WRITE ||
> + req_op(req) == REQ_OP_ZONE_APPEND) &&
> + ubuf_dir == ITER_DEST)
> return true;
>
> return false;
> @@ -1867,6 +1896,12 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
> ub->dev_info.ublksrv_pid = ublksrv_pid;
> ub->ub_disk = disk;
>
> + ub->dev_info.state = UBLK_S_DEV_LIVE;
I guess the above line change isn't necessary?
Thanks,
Ming
On Wed, Jun 28, 2023 at 09:06:48PM +0200, Andreas Hindborg wrote:
> From: Andreas Hindborg <[email protected]>
>
> Add zoned storage support to ublk: report_zones and operations:
> - REQ_OP_ZONE_OPEN
> - REQ_OP_ZONE_CLOSE
> - REQ_OP_ZONE_FINISH
> - REQ_OP_ZONE_RESET
>
> Note: This commit changes the ublk kernel module name from `ublk_drv.ko` to
> `ublk.ko` in order to link multiple translation units into the module.
>
> Signed-off-by: Andreas Hindborg <[email protected]>
> ---
> MAINTAINERS | 1 +
> drivers/block/Kconfig | 4 +
> drivers/block/Makefile | 4 +-
> drivers/block/ublk_drv-zoned.c | 144 +++++++++++++++++++++++++++++++++
> drivers/block/ublk_drv.c | 64 +++++++++++++--
> drivers/block/ublk_drv.h | 15 ++++
> include/uapi/linux/ublk_cmd.h | 14 ++++
> 7 files changed, 239 insertions(+), 7 deletions(-)
> create mode 100644 drivers/block/ublk_drv-zoned.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ace71c90751c..db8a8deb5926 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -21555,6 +21555,7 @@ S: Maintained
> F: Documentation/block/ublk.rst
> F: drivers/block/ublk_drv.c
> F: drivers/block/ublk_drv.h
> +F: drivers/block/ublk_drv-zoned.c
> F: include/uapi/linux/ublk_cmd.h
>
> UCLINUX (M68KNOMMU AND COLDFIRE)
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 5b9d4aaebb81..c58dfd035557 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
> suggested to enable N if your application(ublk server) switches to
> ioctl command encoding.
>
> +config BLK_DEV_UBLK_ZONED
> + def_bool y
> + depends on BLK_DEV_UBLK && BLK_DEV_ZONED
> +
> source "drivers/block/rnbd/Kconfig"
>
> endif # BLK_DEV
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 101612cba303..bc1649e20ec2 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -37,6 +37,8 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
>
> obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
>
> -obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
> +obj-$(CONFIG_BLK_DEV_UBLK) += ublk.o
> +ublk-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
> +ublk-$(CONFIG_BLK_DEV_UBLK_ZONED) += ublk_drv-zoned.o
>
> swim_mod-y := swim.o swim_asm.o
> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
> new file mode 100644
> index 000000000000..ea86bf4b3681
> --- /dev/null
> +++ b/drivers/block/ublk_drv-zoned.c
> @@ -0,0 +1,144 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2023 Andreas Hindborg <[email protected]>
> + */
> +#include <linux/blkzoned.h>
> +#include <linux/ublk_cmd.h>
> +#include <linux/vmalloc.h>
> +
> +#include "ublk_drv.h"
> +
> +void ublk_set_nr_zones(struct ublk_device *ub)
> +{
> + const struct ublk_param_basic *p = &ub->params.basic;
> +
> + if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
> + ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> +}
> +
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> +{
> + const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> + if (ub->dev_info.flags & UBLK_F_ZONED) {
> + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> + }
> +}
> +
> +int ublk_revalidate_disk_zones(struct gendisk *disk)
> +{
> + return blk_revalidate_disk_zones(disk, NULL);
> +}
> +
> +/* Based on virtblk_alloc_report_buffer */
> +static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
> + unsigned int nr_zones,
> + unsigned int zone_sectors, size_t *buflen)
> +{
> + struct request_queue *q = ublk->ub_disk->queue;
> + size_t bufsize;
> + void *buf;
> +
> + nr_zones = min_t(unsigned int, nr_zones,
> + get_capacity(ublk->ub_disk) >> ilog2(zone_sectors));
> +
> + bufsize = nr_zones * sizeof(struct blk_zone);
> + bufsize =
> + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
> + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
This means 'blk_zone' (including the types of all its fields) is reused as part of
ublk's UAPI, which is smart; please document it next to the definition of
UBLK_IO_OP_REPORT_ZONES.
> +
> + while (bufsize >= sizeof(struct blk_zone)) {
> + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> + if (buf) {
> + *buflen = bufsize;
> + return buf;
> + }
> + bufsize >>= 1;
> + }
> +
> + bufsize = 0;
> + return NULL;
> +}
> +
> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb, void *data)
> +{
> + struct ublk_device *ub = disk->private_data;
> + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
> + unsigned int first_zone = sector >> ilog2(zone_size_sectors);
> + unsigned int done_zones = 0;
> + unsigned int max_zones_per_request;
> + struct blk_zone *buffer;
> + size_t buffer_length;
> +
> + if (!(ub->dev_info.flags & UBLK_F_ZONED))
> + return -EOPNOTSUPP;
> +
> + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
> + nr_zones);
> +
> + buffer = ublk_alloc_report_buffer(ub, nr_zones, zone_size_sectors,
> + &buffer_length);
> + if (!buffer)
> + return -ENOMEM;
> +
> + max_zones_per_request = buffer_length / sizeof(struct blk_zone);
> +
> + while (done_zones < nr_zones) {
> + unsigned int remaining_zones = nr_zones - done_zones;
> + unsigned int zones_in_request = min_t(
> + unsigned int, remaining_zones, max_zones_per_request);
> + int err = 0;
> + struct request *req;
> + struct ublk_rq_data *pdu;
> + blk_status_t status;
> +
> + memset(buffer, 0, buffer_length);
> +
> + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> + if (IS_ERR(req))
> + return PTR_ERR(req);
> +
> + pdu = blk_mq_rq_to_pdu(req);
> + pdu->operation = UBLK_IO_OP_REPORT_ZONES;
> + pdu->sector = sector;
> + pdu->nr_sectors = remaining_zones * zone_size_sectors;
> +
> + err = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
> + GFP_KERNEL);
> + if (err) {
> + blk_mq_free_request(req);
> + kvfree(buffer);
> + return err;
> + }
> +
> + status = blk_execute_rq(req, 0);
> + err = blk_status_to_errno(status);
> + blk_mq_free_request(req);
> + if (err) {
> + kvfree(buffer);
> + return err;
> + }
> +
> + for (unsigned int i = 0; i < zones_in_request; i++) {
> + struct blk_zone *zone = buffer + i;
> +
> + err = cb(zone, i, data);
> + if (err)
> + return err;
> +
> + done_zones++;
> + sector += zone_size_sectors;
> +
> + /* A zero length zone means don't ask for more zones */
> + if (!zone->len) {
This behavior (a zero-length zone terminates the report) is part of the
UBLK_IO_OP_REPORT_ZONES UAPI; please document it next to the definition of UBLK_IO_OP_REPORT_ZONES.
> + kvfree(buffer);
> + return done_zones;
> + }
> + }
> + }
> +
> + kvfree(buffer);
> + return done_zones;
> +}
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index e519dc0d9fe7..88fa39853c61 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -57,12 +57,13 @@
> | UBLK_F_USER_RECOVERY_REISSUE \
> | UBLK_F_UNPRIVILEGED_DEV \
> | UBLK_F_CMD_IOCTL_ENCODE \
> - | UBLK_F_USER_COPY)
> + | UBLK_F_USER_COPY \
> + | UBLK_F_ZONED)
>
> /* All UBLK_PARAM_TYPE_* should be included here */
> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
> - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
> -
> +#define UBLK_PARAM_TYPE_ALL \
> + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
>
> struct ublk_uring_cmd_pdu {
> struct ublk_queue *ubq;
> @@ -209,6 +210,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
> set_disk_ro(ub->ub_disk, true);
>
> set_capacity(ub->ub_disk, p->dev_sectors);
> +
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> + ublk_set_nr_zones(ub);
> }
>
> static void ublk_dev_param_discard_apply(struct ublk_device *ub)
> @@ -269,6 +273,9 @@ static int ublk_apply_params(struct ublk_device *ub)
> if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
> ublk_dev_param_discard_apply(ub);
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
> + ublk_dev_param_zoned_apply(ub);
> +
> return 0;
> }
>
> @@ -439,6 +446,7 @@ static const struct block_device_operations ub_fops = {
> .owner = THIS_MODULE,
> .open = ublk_open,
> .free_disk = ublk_free_disk,
> + .report_zones = ublk_report_zones,
> };
>
> #define UBLK_MAX_PIN_PAGES 32
> @@ -553,7 +561,8 @@ static inline bool ublk_need_map_req(const struct request *req)
>
> static inline bool ublk_need_unmap_req(const struct request *req)
> {
> - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
> + return ublk_rq_has_data(req) &&
> + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
> }
>
> static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
> @@ -637,6 +646,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
> {
> struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> struct ublk_io *io = &ubq->ios[req->tag];
> + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
> u32 ublk_op;
>
> switch (req_op(req)) {
> @@ -655,6 +665,35 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
> case REQ_OP_WRITE_ZEROES:
> ublk_op = UBLK_IO_OP_WRITE_ZEROES;
> break;
> + case REQ_OP_ZONE_OPEN:
> + ublk_op = UBLK_IO_OP_ZONE_OPEN;
> + break;
> + case REQ_OP_ZONE_CLOSE:
> + ublk_op = UBLK_IO_OP_ZONE_CLOSE;
> + break;
> + case REQ_OP_ZONE_FINISH:
> + ublk_op = UBLK_IO_OP_ZONE_FINISH;
> + break;
> + case REQ_OP_ZONE_RESET:
> + ublk_op = UBLK_IO_OP_ZONE_RESET;
> + break;
> + case REQ_OP_DRV_IN:
> + ublk_op = pdu->operation;
> + switch (ublk_op) {
> + case UBLK_IO_OP_REPORT_ZONES:
> + iod->op_flags = ublk_op | ublk_req_build_flags(req);
> + iod->nr_sectors = pdu->nr_sectors;
> + iod->start_sector = pdu->sector;
> + iod->addr = io->addr;
As Damien commented, zone append needs to be moved into this patch.
Zoned support then depends on the user copy feature, and you can simply
fail adding the device if user copy isn't enabled.
So the above 'iod->addr = io->addr' can be removed.
> + return BLK_STS_OK;
> + default:
> + return BLK_STS_IOERR;
> + }
> + case REQ_OP_ZONE_APPEND:
> + case REQ_OP_ZONE_RESET_ALL:
> + case REQ_OP_DRV_OUT:
> + /* We do not support zone append or reset_all yet */
> + fallthrough;
> default:
> return BLK_STS_IOERR;
> }
> @@ -708,7 +747,8 @@ static inline void __ublk_complete_rq(struct request *req)
> *
> * Both the two needn't unmap.
> */
> - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
> + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
> + req_op(req) != REQ_OP_DRV_IN)
> goto exit;
>
> /* for READ request, writing data in iod->addr to rq buffers */
> @@ -1835,6 +1875,15 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
> if (ub->nr_privileged_daemon != ub->nr_queues_ready)
> set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + ub->dev_info.flags & UBLK_F_ZONED) {
> + disk_set_zoned(disk, BLK_ZONED_HM);
> + blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
> + ret = ublk_revalidate_disk_zones(disk);
> + if (ret)
> + goto out_put_disk;
> + }
> +
> get_device(&ub->cdev_dev);
> ub->dev_info.state = UBLK_S_DEV_LIVE;
> ret = add_disk(disk);
> @@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
> if (ub->dev_info.flags & UBLK_F_USER_COPY)
> ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
>
> + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> + ub->dev_info.flags &= ~UBLK_F_ZONED;
> +
> /* We are not ready to support zero copy */
> ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>
> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
> index f81e62256456..7242430fd6b9 100644
> --- a/drivers/block/ublk_drv.h
> +++ b/drivers/block/ublk_drv.h
> @@ -50,6 +50,21 @@ struct ublk_rq_data {
> struct llist_node node;
>
> struct kref ref;
> + enum ublk_op operation;
> + __u64 sector;
> + __u32 nr_sectors;
> };
>
> +void ublk_set_nr_zones(struct ublk_device *ub);
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> +int ublk_revalidate_disk_zones(struct gendisk *disk);
> +
> +#ifdef CONFIG_BLK_DEV_UBLK_ZONED
> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb,
> + void *data);
> +#else
> +#define ublk_report_zones NULL
> +#endif
> +
> #endif
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index 471b3b983045..436525afffe8 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -176,6 +176,11 @@
> /* Copy between request and user buffer by pread()/pwrite() */
> #define UBLK_F_USER_COPY (1UL << 7)
>
> +/*
> + * Enable zoned device support
> + */
> +#define UBLK_F_ZONED (1ULL << 8)
> +
> /* device state */
> #define UBLK_S_DEV_DEAD 0
> #define UBLK_S_DEV_LIVE 1
> @@ -242,6 +247,7 @@ enum ublk_op {
> UBLK_IO_OP_ZONE_APPEND = 13,
> UBLK_IO_OP_ZONE_RESET = 15,
> __UBLK_IO_OP_DRV_IN_START = 32,
> + UBLK_IO_OP_REPORT_ZONES = __UBLK_IO_OP_DRV_IN_START,
> __UBLK_IO_OP_DRV_IN_END = 96,
> __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
> __UBLK_IO_OP_DRV_OUT_END = 160,
> @@ -342,6 +348,12 @@ struct ublk_param_devt {
> __u32 disk_minor;
> };
>
> +struct ublk_param_zoned {
> + __u32 max_open_zones;
> + __u32 max_active_zones;
> + __u8 reserved[24];
> +};
Maybe you can simply borrow virtio_blk_zoned_characteristics definition
here, and make several reserved words.
But that is fine, given six u32 should be flexible enough compared with
virtio_blk_zoned_characteristics.
Thanks,
Ming
Hi Andreas,
kernel test robot noticed the following build errors:
[auto build test ERROR on 3261ea42710e9665c9151006049411bd23b5411f]
url: https://github.com/intel-lab-lkp/linux/commits/Andreas-Hindborg/ublk-change-ublk-IO-command-defines-to-enum/20230629-031015
base: 3261ea42710e9665c9151006049411bd23b5411f
patch link: https://lore.kernel.org/r/20230628190649.11233-4-nmi%40metaspace.dk
patch subject: [PATCH v4 3/4] ublk: enable zoned storage support
config: hexagon-randconfig-r041-20230628 (https://download.01.org/0day-ci/archive/20230629/[email protected]/config)
compiler: clang version 15.0.7 (https://github.com/llvm/llvm-project.git 8dfdcc7b7bf66834a761bd8de445840ef68e4d1a)
reproduce: (https://download.01.org/0day-ci/archive/20230629/[email protected]/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All errors (new ones prefixed by >>):
>> ld.lld: error: undefined symbol: __hexagon_udivdi3
>>> referenced by ublk_drv-zoned.c:16 (drivers/block/ublk_drv-zoned.c:16)
>>> drivers/block/ublk_drv-zoned.o:(ublk_set_nr_zones) in archive vmlinux.a
>>> referenced by ublk_drv-zoned.c:16 (drivers/block/ublk_drv-zoned.c:16)
>>> drivers/block/ublk_drv-zoned.o:(ublk_set_nr_zones) in archive vmlinux.a
>>> did you mean: __hexagon_udivsi3
>>> defined in: vmlinux.a(arch/hexagon/lib/udivsi3.o)
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
What's the point in having a separate file for about 100 lines of
actual code? Especially as that needs a header, interfaces, etc?
Ming Lei <[email protected]> writes:
> On Thu, Jun 29, 2023 at 07:47:47AM +0900, Damien Le Moal wrote:
>> On 6/29/23 04:06, Andreas Hindborg wrote:
>> > From: Andreas Hindborg <[email protected]>
>> >
>> > This change is in preparation for zoned storage support.
>> >
>> > Signed-off-by: Andreas Hindborg <[email protected]>
>> > ---
>> > include/uapi/linux/ublk_cmd.h | 23 +++++++++++++++++------
>> > 1 file changed, 17 insertions(+), 6 deletions(-)
>> >
>> > diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> > index 4b8558db90e1..471b3b983045 100644
>> > --- a/include/uapi/linux/ublk_cmd.h
>> > +++ b/include/uapi/linux/ublk_cmd.h
>> > @@ -229,12 +229,23 @@ struct ublksrv_ctrl_dev_info {
>> > __u64 reserved2;
>> > };
>> >
>> > -#define UBLK_IO_OP_READ 0
>> > -#define UBLK_IO_OP_WRITE 1
>> > -#define UBLK_IO_OP_FLUSH 2
>> > -#define UBLK_IO_OP_DISCARD 3
>> > -#define UBLK_IO_OP_WRITE_SAME 4
>> > -#define UBLK_IO_OP_WRITE_ZEROES 5
>> > +enum ublk_op {
>> > + UBLK_IO_OP_READ = 0,
>> > + UBLK_IO_OP_WRITE = 1,
>> > + UBLK_IO_OP_FLUSH = 2,
>> > + UBLK_IO_OP_DISCARD = 3,
>> > + UBLK_IO_OP_WRITE_SAME = 4,
>> > + UBLK_IO_OP_WRITE_ZEROES = 5,
>> > + UBLK_IO_OP_ZONE_OPEN = 10,
>> > + UBLK_IO_OP_ZONE_CLOSE = 11,
>> > + UBLK_IO_OP_ZONE_FINISH = 12,
>> > + UBLK_IO_OP_ZONE_APPEND = 13,
>> > + UBLK_IO_OP_ZONE_RESET = 15,
>> > + __UBLK_IO_OP_DRV_IN_START = 32,
>> > + __UBLK_IO_OP_DRV_IN_END = 96,
>> > + __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
>> > + __UBLK_IO_OP_DRV_OUT_END = 160,
>> > +};
>>
>> This patch does not do what the title says. You are also introducing the zone
>> operations, and the very obscure __UBLK_IO_OP_DRV_XXX operations without an
>> explanation. Also, why the "__" prefix for these ? I do not see the point...
>
> It should be to reserve space for ublk passthrough OP.
>
>> Given that this is a uapi, a comment to explain the less obvious commands would
>> be nice.
>>
>> So I think the change to an enum for the existing ops can be done either in
>> patch 2 or as a separate patch and the introduction of the zone operations done
>> in patch 3 or as a separate patch.
>
> Also it might break userspace by changing to enum from macro for existed
> definition, cause userspace may check something by '#ifdef UBLK_IO_OP_*',
> so probably it is better to keep these OPs as enum, or at least keep
> existed definition as macro.
I can change it back to `#define` again, no problem. I only changed it
to `enum` on request from Ming [1]
Best regards,
Andreas
[1] https://lore.kernel.org/all/[email protected]/
Hi Andreas,
kernel test robot noticed the following build errors:
[auto build test ERROR on 3261ea42710e9665c9151006049411bd23b5411f]
url: https://github.com/intel-lab-lkp/linux/commits/Andreas-Hindborg/ublk-change-ublk-IO-command-defines-to-enum/20230629-031015
base: 3261ea42710e9665c9151006049411bd23b5411f
patch link: https://lore.kernel.org/r/20230628190649.11233-4-nmi%40metaspace.dk
patch subject: [PATCH v4 3/4] ublk: enable zoned storage support
config: i386-randconfig-i006-20230628 (https://download.01.org/0day-ci/archive/20230629/[email protected]/config)
compiler: clang version 15.0.7 (https://github.com/llvm/llvm-project.git 8dfdcc7b7bf66834a761bd8de445840ef68e4d1a)
reproduce: (https://download.01.org/0day-ci/archive/20230629/[email protected]/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All errors (new ones prefixed by >>):
>> ld.lld: error: undefined symbol: __udivdi3
>>> referenced by ublk_drv-zoned.c:16 (drivers/block/ublk_drv-zoned.c:16)
>>> drivers/block/ublk_drv-zoned.o:(ublk_set_nr_zones) in archive vmlinux.a
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Damien Le Moal <[email protected]> writes:
> On 6/29/23 04:06, Andreas Hindborg wrote:
>> From: Andreas Hindborg <[email protected]>
>>
>> This change is in preparation for zoned storage support.
>>
>> Signed-off-by: Andreas Hindborg <[email protected]>
>> ---
>> include/uapi/linux/ublk_cmd.h | 23 +++++++++++++++++------
>> 1 file changed, 17 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 4b8558db90e1..471b3b983045 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -229,12 +229,23 @@ struct ublksrv_ctrl_dev_info {
>> __u64 reserved2;
>> };
>>
>> -#define UBLK_IO_OP_READ 0
>> -#define UBLK_IO_OP_WRITE 1
>> -#define UBLK_IO_OP_FLUSH 2
>> -#define UBLK_IO_OP_DISCARD 3
>> -#define UBLK_IO_OP_WRITE_SAME 4
>> -#define UBLK_IO_OP_WRITE_ZEROES 5
>> +enum ublk_op {
>> + UBLK_IO_OP_READ = 0,
>> + UBLK_IO_OP_WRITE = 1,
>> + UBLK_IO_OP_FLUSH = 2,
>> + UBLK_IO_OP_DISCARD = 3,
>> + UBLK_IO_OP_WRITE_SAME = 4,
>> + UBLK_IO_OP_WRITE_ZEROES = 5,
>> + UBLK_IO_OP_ZONE_OPEN = 10,
>> + UBLK_IO_OP_ZONE_CLOSE = 11,
>> + UBLK_IO_OP_ZONE_FINISH = 12,
>> + UBLK_IO_OP_ZONE_APPEND = 13,
>> + UBLK_IO_OP_ZONE_RESET = 15,
>> + __UBLK_IO_OP_DRV_IN_START = 32,
>> + __UBLK_IO_OP_DRV_IN_END = 96,
>> + __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
>> + __UBLK_IO_OP_DRV_OUT_END = 160,
>> +};
>
> This patch does not do what the title says. You are also introducing the zone
> operations, and the very obscure __UBLK_IO_OP_DRV_XXX operations without an
> explanation. Also, why the "__" prefix for these ? I do not see the point...
> Given that this is a uapi, a comment to explain the less obvious commands would
> be nice.
It is a little vague; I'll make sure to include a better description.
>
> So I think the change to an enum for the existing ops can be done either in
> patch 2 or as a separate patch and the introduction of the zone operations done
> in patch 3 or as a separate patch.
I agree, the zone ops should not be introduced in this patch, I will
move them to patch 3. That is a mistake.
Best regards,
Andreas
Christoph Hellwig <[email protected]> writes:
> What's the point in having a separate file for about 100 lines of
> actual code? Especially as that needs a header, interfaces, etc?
This was suggested by Niklas [1] to make the conditionally compiled code
more readable. The alternative would be masking out the code with a big
`#ifdef`.
If you prefer the latter, I can change it back, no problem. But I must
say I think it comes off cleaner when we move as much as possible of the
zoned code to a separate translation unit.
[1] https://lore.kernel.org/all/Y%2Fy+UFEHn1F1sg4i@x1-carbon/
On Thu, Jun 29, 2023 at 09:25:32AM +0200, Andreas Hindborg (Samsung) wrote:
> If you prefer the latter, I can change it back, no problem. But I must
> say I think it comes off cleaner when we move as much as possible of the
> zoned code to a separate translation unit.
I have to say I much prefer the ifdef. In the end Ming is the
maintainer though, so I'll defer to him.
Damien Le Moal <[email protected]> writes:
> On 6/29/23 04:06, Andreas Hindborg wrote:
>> From: Andreas Hindborg <[email protected]>
>>
>> Add zoned storage support to ublk: report_zones and operations:
>> - REQ_OP_ZONE_OPEN
>> - REQ_OP_ZONE_CLOSE
>> - REQ_OP_ZONE_FINISH
>> - REQ_OP_ZONE_RESET
>>
>> Note: This commit changes the ublk kernel module name from `ublk_drv.ko` to
>> `ublk.ko` in order to link multiple translation units into the module.
>
> You probably could rename ublk_drv.c to ublk.c to avoid that and keep the module
> name as it was, ublk_drv.ko.
OK, will change.
>
>>
>> Signed-off-by: Andreas Hindborg <[email protected]>
>> ---
>> MAINTAINERS | 1 +
>> drivers/block/Kconfig | 4 +
>> drivers/block/Makefile | 4 +-
>> drivers/block/ublk_drv-zoned.c | 144 +++++++++++++++++++++++++++++++++
>> drivers/block/ublk_drv.c | 64 +++++++++++++--
>> drivers/block/ublk_drv.h | 15 ++++
>> include/uapi/linux/ublk_cmd.h | 14 ++++
>> 7 files changed, 239 insertions(+), 7 deletions(-)
>> create mode 100644 drivers/block/ublk_drv-zoned.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index ace71c90751c..db8a8deb5926 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -21555,6 +21555,7 @@ S: Maintained
>> F: Documentation/block/ublk.rst
>> F: drivers/block/ublk_drv.c
>> F: drivers/block/ublk_drv.h
>> +F: drivers/block/ublk_drv-zoned.c
>> F: include/uapi/linux/ublk_cmd.h
>>
>> UCLINUX (M68KNOMMU AND COLDFIRE)
>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>> index 5b9d4aaebb81..c58dfd035557 100644
>> --- a/drivers/block/Kconfig
>> +++ b/drivers/block/Kconfig
>> @@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
>> suggested to enable N if your application(ublk server) switches to
>> ioctl command encoding.
>>
>> +config BLK_DEV_UBLK_ZONED
>> + def_bool y
>
> This can be "bool" only.
I did it like this because I wanted BLK_DEV_UBLK_ZONED to be set
automatically to "y" when BLK_DEV_UBLK and BLK_DEV_ZONED are enabled.
BLK_DEV_UBLK_ZONED has no menu entry. If we change it to "bool" it will
be default "n" and we would have to make it appear in menuconfig to be
manually enabled.
Isn't it more sane if it is just unconditionally enabled when
BLK_DEV_ZONED and BLK_DEV_UBLK are enabled?
>
>> + depends on BLK_DEV_UBLK && BLK_DEV_ZONED
>> +
>> source "drivers/block/rnbd/Kconfig"
>>
>> endif # BLK_DEV
>> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
>> index 101612cba303..bc1649e20ec2 100644
>> --- a/drivers/block/Makefile
>> +++ b/drivers/block/Makefile
>> @@ -37,6 +37,8 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
>>
>> obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
>>
>> -obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
>> +obj-$(CONFIG_BLK_DEV_UBLK) += ublk.o
>> +ublk-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
>> +ublk-$(CONFIG_BLK_DEV_UBLK_ZONED) += ublk_drv-zoned.o
>>
>> swim_mod-y := swim.o swim_asm.o
>> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
>> new file mode 100644
>> index 000000000000..ea86bf4b3681
>> --- /dev/null
>> +++ b/drivers/block/ublk_drv-zoned.c
>> @@ -0,0 +1,144 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright 2023 Andreas Hindborg <[email protected]>
>> + */
>> +#include <linux/blkzoned.h>
>> +#include <linux/ublk_cmd.h>
>> +#include <linux/vmalloc.h>
>> +
>> +#include "ublk_drv.h"
>> +
>> +void ublk_set_nr_zones(struct ublk_device *ub)
>> +{
>> + const struct ublk_param_basic *p = &ub->params.basic;
>> +
>> + if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
>
> If UBLK_F_ZONED is set but chunk_sectors is not, that is a bug on the user
> driver side, no ? So an error message about that would be nice instead of
> silently ignoring the zoned flag.
Yes, I'll add a check to `ublk_validate_params()`.
>
>> + ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>> +}
>> +
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> +{
>> + const struct ublk_param_zoned *p = &ub->params.zoned;
>> +
>> + if (ub->dev_info.flags & UBLK_F_ZONED) {
>> + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>> + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>
> You do not need to check if the max_active_zones and max_open_zones values are
> sensible ? E.g. what if they are larger than the number of zones ?
I don't think it will be a problem. I assume you can set them to 0 to
indicate infinite. If user space sets them to more than the actual
number of available zones (that user space set up also), user space will
have to handle that when it gets a zone request for a zone outside the
logical drive LBA space. I don't think the kernel has to care. Will this
break anything kernel side?
>
>> + }
>> +}
>> +
>> +int ublk_revalidate_disk_zones(struct gendisk *disk)
>> +{
>> + return blk_revalidate_disk_zones(disk, NULL);
>> +}
>
> I do not think this helper is needed at all (see below comment on the call site).
This is required because the prototype for `blk_revalidate_disk_zones()`
is not defined when BLK_DEV_ZONED is not defined. Without this helper
for which the prototype is always defined, we will have a compile error
at the call site.
>
>> +
>> +/* Based on virtblk_alloc_report_buffer */
>> +static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
>> + unsigned int nr_zones,
>> + unsigned int zone_sectors, size_t *buflen)
>> +{
>> + struct request_queue *q = ublk->ub_disk->queue;
>> + size_t bufsize;
>> + void *buf;
>> +
>> + nr_zones = min_t(unsigned int, nr_zones,
>> + get_capacity(ublk->ub_disk) >> ilog2(zone_sectors));
>> +
>> + bufsize = nr_zones * sizeof(struct blk_zone);
>> + bufsize =
>> + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
>> + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
>> +
>> + while (bufsize >= sizeof(struct blk_zone)) {
>> + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>> + if (buf) {
>> + *buflen = bufsize;
>> + return buf;
>> + }
>> + bufsize >>= 1;
>> + }
>> +
>> + bufsize = 0;
>
> This is not needed. This should rather be "*buflen = 0;".
Yes, this is an ugly oops.
>
>> + return NULL;
>> +}
>> +
>> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> + unsigned int nr_zones, report_zones_cb cb, void *data)
>> +{
>> + struct ublk_device *ub = disk->private_data;
>> + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
>> + unsigned int first_zone = sector >> ilog2(zone_size_sectors);
>> + unsigned int done_zones = 0;
>> + unsigned int max_zones_per_request;
>> + struct blk_zone *buffer;
>> + size_t buffer_length;
>> +
>> + if (!(ub->dev_info.flags & UBLK_F_ZONED))
>
> This is repeated a lot. So a small inline helper ublk_dev_is_zoned() would be nice.
Yes, thanks.
>
>> + return -EOPNOTSUPP;
>> +
>> + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
>> + nr_zones);
>> +
>> + buffer = ublk_alloc_report_buffer(ub, nr_zones, zone_size_sectors,
>> + &buffer_length);
>> + if (!buffer)
>> + return -ENOMEM;
>> +
>> + max_zones_per_request = buffer_length / sizeof(struct blk_zone);
>> +
>> + while (done_zones < nr_zones) {
>> + unsigned int remaining_zones = nr_zones - done_zones;
>> + unsigned int zones_in_request = min_t(
>> + unsigned int, remaining_zones, max_zones_per_request);
>> + int err = 0;
>> + struct request *req;
>> + struct ublk_rq_data *pdu;
>> + blk_status_t status;
>> +
>> + memset(buffer, 0, buffer_length);
>> +
>> + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
>> + if (IS_ERR(req))
>> + return PTR_ERR(req);
>
> You are leaking buffer.
[emoji reply; garbled in the archive]
>
>> +
>> + pdu = blk_mq_rq_to_pdu(req);
>> + pdu->operation = UBLK_IO_OP_REPORT_ZONES;
>> + pdu->sector = sector;
>> + pdu->nr_sectors = remaining_zones * zone_size_sectors;
>> +
>> + err = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
>> + GFP_KERNEL);
>> + if (err) {
>> + blk_mq_free_request(req);
>> + kvfree(buffer);
>> + return err;
>> + }
>> +
>> + status = blk_execute_rq(req, 0);
>> + err = blk_status_to_errno(status);
>> + blk_mq_free_request(req);
>> + if (err) {
>> + kvfree(buffer);
>> + return err;
>
> You are repeating this a lot. Use a goto to cleanup on error.
Ok
>
>> + }
>> +
>> + for (unsigned int i = 0; i < zones_in_request; i++) {
>> + struct blk_zone *zone = buffer + i;
>> +
>> + err = cb(zone, i, data);
>> + if (err)
>> + return err;
>> +
>> + done_zones++;
>> + sector += zone_size_sectors;
>> +
>> + /* A zero length zone means don't ask for more zones */
>> + if (!zone->len) {
>> + kvfree(buffer);
>> + return done_zones;
>> + }
>> + }
>> + }
>> +
>> + kvfree(buffer);
>> + return done_zones;
>> +}
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index e519dc0d9fe7..88fa39853c61 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -57,12 +57,13 @@
>> | UBLK_F_USER_RECOVERY_REISSUE \
>> | UBLK_F_UNPRIVILEGED_DEV \
>> | UBLK_F_CMD_IOCTL_ENCODE \
>> - | UBLK_F_USER_COPY)
>> + | UBLK_F_USER_COPY \
>> + | UBLK_F_ZONED)
>>
>> /* All UBLK_PARAM_TYPE_* should be included here */
>> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
>> - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
>> -
>> +#define UBLK_PARAM_TYPE_ALL \
>> + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
>> + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
>>
>> struct ublk_uring_cmd_pdu {
>> struct ublk_queue *ubq;
>> @@ -209,6 +210,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>> set_disk_ro(ub->ub_disk, true);
>>
>> set_capacity(ub->ub_disk, p->dev_sectors);
>> +
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> + ublk_set_nr_zones(ub);
>
> So if the user is attempting to setup a zoned drive but the kernel does not have
> CONFIG_BLK_DEV_ZONED=y, the user setup will be silently ignored. Not exactly
> nice I think, unless I am missing something.
We have this in `ublk_ctrl_add_dev()`:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
ub->dev_info.flags &= ~UBLK_F_ZONED;
User space is supposed to check the flags after the call to know the
kernel capabilities.
>
> Also, repeating that "if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))" for all zone
> related functions is very verbose. Stub the functions in ublk_drv.h. That will
> make the main C code lighter.
Not sure what you mean by "stub the functions". Like so:
@@ -216,8 +216,7 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
set_capacity(ub->ub_disk, p->dev_sectors);
- if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
- ublk_set_nr_zones(ub);
+ ublk_config_nr_zones(ub);
}
And then in the header:
void ublk_config_nr_zones(struct ublk_device *ub)
{
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
ublk_set_nr_zones(ub);
}
Or something else?
>
>> }
>>
>> static void ublk_dev_param_discard_apply(struct ublk_device *ub)
>> @@ -269,6 +273,9 @@ static int ublk_apply_params(struct ublk_device *ub)
>> if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
>> ublk_dev_param_discard_apply(ub);
>>
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
>> + ublk_dev_param_zoned_apply(ub);
>
> Similar comment as above. If the user tries to apply zoned parameters to a non
> zoned drive, no error reported...
>
>> +
>> return 0;
>> }
>>
>> @@ -439,6 +446,7 @@ static const struct block_device_operations ub_fops = {
>> .owner = THIS_MODULE,
>> .open = ublk_open,
>> .free_disk = ublk_free_disk,
>> + .report_zones = ublk_report_zones,
>> };
>>
>> #define UBLK_MAX_PIN_PAGES 32
>> @@ -553,7 +561,8 @@ static inline bool ublk_need_map_req(const struct request *req)
>>
>> static inline bool ublk_need_unmap_req(const struct request *req)
>> {
>> - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
>> + return ublk_rq_has_data(req) &&
>> + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
>> }
>>
>> static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
>> @@ -637,6 +646,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>> {
>> struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
>> struct ublk_io *io = &ubq->ios[req->tag];
>> + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
>> u32 ublk_op;
>>
>> switch (req_op(req)) {
>> @@ -655,6 +665,35 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>> case REQ_OP_WRITE_ZEROES:
>> ublk_op = UBLK_IO_OP_WRITE_ZEROES;
>> break;
>> + case REQ_OP_ZONE_OPEN:
>> + ublk_op = UBLK_IO_OP_ZONE_OPEN;
>> + break;
>> + case REQ_OP_ZONE_CLOSE:
>> + ublk_op = UBLK_IO_OP_ZONE_CLOSE;
>> + break;
>> + case REQ_OP_ZONE_FINISH:
>> + ublk_op = UBLK_IO_OP_ZONE_FINISH;
>> + break;
>> + case REQ_OP_ZONE_RESET:
>> + ublk_op = UBLK_IO_OP_ZONE_RESET;
>> + break;
>> + case REQ_OP_DRV_IN:
>> + ublk_op = pdu->operation;
>> + switch (ublk_op) {
>> + case UBLK_IO_OP_REPORT_ZONES:
>> + iod->op_flags = ublk_op | ublk_req_build_flags(req);
>> + iod->nr_sectors = pdu->nr_sectors;
>> + iod->start_sector = pdu->sector;
>> + iod->addr = io->addr;
>> + return BLK_STS_OK;
>> + default:
>> + return BLK_STS_IOERR;
>> + }
>> + case REQ_OP_ZONE_APPEND:
>> + case REQ_OP_ZONE_RESET_ALL:
>> + case REQ_OP_DRV_OUT:
>> + /* We do not support zone append or reset_all yet */
>> + fallthrough;
>
> Not OK ! zone append is mandatory for zoned block devices. So zone append
> support needs to come with this patch. reset all can be a different patch as
> that is optional.
I will merge the patches. I thought it would be easier to review when split.
>
>> default:
>> return BLK_STS_IOERR;
>> }
>> @@ -708,7 +747,8 @@ static inline void __ublk_complete_rq(struct request *req)
>> *
>> * Both the two needn't unmap.
>> */
>> - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
>> + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
>> + req_op(req) != REQ_OP_DRV_IN)
>> goto exit;
>>
>> /* for READ request, writing data in iod->addr to rq buffers */
>> @@ -1835,6 +1875,15 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
>> if (ub->nr_privileged_daemon != ub->nr_queues_ready)
>> set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
>>
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> + ub->dev_info.flags & UBLK_F_ZONED) {
>
> Same comment as above about error return instead of silently ignoring the zoned
> flag for the CONFIG_BLK_DEV_ZONED=n case.
>
>> + disk_set_zoned(disk, BLK_ZONED_HM);
>> + blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
>> + ret = ublk_revalidate_disk_zones(disk);
>> + if (ret)
>> + goto out_put_disk;
>
> This should be all a helper ublk_set_zoned() or something.
Ok. Unfortunately this block of code is split up in the zone append
patch. I will see what I can do, maybe 2 functions.
>
>> + }
>> +
>> get_device(&ub->cdev_dev);
>> ub->dev_info.state = UBLK_S_DEV_LIVE;
>> ret = add_disk(disk);
>> @@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>> if (ub->dev_info.flags & UBLK_F_USER_COPY)
>> ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
>>
>> + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> + ub->dev_info.flags &= ~UBLK_F_ZONED;
>
> Arg, no. The user should be notified with an error that he/she is attempting to
> create a zoned device that cannot be supported.
As I wrote above, this is part of the ublk uapi. This is the way user
space does kernel capability check. User space will check the flags when
this call returns. @Ming did I understand this correctly or did I miss something?
>
>> +
>> /* We are not ready to support zero copy */
>> ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>>
>> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
>> index f81e62256456..7242430fd6b9 100644
>> --- a/drivers/block/ublk_drv.h
>> +++ b/drivers/block/ublk_drv.h
>> @@ -50,6 +50,21 @@ struct ublk_rq_data {
>> struct llist_node node;
>>
>> struct kref ref;
>> + enum ublk_op operation;
>> + __u64 sector;
>> + __u32 nr_sectors;
>> };
>>
>> +void ublk_set_nr_zones(struct ublk_device *ub);
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
>> +int ublk_revalidate_disk_zones(struct gendisk *disk);
>> +
>> +#ifdef CONFIG_BLK_DEV_UBLK_ZONED
>> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> + unsigned int nr_zones, report_zones_cb cb,
>> + void *data);
>> +#else
>> +#define ublk_report_zones NULL
>> +#endif
>> +
>> #endif
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 471b3b983045..436525afffe8 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -176,6 +176,11 @@
>> /* Copy between request and user buffer by pread()/pwrite() */
>> #define UBLK_F_USER_COPY (1UL << 7)
>>
>> +/*
>> + * Enable zoned device support
>
> Isn't this for "Indicate that the device is zoned" ?
User space sets this flag when setting up the device to enable zoned
storage support. Kernel may reject it. I think the wording is
sufficient. Could be updated to:
"User space sets this flag when setting up the device to request zoned
storage support. Kernel may deny the request by clearing the flag
during setup."
>
>> + */
>> +#define UBLK_F_ZONED (1ULL << 8)
>> +
>> /* device state */
>> #define UBLK_S_DEV_DEAD 0
>> #define UBLK_S_DEV_LIVE 1
>> @@ -242,6 +247,7 @@ enum ublk_op {
>> UBLK_IO_OP_ZONE_APPEND = 13,
>> UBLK_IO_OP_ZONE_RESET = 15,
>> __UBLK_IO_OP_DRV_IN_START = 32,
>> + UBLK_IO_OP_REPORT_ZONES = __UBLK_IO_OP_DRV_IN_START,
>> __UBLK_IO_OP_DRV_IN_END = 96,
>> __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
>> __UBLK_IO_OP_DRV_OUT_END = 160,
>> @@ -342,6 +348,12 @@ struct ublk_param_devt {
>> __u32 disk_minor;
>> };
>>
>> +struct ublk_param_zoned {
>> + __u32 max_open_zones;
>> + __u32 max_active_zones;
>> + __u8 reserved[24];
>> +};
>> +
>> struct ublk_params {
>> /*
>> * Total length of parameters, userspace has to set 'len' for both
>> @@ -353,11 +365,13 @@ struct ublk_params {
>> #define UBLK_PARAM_TYPE_BASIC (1 << 0)
>> #define UBLK_PARAM_TYPE_DISCARD (1 << 1)
>> #define UBLK_PARAM_TYPE_DEVT (1 << 2)
>> +#define UBLK_PARAM_TYPE_ZONED (1 << 3)
>> __u32 types; /* types of parameter included */
>>
>> struct ublk_param_basic basic;
>> struct ublk_param_discard discard;
>> struct ublk_param_devt devt;
>> + struct ublk_param_zoned zoned;
>> };
>>
>> #endif
On 6/29/23 16:50, Andreas Hindborg (Samsung) wrote:
>>> UCLINUX (M68KNOMMU AND COLDFIRE)
>>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>>> index 5b9d4aaebb81..c58dfd035557 100644
>>> --- a/drivers/block/Kconfig
>>> +++ b/drivers/block/Kconfig
>>> @@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
>>> suggested to enable N if your application(ublk server) switches to
>>> ioctl command encoding.
>>>
>>> +config BLK_DEV_UBLK_ZONED
>>> + def_bool y
>>
>> This can be "bool" only.
>
> I did it like this because I wanted BLK_DEV_UBLK_ZONED to be set
> automatically to "y" when BLK_DEV_UBLK and BLK_DEV_ZONED are enabled.
> BLK_DEV_UBLK_ZONED has no menu entry. If we change it to "bool" it will
> be default "n" and we would have to make it appear in menuconfig to be
> manually enabled.
>
> Isn't it more sane if it is just unconditionally enabled when
> BLK_DEV_ZONED and BLK_DEV_UBLK is enabled?
Maybe something like this then:
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 5b9d4aaebb81..105e1121bf5f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -370,9 +370,13 @@ config BLK_DEV_RBD
If unsure, say N.
+config BLK_DEV_UBLK_ZONED
+ bool
+
config BLK_DEV_UBLK
tristate "Userspace block driver (Experimental)"
select IO_URING
+ select BLK_DEV_UBLK_ZONED if BLK_DEV_ZONED
help
io_uring based userspace block driver. Together with ublk server, ublk
has been working well, but interface with userspace or command data
>>> + ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>>> +}
>>> +
>>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>>> +{
>>> + const struct ublk_param_zoned *p = &ub->params.zoned;
>>> +
>>> + if (ub->dev_info.flags & UBLK_F_ZONED) {
>>> + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>>> + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>>
>> You do not need to check if the max_active_zones and max_open_zones values are
>> sensible ? E.g. what if they are larger than the number of zones ?
>
> I don't think it will be a problem. I assume you can set them to 0 to
> indicate infinite. If user space sets them to more than the actual
> number of available zones (that user space set up also), user space will
> have to handle that when it gets a zone request for a zone outside the
> logical drive LBA space. I don't think the kernel has to care. Will this
> break anything kernel side?
zonefs and btrfs look at these limits. So I would rather prefer seeing sensible
values. 0 == no limit is fine. But seeing maz or moz > nr_zones for the device
is just too strange and I would prefer an error in that case to let the user
know it is not doing something right. Getting moz > maz is also nonsense that
needs to be checked. There is one case we could silently change: if maz ==
nr_zones or moz == nr_zones, then that essentially means "no limit", so we could
use 0 to let the in-kernel users know that they do not need to care about the limits.
>
>>
>>> + }
>>> +}
>>> +
>>> +int ublk_revalidate_disk_zones(struct gendisk *disk)
>>> +{
>>> + return blk_revalidate_disk_zones(disk, NULL);
>>> +}
>>
>> I do not think this helper is needed at all (see below comment on the call site).
>
> This is required because the prototype for `blk_revalidate_disk_zones()`
> is not defined when BLK_DEV_ZONED is not defined. Without this helper
> for which the prototype is always defined, we will have a compile error
> at the call site.
You have this hunk in ublk_ctrl_start_dev:
disk_set_zoned(disk, BLK_ZONED_HM);
blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
ret = ublk_revalidate_disk_zones(disk);
if (ret)
goto out_put_disk;
And that is called under "if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)". So I do not see
how you can get compile errors by directly using blk_revalidate_disk_zones(). The
entire hunk above should be a helper function I think.
>>> +
>>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>>> + ublk_set_nr_zones(ub);
>>
>> So if the user is attempting to setup a zoned drive but the kernel does not have
>> CONFIG_BLK_DEV_ZONED=y, the user setup will be silently ignored. Not exactly
>> nice I think, unless I am missing something.
>
> We have this in `ublk_ctrl_add_dev()`:
>
> if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> ub->dev_info.flags &= ~UBLK_F_ZONED;
>
> User space is supposed to check the flags after the call to know the
> kernel capabilities.
And you trust user space to be always correct ? I never do :)
So definitely, check and error if not supported. No silently clearing the device
zoned flag.
>> Also, repeating that "if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))" for all zone
>> related functions is very verbose. Stub the functions in ublk_drv.h. That will
>> make the main C code lighter.
>
> Not sure what you mean "stub the functions". Like so:
>
> @@ -216,8 +216,7 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>
> set_capacity(ub->ub_disk, p->dev_sectors);
>
> - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> - ublk_set_nr_zones(ub);
> + ublk_config_nr_zones(ub);
> }
>
> And then in the header:
>
> void ublk_config_nr_zones(struct ublk_device *ub)
> {
> if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> ublk_set_nr_zones(ub);
> }
>
> Or something else?
#ifndef CONFIG_BLK_DEV_ZONED
static inline void ublk_set_nr_zones() {}
#endif
That avoids repeating that if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) all over the
place. The stubbed functions can be called only for "if (ub->dev_info.flags &
UBLK_F_ZONED)", which is OK since this flag is never supposed to be accepted and
set for the CONFIG_BLK_DEV_ZONED=n case.
>>> + disk_set_zoned(disk, BLK_ZONED_HM);
>>> + blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
>>> + ret = ublk_revalidate_disk_zones(disk);
>>> + if (ret)
>>> + goto out_put_disk;
>>
>> This should be all a helper ublk_set_zoned() or something.
>
> Ok ???? Unfortunately this block of code is split up in the zone append
> patch. I will see what I can do, maybe 2 functions.
Beware that I just posted patches that require zone size (chunk_sectors limit)
and max append sectors limit to be set *before* calling
blk_revalidate_disk_zones(). Otherwise, you will get an error.
>>> get_device(&ub->cdev_dev);
>>> ub->dev_info.state = UBLK_S_DEV_LIVE;
>>> ret = add_disk(disk);
>>> @@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>>> if (ub->dev_info.flags & UBLK_F_USER_COPY)
>>> ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
>>>
>>> + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>>> + ub->dev_info.flags &= ~UBLK_F_ZONED;
>>
>> Arg, no. The user should be notified with an error that he/she is attempting to
>> create a zoned device that cannot be supported.
>
> As I wrote above, this is part of the ublk uapi. This is the way user
> space does kernel capability check. User space will check the flags when
> this call returns. @Ming did I understand this correctly or did I miss something?
OK. So this is like a virtio capability flags thing ? Negotiation between device
and driver ?
>>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>>> index 471b3b983045..436525afffe8 100644
>>> --- a/include/uapi/linux/ublk_cmd.h
>>> +++ b/include/uapi/linux/ublk_cmd.h
>>> @@ -176,6 +176,11 @@
>>> /* Copy between request and user buffer by pread()/pwrite() */
>>> #define UBLK_F_USER_COPY (1UL << 7)
>>>
>>> +/*
>>> + * Enable zoned device support
>>
>> Isn't this for "Indicate that the device is zoned" ?
>
> User space sets this flag when setting up the device to enable zoned
> storage support. Kernel may reject it. I think the wording is
> sufficient. Could be updated to:
>
> "User space sets this flag when setting up the device to request zoned
> storage support. Kernel may deny the request by clearing the flag
> during setup."
I really have a big problem with this. If the user driver does not check that
the flag was cleared, the user will assume that the device is zoned, but it is
not. This is not super nice... I would really prefer just failing the device
creation if zoned was requested but cannot be satisfied, either because the
kernel does not support zoned block devices or when the user specified something
nonsensical.
--
Damien Le Moal
Western Digital Research
On 6/29/23 18:17, Andreas Hindborg (Samsung) wrote:
>
> Ming Lei <[email protected]> writes:
>
>> On Wed, Jun 28, 2023 at 09:06:49PM +0200, Andreas Hindborg wrote:
>>> From: Andreas Hindborg <[email protected]>
>>>
>>> Add zone append feature to ublk. This feature uses the `addr` field of `struct
>>> ublksrv_io_cmd`. Therefore ublk must be used with the user copy
>>> feature (UBLK_F_USER_COPY) for zone append to be available. Without this
>>> feature, ublk will fail zone append requests.
>>
>> Given zone append is a must, please fail to add device in case of zoned
>> and !user_copy, then we can make fast IO code path clean.
>
> I will squash the patches and reject zone support if not user copy is
> enabled ????
Or if !CONFIG_BLK_DEV_ZONED or if the user specifies a bad parameter (invalid
limits etc).
--
Damien Le Moal
Western Digital Research
Ming Lei <[email protected]> writes:
> On Wed, Jun 28, 2023 at 09:06:49PM +0200, Andreas Hindborg wrote:
>> From: Andreas Hindborg <[email protected]>
>>
>> Add zone append feature to ublk. This feature uses the `addr` field of `struct
>> ublksrv_io_cmd`. Therefore ublk must be used with the user copy
>> feature (UBLK_F_USER_COPY) for zone append to be available. Without this
>> feature, ublk will fail zone append requests.
>
> Given zone append is a must, please fail to add device in case of zoned
> and !user_copy, then we can make fast IO code path clean.
I will squash the patches and reject zone support if user copy is not
enabled.
>
>>
>> Suggested-by: Ming Lei <[email protected]>
>> Signed-off-by: Andreas Hindborg <[email protected]>
>> ---
>> drivers/block/ublk_drv-zoned.c | 11 +++++++++
>> drivers/block/ublk_drv.c | 43 ++++++++++++++++++++++++++++++----
>> drivers/block/ublk_drv.h | 1 +
>> include/uapi/linux/ublk_cmd.h | 3 ++-
>> 4 files changed, 52 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
>> index ea86bf4b3681..007e8fc7ff25 100644
>> --- a/drivers/block/ublk_drv-zoned.c
>> +++ b/drivers/block/ublk_drv-zoned.c
>> @@ -16,6 +16,16 @@ void ublk_set_nr_zones(struct ublk_device *ub)
>> ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>> }
>>
>> +int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
>> +{
>> + const struct ublk_param_zoned *p = &ub->params.zoned;
>> +
>> + if (! p->max_zone_append_sectors)
>> + return -EINVAL;
>> +
>> + return 0;
>> +}
>> +
>> void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> {
>> const struct ublk_param_zoned *p = &ub->params.zoned;
>> @@ -23,6 +33,7 @@ void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> if (ub->dev_info.flags & UBLK_F_ZONED) {
>> disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>> disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>> + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
>> }
>> }
>>
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index 88fa39853c61..6a949669b47e 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -107,6 +107,11 @@ struct ublk_uring_cmd_pdu {
>> */
>> #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
>>
>> +/*
>> + * Set when IO is Zone Append
>> + */
>> +#define UBLK_IO_FLAG_ZONE_APPEND 0x10
>> +
>> struct ublk_io {
>> /* userspace buffer address from io cmd */
>> __u64 addr;
>> @@ -230,6 +235,8 @@ static void ublk_dev_param_discard_apply(struct ublk_device *ub)
>>
>> static int ublk_validate_params(const struct ublk_device *ub)
>> {
>> + int ret = 0;
>> +
>> /* basic param is the only one which must be set */
>> if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
>> const struct ublk_param_basic *p = &ub->params.basic;
>> @@ -260,6 +267,13 @@ static int ublk_validate_params(const struct ublk_device *ub)
>> if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
>> return -EINVAL;
>>
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> + (ub->params.types & UBLK_PARAM_TYPE_ZONED)) {
>> + ret = ublk_dev_param_zoned_validate(ub);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> return 0;
>> }
>>
>> @@ -690,6 +704,11 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>> return BLK_STS_IOERR;
>> }
>> case REQ_OP_ZONE_APPEND:
>> + if (!(ubq->dev->dev_info.flags & UBLK_F_USER_COPY))
>> + return BLK_STS_IOERR;
>> + ublk_op = UBLK_IO_OP_ZONE_APPEND;
>> + io->flags |= UBLK_IO_FLAG_ZONE_APPEND;
>> + break;
>> case REQ_OP_ZONE_RESET_ALL:
>> case REQ_OP_DRV_OUT:
>> /* We do not support zone append or reset_all yet */
>> @@ -1112,6 +1131,12 @@ static void ublk_commit_completion(struct ublk_device *ub,
>> /* find the io request and complete */
>> req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>>
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
>> + if (io->flags & UBLK_IO_FLAG_ZONE_APPEND)
>> + req->__sector = ub_cmd->addr;
>> + io->flags &= ~UBLK_IO_FLAG_ZONE_APPEND;
>> + }
>> +
>> if (req && likely(!blk_should_fake_timeout(req->q)))
>> ublk_put_req_ref(ubq, req);
>> }
>> @@ -1411,7 +1436,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>> ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
>> goto out;
>>
>> - if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
>> + if (ublk_support_user_copy(ubq) &&
>> + !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
>> ret = -EINVAL;
>> goto out;
>> }
>> @@ -1534,11 +1560,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
>> int ubuf_dir)
>> {
>> /* copy ubuf to request pages */
>> - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
>> + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
>> + ubuf_dir == ITER_SOURCE)
>> return true;
>>
>> /* copy request pages to ubuf */
>> - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
>> + if ((req_op(req) == REQ_OP_WRITE ||
>> + req_op(req) == REQ_OP_ZONE_APPEND) &&
>> + ubuf_dir == ITER_DEST)
>> return true;
>>
>> return false;
>> @@ -1867,6 +1896,12 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
>> ub->dev_info.ublksrv_pid = ublksrv_pid;
>> ub->ub_disk = disk;
>>
>> + ub->dev_info.state = UBLK_S_DEV_LIVE;
>
> I guess the above line change isn't necessary?
It needs to go before the call to `ublk_revalidate_disk_zones()` because
that does IO to the disk in the form of a call to `report_zones()`. So I
can move it down a little bit.
Best regards,
Andreas
> On 29/06/23, 12:37 AM, "Andreas Hindborg" <[email protected]
> <mailto:[email protected]>> wrote:
>
>
> From: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
>
>
> Hi All,
>
>
> This patch set adds zoned storage support to `ublk`. The first two
> patches does
> some house cleaning in preparation for the last two patches. The third
> patch
> adds support for report_zones and the following operations:
>
Just to clarify, we do need your ublk user space patches
to create a ublk device node (with these patches in kernel), right ?
>
> - REQ_OP_ZONE_OPEN
> - REQ_OP_ZONE_CLOSE
> - REQ_OP_ZONE_FINISH
> - REQ_OP_ZONE_RES
REQ_OP_ZONE_RESET
>
>
> The last patch adds support for REQ_OP_ZONE_APPEND.
>
>
> v3 [2] -> v4 changes:
> - Split up v3 patches
> - Add zone append support
> - Change order of variables in `ublk_report_zones`
>
>
> Read/write and zone operations are tested with zenfs [3].
>
>
> The zone append path is tested with fio -> zonefs -> ublk -> null_blk.
>
>
> The implementation of zone append requires ublk user copy feature, and
> therefore
> the series is based on branch for-next (6afa337a3789) of [4].
>
>
> [1]
> https://github.com/metaspace/ubdsrv/commit/7de0d901c329fde7dc5a2e998952dd88bf5e668b
> <https://github.com/metaspace/ubdsrv/commit/7de0d901c329fde7dc5a2e998952dd88bf5e668b>
> [2]
> https://lore.kernel.org/linux-block/[email protected]
> <mailto:[email protected]>/
> [3] https://github.com/westerndigitalcorporation/zenfs
> <https://github.com/westerndigitalcorporation/zenfs>
> [4] https://git.kernel.dk/linux.git <https://git.kernel.dk/linux.git>
>
>
> Andreas Hindborg (4):
> ublk: change ublk IO command defines to enum
> ublk: move types to shared header file
> ublk: enable zoned storage support
> ublk: add zone append
>
>
> MAINTAINERS | 2 +
> drivers/block/Kconfig | 4 +
> drivers/block/Makefile | 4 +-
> drivers/block/ublk_drv-zoned.c | 155 +++++++++++++++++++++++++++++++++
> drivers/block/ublk_drv.c | 150 +++++++++++++++++++------------
> drivers/block/ublk_drv.h | 71 +++++++++++++++
> include/uapi/linux/ublk_cmd.h | 38 ++++++--
> 7 files changed, 363 insertions(+), 61 deletions(-)
> create mode 100644 drivers/block/ublk_drv-zoned.c
> create mode 100644 drivers/block/ublk_drv.h
>
>
>
>
> base-commit: 3261ea42710e9665c9151006049411bd23b5411f
Regards,
Aravind
> From: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
>
>
> Add zoned storage support to ublk: report_zones and operations:
> - REQ_OP_ZONE_OPEN
> - REQ_OP_ZONE_CLOSE
> - REQ_OP_ZONE_FINISH
> - REQ_OP_ZONE_RESET
>
>
> Note: This commit changes the ublk kernel module name from
> `ublk_drv.ko` to
> `ublk.ko` in order to link multiple translation units into the module.
>
>
> Signed-off-by: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
> ---
> MAINTAINERS | 1 +
> drivers/block/Kconfig | 4 +
> drivers/block/Makefile | 4 +-
> drivers/block/ublk_drv-zoned.c | 144 +++++++++++++++++++++++++++++++++
> drivers/block/ublk_drv.c | 64 +++++++++++++--
> drivers/block/ublk_drv.h | 15 ++++
> include/uapi/linux/ublk_cmd.h | 14 ++++
> 7 files changed, 239 insertions(+), 7 deletions(-)
> create mode 100644 drivers/block/ublk_drv-zoned.c
>
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ace71c90751c..db8a8deb5926 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -21555,6 +21555,7 @@ S: Maintained
> F: Documentation/block/ublk.rst
> F: drivers/block/ublk_drv.c
> F: drivers/block/ublk_drv.h
> +F: drivers/block/ublk_drv-zoned.c
nitpick: checkpatch.pl gives a warning on the ordering of the files.
WARNING: Misordered MAINTAINERS entry - list file patterns in alphabetic
order
> F: include/uapi/linux/ublk_cmd.h
>
>
> UCLINUX (M68KNOMMU AND COLDFIRE)
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 5b9d4aaebb81..c58dfd035557 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
> suggested to enable N if your application(ublk server) switches to
> ioctl command encoding.
>
>
> +config BLK_DEV_UBLK_ZONED
> + def_bool y
> + depends on BLK_DEV_UBLK && BLK_DEV_ZONED
> +
> source "drivers/block/rnbd/Kconfig"
>
>
> endif # BLK_DEV
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 101612cba303..bc1649e20ec2 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -37,6 +37,8 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
>
>
> obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
>
>
> -obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
> +obj-$(CONFIG_BLK_DEV_UBLK) += ublk.o
> +ublk-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
> +ublk-$(CONFIG_BLK_DEV_UBLK_ZONED) += ublk_drv-zoned.o
>
>
> swim_mod-y := swim.o swim_asm.o
> diff --git a/drivers/block/ublk_drv-zoned.c
> b/drivers/block/ublk_drv-zoned.c
> new file mode 100644
> index 000000000000..ea86bf4b3681
> --- /dev/null
> +++ b/drivers/block/ublk_drv-zoned.c
> @@ -0,0 +1,144 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2023 Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
> + */
> +#include <linux/blkzoned.h>
> +#include <linux/ublk_cmd.h>
> +#include <linux/vmalloc.h>
> +
> +#include "ublk_drv.h"
> +
> +void ublk_set_nr_zones(struct ublk_device *ub)
> +{
> + const struct ublk_param_basic *p = &ub->params.basic;
> +
> + if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
> + ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> +}
> +
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> +{
> + const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> + if (ub->dev_info.flags & UBLK_F_ZONED) {
> + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> + }
> +}
> +
> +int ublk_revalidate_disk_zones(struct gendisk *disk)
> +{
> + return blk_revalidate_disk_zones(disk, NULL);
> +}
> +
> +/* Based on virtblk_alloc_report_buffer */
> +static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
> + unsigned int nr_zones,
> + unsigned int zone_sectors, size_t *buflen)
> +{
> + struct request_queue *q = ublk->ub_disk->queue;
> + size_t bufsize;
> + void *buf;
> +
> + nr_zones = min_t(unsigned int, nr_zones,
> + get_capacity(ublk->ub_disk) >> ilog2(zone_sectors));
> +
> + bufsize = nr_zones * sizeof(struct blk_zone);
> + bufsize =
> + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
> + bufsize = min_t(size_t, bufsize, queue_max_segments(q) <<
> PAGE_SHIFT);
> +
> + while (bufsize >= sizeof(struct blk_zone)) {
> + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> + if (buf) {
> + *buflen = bufsize;
> + return buf;
> + }
> + bufsize >>= 1;
> + }
> +
> + bufsize = 0;
> + return NULL;
> +}
> +
> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb, void *data)
> +{
> + struct ublk_device *ub = disk->private_data;
> + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
> + unsigned int first_zone = sector >> ilog2(zone_size_sectors);
> + unsigned int done_zones = 0;
> + unsigned int max_zones_per_request;
> + struct blk_zone *buffer;
> + size_t buffer_length;
> +
> + if (!(ub->dev_info.flags & UBLK_F_ZONED))
> + return -EOPNOTSUPP;
> +
> + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
> + nr_zones);
> +
> + buffer = ublk_alloc_report_buffer(ub, nr_zones, zone_size_sectors,
> + &buffer_length);
> + if (!buffer)
> + return -ENOMEM;
> +
> + max_zones_per_request = buffer_length / sizeof(struct blk_zone);
> +
> + while (done_zones < nr_zones) {
> + unsigned int remaining_zones = nr_zones - done_zones;
> + unsigned int zones_in_request = min_t(
> + unsigned int, remaining_zones, max_zones_per_request);
> + int err = 0;
> + struct request *req;
> + struct ublk_rq_data *pdu;
> + blk_status_t status;
> +
> + memset(buffer, 0, buffer_length);
> +
> + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
> + if (IS_ERR(req))
> + return PTR_ERR(req);
> +
> + pdu = blk_mq_rq_to_pdu(req);
> + pdu->operation = UBLK_IO_OP_REPORT_ZONES;
> + pdu->sector = sector;
> + pdu->nr_sectors = remaining_zones * zone_size_sectors;
> +
> + err = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
> + GFP_KERNEL);
> + if (err) {
> + blk_mq_free_request(req);
> + kvfree(buffer);
> + return err;
> + }
> +
> + status = blk_execute_rq(req, 0);
> + err = blk_status_to_errno(status);
> + blk_mq_free_request(req);
> + if (err) {
> + kvfree(buffer);
> + return err;
> + }
> +
> + for (unsigned int i = 0; i < zones_in_request; i++) {
> + struct blk_zone *zone = buffer + i;
> +
> + err = cb(zone, i, data);
> + if (err)
> + return err;
> +
> + done_zones++;
> + sector += zone_size_sectors;
> +
> + /* A zero length zone means don't ask for more zones */
> + if (!zone->len) {
> + kvfree(buffer);
> + return done_zones;
> + }
> + }
> + }
> +
> + kvfree(buffer);
> + return done_zones;
> +}
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index e519dc0d9fe7..88fa39853c61 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -57,12 +57,13 @@
> | UBLK_F_USER_RECOVERY_REISSUE \
> | UBLK_F_UNPRIVILEGED_DEV \
> | UBLK_F_CMD_IOCTL_ENCODE \
> - | UBLK_F_USER_COPY)
> + | UBLK_F_USER_COPY \
> + | UBLK_F_ZONED)
>
>
> /* All UBLK_PARAM_TYPE_* should be included here */
> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
> - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
> -
> +#define UBLK_PARAM_TYPE_ALL \
> + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
>
>
> struct ublk_uring_cmd_pdu {
> struct ublk_queue *ubq;
> @@ -209,6 +210,9 @@ static void ublk_dev_param_basic_apply(struct
> ublk_device *ub)
> set_disk_ro(ub->ub_disk, true);
>
>
> set_capacity(ub->ub_disk, p->dev_sectors);
> +
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> + ublk_set_nr_zones(ub);
> }
>
>
> static void ublk_dev_param_discard_apply(struct ublk_device *ub)
> @@ -269,6 +273,9 @@ static int ublk_apply_params(struct ublk_device
> *ub)
> if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
> ublk_dev_param_discard_apply(ub);
>
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types &
> UBLK_PARAM_TYPE_ZONED))
> + ublk_dev_param_zoned_apply(ub);
> +
> return 0;
> }
>
>
> @@ -439,6 +446,7 @@ static const struct block_device_operations ub_fops
> = {
> .owner = THIS_MODULE,
> .open = ublk_open,
> .free_disk = ublk_free_disk,
> + .report_zones = ublk_report_zones,
> };
>
>
> #define UBLK_MAX_PIN_PAGES 32
> @@ -553,7 +561,8 @@ static inline bool ublk_need_map_req(const struct
> request *req)
>
>
> static inline bool ublk_need_unmap_req(const struct request *req)
> {
> - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
> + return ublk_rq_has_data(req) &&
> + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
> }
>
>
> static int ublk_map_io(const struct ublk_queue *ubq, const struct
> request *req,
> @@ -637,6 +646,7 @@ static blk_status_t ublk_setup_iod(struct
> ublk_queue *ubq, struct request *req)
> {
> struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> struct ublk_io *io = &ubq->ios[req->tag];
> + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
> u32 ublk_op;
>
>
> switch (req_op(req)) {
> @@ -655,6 +665,35 @@ static blk_status_t ublk_setup_iod(struct
> ublk_queue *ubq, struct request *req)
> case REQ_OP_WRITE_ZEROES:
> ublk_op = UBLK_IO_OP_WRITE_ZEROES;
> break;
> + case REQ_OP_ZONE_OPEN:
> + ublk_op = UBLK_IO_OP_ZONE_OPEN;
> + break;
> + case REQ_OP_ZONE_CLOSE:
> + ublk_op = UBLK_IO_OP_ZONE_CLOSE;
> + break;
> + case REQ_OP_ZONE_FINISH:
> + ublk_op = UBLK_IO_OP_ZONE_FINISH;
> + break;
> + case REQ_OP_ZONE_RESET:
> + ublk_op = UBLK_IO_OP_ZONE_RESET;
> + break;
> + case REQ_OP_DRV_IN:
> + ublk_op = pdu->operation;
> + switch (ublk_op) {
> + case UBLK_IO_OP_REPORT_ZONES:
> + iod->op_flags = ublk_op | ublk_req_build_flags(req);
> + iod->nr_sectors = pdu->nr_sectors;
> + iod->start_sector = pdu->sector;
> + iod->addr = io->addr;
> + return BLK_STS_OK;
> + default:
> + return BLK_STS_IOERR;
> + }
> + case REQ_OP_ZONE_APPEND:
> + case REQ_OP_ZONE_RESET_ALL:
> + case REQ_OP_DRV_OUT:
> + /* We do not support zone append or reset_all yet */
> + fallthrough;
> default:
> return BLK_STS_IOERR;
> }
> @@ -708,7 +747,8 @@ static inline void __ublk_complete_rq(struct
> request *req)
> *
> * Both the two needn't unmap.
> */
> - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
> + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
> + req_op(req) != REQ_OP_DRV_IN)
> goto exit;
>
>
> /* for READ request, writing data in iod->addr to rq buffers */
> @@ -1835,6 +1875,15 @@ static int ublk_ctrl_start_dev(struct
> ublk_device *ub, struct io_uring_cmd *cmd)
> if (ub->nr_privileged_daemon != ub->nr_queues_ready)
> set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
>
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + ub->dev_info.flags & UBLK_F_ZONED) {
> + disk_set_zoned(disk, BLK_ZONED_HM);
> + blk_queue_required_elevator_features(disk->queue,
> ELEVATOR_F_ZBD_SEQ_WRITE);
> + ret = ublk_revalidate_disk_zones(disk);
> + if (ret)
> + goto out_put_disk;
> + }
> +
> get_device(&ub->cdev_dev);
> ub->dev_info.state = UBLK_S_DEV_LIVE;
> ret = add_disk(disk);
> @@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd
> *cmd)
> if (ub->dev_info.flags & UBLK_F_USER_COPY)
> ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
>
>
> + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
> + ub->dev_info.flags &= ~UBLK_F_ZONED;
> +
> /* We are not ready to support zero copy */
> ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>
>
> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
> index f81e62256456..7242430fd6b9 100644
> --- a/drivers/block/ublk_drv.h
> +++ b/drivers/block/ublk_drv.h
> @@ -50,6 +50,21 @@ struct ublk_rq_data {
> struct llist_node node;
>
>
> struct kref ref;
> + enum ublk_op operation;
> + __u64 sector;
> + __u32 nr_sectors;
> };
>
>
> +void ublk_set_nr_zones(struct ublk_device *ub);
> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> +int ublk_revalidate_disk_zones(struct gendisk *disk);
> +
> +#ifdef CONFIG_BLK_DEV_UBLK_ZONED
> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
> + unsigned int nr_zones, report_zones_cb cb,
> + void *data);
> +#else
> +#define ublk_report_zones NULL
> +#endif
> +
> #endif
> diff --git a/include/uapi/linux/ublk_cmd.h
> b/include/uapi/linux/ublk_cmd.h
> index 471b3b983045..436525afffe8 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -176,6 +176,11 @@
> /* Copy between request and user buffer by pread()/pwrite() */
> #define UBLK_F_USER_COPY (1UL << 7)
>
>
> +/*
> + * Enable zoned device support
> + */
> +#define UBLK_F_ZONED (1ULL << 8)
> +
> /* device state */
> #define UBLK_S_DEV_DEAD 0
> #define UBLK_S_DEV_LIVE 1
> @@ -242,6 +247,7 @@ enum ublk_op {
> UBLK_IO_OP_ZONE_APPEND = 13,
> UBLK_IO_OP_ZONE_RESET = 15,
> __UBLK_IO_OP_DRV_IN_START = 32,
> + UBLK_IO_OP_REPORT_ZONES = __UBLK_IO_OP_DRV_IN_START,
> __UBLK_IO_OP_DRV_IN_END = 96,
> __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
> __UBLK_IO_OP_DRV_OUT_END = 160,
> @@ -342,6 +348,12 @@ struct ublk_param_devt {
> __u32 disk_minor;
> };
>
>
> +struct ublk_param_zoned {
> + __u32 max_open_zones;
> + __u32 max_active_zones;
> + __u8 reserved[24];
> +};
> +
> struct ublk_params {
> /*
> * Total length of parameters, userspace has to set 'len' for both
> @@ -353,11 +365,13 @@ struct ublk_params {
> #define UBLK_PARAM_TYPE_BASIC (1 << 0)
> #define UBLK_PARAM_TYPE_DISCARD (1 << 1)
> #define UBLK_PARAM_TYPE_DEVT (1 << 2)
> +#define UBLK_PARAM_TYPE_ZONED (1 << 3)
> __u32 types; /* types of parameter included */
>
>
> struct ublk_param_basic basic;
> struct ublk_param_discard discard;
> struct ublk_param_devt devt;
> + struct ublk_param_zoned zoned;
> };
>
>
> #endif
I applied the patches and verified it compiles with and without the
CONFIG_BLK_DEV_ZONED flag.
Working on creating a ublk device also.
Regards,
Aravind
> From: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
>
>
> This change is in preparation for ublk zoned storage support.
>
>
> Signed-off-by: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
> ---
> MAINTAINERS | 1 +
> drivers/block/ublk_drv.c | 45 +-------------------------------
> drivers/block/ublk_drv.h | 55 ++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 57 insertions(+), 44 deletions(-)
> create mode 100644 drivers/block/ublk_drv.h
>
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 27ef11624748..ace71c90751c 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -21554,6 +21554,7 @@ L: [email protected]
> <mailto:[email protected]>
> S: Maintained
> F: Documentation/block/ublk.rst
> F: drivers/block/ublk_drv.c
> +F: drivers/block/ublk_drv.h
> F: include/uapi/linux/ublk_cmd.h
>
>
> UCLINUX (M68KNOMMU AND COLDFIRE)
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 1c823750c95a..e519dc0d9fe7 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -45,6 +45,7 @@
> #include <linux/namei.h>
> #include <linux/kref.h>
> #include <uapi/linux/ublk_cmd.h>
> +#include "ublk_drv.h"
>
>
> #define UBLK_MINORS (1U << MINORBITS)
>
>
> @@ -62,11 +63,6 @@
> #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
> UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
>
>
> -struct ublk_rq_data {
> - struct llist_node node;
> -
> - struct kref ref;
> -};
>
>
> struct ublk_uring_cmd_pdu {
> struct ublk_queue *ubq;
> @@ -140,45 +136,6 @@ struct ublk_queue {
>
>
> #define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ)
>
>
> -struct ublk_device {
> - struct gendisk *ub_disk;
> -
> - char *__queues;
> -
> - unsigned int queue_size;
> - struct ublksrv_ctrl_dev_info dev_info;
> -
> - struct blk_mq_tag_set tag_set;
> -
> - struct cdev cdev;
> - struct device cdev_dev;
> -
> -#define UB_STATE_OPEN 0
> -#define UB_STATE_USED 1
> -#define UB_STATE_DELETED 2
> - unsigned long state;
> - int ub_number;
> -
> - struct mutex mutex;
> -
> - spinlock_t mm_lock;
> - struct mm_struct *mm;
> -
> - struct ublk_params params;
> -
> - struct completion completion;
> - unsigned int nr_queues_ready;
> - unsigned int nr_privileged_daemon;
> -
> - /*
> - * Our ubq->daemon may be killed without any notification, so
> - * monitor each queue's daemon periodically
> - */
> - struct delayed_work monitor_work;
> - struct work_struct quiesce_work;
> - struct work_struct stop_work;
> -};
> -
> /* header of ublk_params */
> struct ublk_params_header {
> __u32 len;
> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
> new file mode 100644
> index 000000000000..f81e62256456
> --- /dev/null
> +++ b/drivers/block/ublk_drv.h
> @@ -0,0 +1,55 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _UBLK_DRV_H
> +#define _UBLK_DRV_H
> +
> +#include <uapi/linux/ublk_cmd.h>
> +#include <linux/blk-mq.h>
> +#include <linux/cdev.h>
> +
> +struct ublk_device {
> + struct gendisk *ub_disk;
> +
> + char *__queues;
> +
> + unsigned int queue_size;
> + struct ublksrv_ctrl_dev_info dev_info;
> +
> + struct blk_mq_tag_set tag_set;
> +
> + struct cdev cdev;
> + struct device cdev_dev;
> +
> +#define UB_STATE_OPEN 0
> +#define UB_STATE_USED 1
> +#define UB_STATE_DELETED 2
> + unsigned long state;
> + int ub_number;
> +
> + struct mutex mutex;
> +
> + spinlock_t mm_lock;
> + struct mm_struct *mm;
> +
> + struct ublk_params params;
> +
> + struct completion completion;
> + unsigned int nr_queues_ready;
> + unsigned int nr_privileged_daemon;
> +
> + /*
> + * Our ubq->daemon may be killed without any notification, so
> + * monitor each queue's daemon periodically
> + */
> + struct delayed_work monitor_work;
> + struct work_struct quiesce_work;
> + struct work_struct stop_work;
> +};
> +
> +struct ublk_rq_data {
> + struct llist_node node;
> +
> + struct kref ref;
> +};
> +
> +#endif
LGTM
Regards,
Aravind
> From: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
>
>
> This change is in preparation for zoned storage support.
>
>
> Signed-off-by: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
> ---
> include/uapi/linux/ublk_cmd.h | 23 +++++++++++++++++------
> 1 file changed, 17 insertions(+), 6 deletions(-)
>
>
> diff --git a/include/uapi/linux/ublk_cmd.h
> b/include/uapi/linux/ublk_cmd.h
> index 4b8558db90e1..471b3b983045 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -229,12 +229,23 @@ struct ublksrv_ctrl_dev_info {
> __u64 reserved2;
> };
>
>
> -#define UBLK_IO_OP_READ 0
> -#define UBLK_IO_OP_WRITE 1
> -#define UBLK_IO_OP_FLUSH 2
> -#define UBLK_IO_OP_DISCARD 3
> -#define UBLK_IO_OP_WRITE_SAME 4
> -#define UBLK_IO_OP_WRITE_ZEROES 5
> +enum ublk_op {
> + UBLK_IO_OP_READ = 0,
> + UBLK_IO_OP_WRITE = 1,
> + UBLK_IO_OP_FLUSH = 2,
> + UBLK_IO_OP_DISCARD = 3,
> + UBLK_IO_OP_WRITE_SAME = 4,
> + UBLK_IO_OP_WRITE_ZEROES = 5,
> + UBLK_IO_OP_ZONE_OPEN = 10,
> + UBLK_IO_OP_ZONE_CLOSE = 11,
> + UBLK_IO_OP_ZONE_FINISH = 12,
> + UBLK_IO_OP_ZONE_APPEND = 13,
Curious to know if there is a reason for skipping enum value 14 here?
And if UBLK_IO_OP_ZONE_APPEND is defined along with other operations
better to define that in patch 3 itself.
> + UBLK_IO_OP_ZONE_RESET = 15,
> + __UBLK_IO_OP_DRV_IN_START = 32,
> + __UBLK_IO_OP_DRV_IN_END = 96,
> + __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
> + __UBLK_IO_OP_DRV_OUT_END = 160,
> +};
>
>
> #define UBLK_IO_F_FAILFAST_DEV (1U << 8)
> #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9)
Regards,
Aravind
> From: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
>
>
> Add zone append feature to ublk. This feature uses the `addr` field of
> `struct
checkpatch.pl warns about keeping the lines within 75 characters.
WARNING: Possible unwrapped commit description (prefer a maximum 75
chars per line)
> ublksrv_io_cmd`. Therefore ublk must be used with the user copy
> feature (UBLK_F_USER_COPY) for zone append to be available. Without
> this
> feature, ublk will fail zone append requests.
>
>
> Suggested-by: Ming Lei <[email protected]
> <mailto:[email protected]>>
> Signed-off-by: Andreas Hindborg <[email protected]
> <mailto:[email protected]>>
> ---
> drivers/block/ublk_drv-zoned.c | 11 +++++++++
> drivers/block/ublk_drv.c | 43 ++++++++++++++++++++++++++++++----
> drivers/block/ublk_drv.h | 1 +
> include/uapi/linux/ublk_cmd.h | 3 ++-
> 4 files changed, 52 insertions(+), 6 deletions(-)
>
>
> diff --git a/drivers/block/ublk_drv-zoned.c
> b/drivers/block/ublk_drv-zoned.c
> index ea86bf4b3681..007e8fc7ff25 100644
> --- a/drivers/block/ublk_drv-zoned.c
> +++ b/drivers/block/ublk_drv-zoned.c
> @@ -16,6 +16,16 @@ void ublk_set_nr_zones(struct ublk_device *ub)
> ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
> }
>
>
> +int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
> +{
> + const struct ublk_param_zoned *p = &ub->params.zoned;
> +
> + if (! p->max_zone_append_sectors)
checkpatch.pl errors out here
ERROR: space prohibited after that '!' (ctx:BxW)
if (!p->max_zone_append_sectors)
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> void ublk_dev_param_zoned_apply(struct ublk_device *ub)
> {
> const struct ublk_param_zoned *p = &ub->params.zoned;
> @@ -23,6 +33,7 @@ void ublk_dev_param_zoned_apply(struct ublk_device
> *ub)
> if (ub->dev_info.flags & UBLK_F_ZONED) {
> disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
> disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
> + blk_queue_max_zone_append_sectors(ub->ub_disk->queue,
> p->max_zone_append_sectors);
> }
> }
>
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 88fa39853c61..6a949669b47e 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -107,6 +107,11 @@ struct ublk_uring_cmd_pdu {
> */
> #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
>
>
> +/*
> + * Set when IO is Zone Append
> + */
> +#define UBLK_IO_FLAG_ZONE_APPEND 0x10
> +
> struct ublk_io {
> /* userspace buffer address from io cmd */
> __u64 addr;
> @@ -230,6 +235,8 @@ static void ublk_dev_param_discard_apply(struct
> ublk_device *ub)
>
>
> static int ublk_validate_params(const struct ublk_device *ub)
> {
> + int ret = 0;
> +
> /* basic param is the only one which must be set */
> if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
> const struct ublk_param_basic *p = &ub->params.basic;
> @@ -260,6 +267,13 @@ static int ublk_validate_params(const struct
> ublk_device *ub)
> if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
> return -EINVAL;
>
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + (ub->params.types & UBLK_PARAM_TYPE_ZONED)) {
> + ret = ublk_dev_param_zoned_validate(ub);
> + if (ret)
> + return ret;
> + }
> +
> return 0;
> }
>
>
> @@ -690,6 +704,11 @@ static blk_status_t ublk_setup_iod(struct
> ublk_queue *ubq, struct request *req)
> return BLK_STS_IOERR;
> }
> case REQ_OP_ZONE_APPEND:
> + if (!(ubq->dev->dev_info.flags & UBLK_F_USER_COPY))
> + return BLK_STS_IOERR;
> + ublk_op = UBLK_IO_OP_ZONE_APPEND;
> + io->flags |= UBLK_IO_FLAG_ZONE_APPEND;
> + break;
> case REQ_OP_ZONE_RESET_ALL:
> case REQ_OP_DRV_OUT:
> /* We do not support zone append or reset_all yet */
> @@ -1112,6 +1131,12 @@ static void ublk_commit_completion(struct
> ublk_device *ub,
> /* find the io request and complete */
> req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>
>
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
> + if (io->flags & UBLK_IO_FLAG_ZONE_APPEND)
> + req->__sector = ub_cmd->addr;
> + io->flags &= ~UBLK_IO_FLAG_ZONE_APPEND;
> + }
> +
> if (req && likely(!blk_should_fake_timeout(req->q)))
> ublk_put_req_ref(ubq, req);
> }
> @@ -1411,7 +1436,8 @@ static int __ublk_ch_uring_cmd(struct
> io_uring_cmd *cmd,
> ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
> goto out;
>
>
> - if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
> + if (ublk_support_user_copy(ubq) &&
> + !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
> ret = -EINVAL;
> goto out;
> }
> @@ -1534,11 +1560,14 @@ static inline bool ublk_check_ubuf_dir(const
> struct request *req,
> int ubuf_dir)
> {
> /* copy ubuf to request pages */
> - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
> + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
> + ubuf_dir == ITER_SOURCE)
> return true;
>
>
> /* copy request pages to ubuf */
> - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
> + if ((req_op(req) == REQ_OP_WRITE ||
> + req_op(req) == REQ_OP_ZONE_APPEND) &&
> + ubuf_dir == ITER_DEST)
> return true;
>
>
> return false;
> @@ -1867,6 +1896,12 @@ static int ublk_ctrl_start_dev(struct
> ublk_device *ub, struct io_uring_cmd *cmd)
> ub->dev_info.ublksrv_pid = ublksrv_pid;
> ub->ub_disk = disk;
>
>
> + ub->dev_info.state = UBLK_S_DEV_LIVE;
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> + ub->dev_info.flags & UBLK_F_ZONED) {
> + disk_set_zoned(disk, BLK_ZONED_HM);
> + }
> +
> ret = ublk_apply_params(ub);
> if (ret)
> goto out_put_disk;
> @@ -1877,7 +1912,6 @@ static int ublk_ctrl_start_dev(struct
> ublk_device *ub, struct io_uring_cmd *cmd)
>
>
> if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> ub->dev_info.flags & UBLK_F_ZONED) {
> - disk_set_zoned(disk, BLK_ZONED_HM);
> blk_queue_required_elevator_features(disk->queue,
> ELEVATOR_F_ZBD_SEQ_WRITE);
> ret = ublk_revalidate_disk_zones(disk);
> if (ret)
> @@ -1885,7 +1919,6 @@ static int ublk_ctrl_start_dev(struct
> ublk_device *ub, struct io_uring_cmd *cmd)
> }
>
>
> get_device(&ub->cdev_dev);
> - ub->dev_info.state = UBLK_S_DEV_LIVE;
> ret = add_disk(disk);
> if (ret) {
> /*
> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
> index 7242430fd6b9..f55e1c25531d 100644
> --- a/drivers/block/ublk_drv.h
> +++ b/drivers/block/ublk_drv.h
> @@ -56,6 +56,7 @@ struct ublk_rq_data {
> };
>
>
> void ublk_set_nr_zones(struct ublk_device *ub);
> +int ublk_dev_param_zoned_validate(const struct ublk_device *ub);
> void ublk_dev_param_zoned_apply(struct ublk_device *ub);
> int ublk_revalidate_disk_zones(struct gendisk *disk);
>
>
> diff --git a/include/uapi/linux/ublk_cmd.h
> b/include/uapi/linux/ublk_cmd.h
> index 436525afffe8..4b6ad0b28598 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -351,7 +351,8 @@ struct ublk_param_devt {
> struct ublk_param_zoned {
> __u32 max_open_zones;
> __u32 max_active_zones;
> - __u8 reserved[24];
> + __u32 max_zone_append_sectors;
> + __u8 reserved[20];
> };
>
>
> struct ublk_params {
Regards,
Aravind
Damien Le Moal <[email protected]> writes:
> On 6/29/23 16:50, Andreas Hindborg (Samsung) wrote:
>>>> UCLINUX (M68KNOMMU AND COLDFIRE)
>>>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>>>> index 5b9d4aaebb81..c58dfd035557 100644
>>>> --- a/drivers/block/Kconfig
>>>> +++ b/drivers/block/Kconfig
>>>> @@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
>>>> suggested to enable N if your application(ublk server) switches to
>>>> ioctl command encoding.
>>>>
>>>> +config BLK_DEV_UBLK_ZONED
>>>> + def_bool y
>>>
>>> This can be "bool" only.
>>
>> I did it like this because I wanted BLK_DEV_UBLK_ZONED to be set
>> automatically to "y" when BLK_DEV_UBLK and BLK_DEV_ZONED are enabled.
>> BLK_DEV_UBLK_ZONED has no menu entry. If we change it to "bool" it will
>> be default "n" and we would have to make it appear in menuconfig to be
>> manually enabled.
>>
>> Isn't it more sane if it is just unconditionally enabled when
>> BLK_DEV_ZONED and BLK_DEV_UBLK is enabled?
>
> Maybe something like this then:
>
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 5b9d4aaebb81..105e1121bf5f 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -370,9 +370,13 @@ config BLK_DEV_RBD
>
> If unsure, say N.
>
> +config BLK_DEV_UBLK_ZONED
> + bool
> +
> config BLK_DEV_UBLK
> tristate "Userspace block driver (Experimental)"
> select IO_URING
> + select BLK_DEV_UBLK_ZONED if BLK_DEV_ZONED
> help
> io_uring based userspace block driver. Together with ublk server, ublk
> has been working well, but interface with userspace or command data
OK.
>>>> + ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>>>> +}
>>>> +
>>>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>>>> +{
>>>> + const struct ublk_param_zoned *p = &ub->params.zoned;
>>>> +
>>>> + if (ub->dev_info.flags & UBLK_F_ZONED) {
>>>> + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>>>> + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>>>
>>> You do not need to check if the max_active_zones and max_open_zones values are
>>> sensible ? E.g. what if they are larger than the number of zones ?
>>
>> I don't think it will be a problem. I assume you can set them to 0 to
>> indicate infinite. If user space sets them to more than the actual
>> number of available zones (that user space set up also), user space will
>> have to handle that when it gets a zone request for a zone outside the
>> logical drive LBA space. I don't think the kernel has to care. Will this
>> break anything kernel side?
>
> zonefs and btrfs look at these limits. So I would rather prefer seeing sensible
> values. 0 == no limit is fine. But seeing maz or moz > nr_zones for the device
> is just too strange and I would prefer an error in that case to let the user
> know it is not doing something right. Getting moz > maz is also nonsense that
> needs to be checked. There is one case we could silently change: if maz ==
> nr_zones or moz == nr_zones, then that essentially means "no limit", so we could
> use 0 to let the in-kernel users know that they do not need to care about the limits.
OK, will check.
>
>>
>>>
>>>> + }
>>>> +}
>>>> +
>>>> +int ublk_revalidate_disk_zones(struct gendisk *disk)
>>>> +{
>>>> + return blk_revalidate_disk_zones(disk, NULL);
>>>> +}
>>>
>>> I do not think this helper is needed at all (see below comment on the call site).
>>
>> This is required because the prototype for `blk_revalidate_disk_zones()`
>> is not defined when BLK_DEV_ZONED is not defined. Without this helper
>> for which the prototype is always defined, we will have a compile error
>> at the call site.
>
> You have this hunk in ublk_ctrl_start_dev:
>
> disk_set_zoned(disk, BLK_ZONED_HM);
> blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
> ret = ublk_revalidate_disk_zones(disk);
> if (ret)
> goto out_put_disk;
>
> And that is called under "if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)". So I do not see
> how you can get compile errors using directly blk_revalidate_disk_zones(). The
> entire hunk above should be a helper function I think.
The prototype for `disk_set_zoned()` is defined independent of
CONFIG_BLK_DEV_ZONED. The prototype for `blk_revalidate_disk_zones()` is
only defined when CONFIG_BLK_DEV_ZONED is enabled. It is not the same.
We can't have a call to `blk_revalidate_disk_zones()` in a translation
unit when the prototype is not defined, even behind a `if
(IS_ENABLED())` guard. It would be fine behind an `#ifdef` though.
If we move the entire hunk to a helper for which the prototype is always
defined, we should be able to call `blk_revalidate_disk_zones()`
directly. I'll see if I can do that.
>>>> +
>>>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>>>> + ublk_set_nr_zones(ub);
>>>
>>> So if the user is attempting to setup a zoned drive but the kernel does not have
>>> CONFIG_BLK_DEV_ZONED=y, the user setup will be silently ignored. Not exactly
>>> nice I think, unless I am missing something.
>>
>> We have this in `ublk_ctrl_add_dev()`:
>>
>> if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> ub->dev_info.flags &= ~UBLK_F_ZONED;
>>
>> User space is supposed to check the flags after the call to know the
>> kernel capabilities.
>
> And you trust user space to be always correct ? I never do :)
> So definitely, check and error if not supported. No silently clearing the device
> zoned flag.
>
I'll make it fail instead of just clearing the flag.
>
>>> Also, repeating that "if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))" for all zone
>>> related functions is very verbose. Stub the functions in ublk_drv.h. That will
>>> make the main C code lighter.
>>
>> Not sure what you mean "stub the functions". Like so:
>>
>> @@ -216,8 +216,7 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>>
>> set_capacity(ub->ub_disk, p->dev_sectors);
>>
>> - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> - ublk_set_nr_zones(ub);
>> + ublk_config_nr_zones(ub);
>> }
>>
>> And then in the header:
>>
>> void ublk_config_nr_zones(struct ublk_device *ub)
>> {
>> if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> ublk_set_nr_zones(ub);
>> }
>>
>> Or something else?
>
> #ifndef CONFIG_BLK_DEV_ZONED
> static inline void ublk_set_nr_zones() {}
> #endif
>
> That avoids repeating that if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) all over the
> place. The stubbed functions can be called only for "if (ub->dev_info.flags &
> UBLK_F_ZONED", which is OK since this flag is never supposed to be accepted and
> set for the CONFIG_BLK_DEV_ZONED=n case.
Alright.
>
>>>> + disk_set_zoned(disk, BLK_ZONED_HM);
>>>> + blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
>>>> + ret = ublk_revalidate_disk_zones(disk);
>>>> + if (ret)
>>>> + goto out_put_disk;
>>>
>>> This should be all a helper ublk_set_zoned() or something.
>>
>> Ok. Unfortunately this block of code is split up in the zone append
>> patch. I will see what I can do, maybe 2 functions.
>
> Beware that I just posted patches that require zone size (chunk_sectors limit)
> and max append sectors limit to be set *before* calling
> blk_revalidate_disk_zones(). Otherwise, you will get an error.
Got it, thanks.
>
>>>> get_device(&ub->cdev_dev);
>>>> ub->dev_info.state = UBLK_S_DEV_LIVE;
>>>> ret = add_disk(disk);
>>>> @@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>>>> if (ub->dev_info.flags & UBLK_F_USER_COPY)
>>>> ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
>>>>
>>>> + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>>>> + ub->dev_info.flags &= ~UBLK_F_ZONED;
>>>
>>> Arg, no. The user should be notified with an error that he/she is attempting to
>>> create a zoned device that cannot be supported.
>>
>> As I wrote above, this is part of the ublk uapi. This is the way user
>> space does kernel capability check. User space will check the flags when
>> this call returns. @Ming did I understand this correctly or did I miss something?
>
> OK. So this is like a virtio capability flags thing ? Negotiation between device
> and driver ?
I'm not familiar with that one, but I think you are right we should fail
the request here. It makes no sense to configure a regular ublk device if
user space wants to set up a zoned one. Failing is more sensible.
>
>>>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>>>> index 471b3b983045..436525afffe8 100644
>>>> --- a/include/uapi/linux/ublk_cmd.h
>>>> +++ b/include/uapi/linux/ublk_cmd.h
>>>> @@ -176,6 +176,11 @@
>>>> /* Copy between request and user buffer by pread()/pwrite() */
>>>> #define UBLK_F_USER_COPY (1UL << 7)
>>>>
>>>> +/*
>>>> + * Enable zoned device support
>>>
>>> Isn't this for "Indicate that the device is zoned" ?
>>
>> User space sets this flag when setting up the device to enable zoned
>> storage support. Kernel may reject it. I think the wording is
>> sufficient. Could be updated to:
>>
>> "User space sets this flag when setting up the device to request zoned
>> storage support. Kernel may deny the request by clearing the flag
>> during setup."
>
> I really have a big problem with this. If the user driver does not check that
> the flag was cleared, the user will assume that the device is zoned, but it is
> not. This is not super nice... I would really prefer just failing the device
> creation if zoned was requested but cannot be satisfied, either because the
> kernel does not support zoned block devices or when the user specified something
> nonsensical.
Changed to:
/*
* User space sets this flag when setting up the device to request zoned storage support. Kernel may
* deny the request by returning an error.
*/
#define UBLK_F_ZONED (1ULL << 8)
Best regards,
Andreas
[email protected] writes:
>> On 29/06/23, 12:37 AM, "Andreas Hindborg" <[email protected]
>> <mailto:[email protected]>> wrote:
>> From: Andreas Hindborg <[email protected]
>> <mailto:[email protected]>>
>> Hi All,
>> This patch set adds zoned storage support to `ublk`. The first two patches
>> does
>> some house cleaning in preparation for the last two patches. The third patch
>> adds support for report_zones and the following operations:
>>
>
> Just to clarify, we do need your ublk user space patches
> to create a ublk device node (with these patches in kernel), right ?
I provide an example implementation. I put a link in the cover letter,
but I think the sentence referring to the link got lost, thanks for
pointing out.
https://github.com/metaspace/ubdsrv/commit/7de0d901c329fde7dc5a2e998952dd88bf5e668b
This implementation is based on Ming's ubdsrv code. You do not need to
use this one, you can write your own. I am also working on an
implementation in Rust, but that is still very early. I think Ming is
also writing a Rust library for user space ublk devices.
But currently my patched ubdsrv is the only user space implementation
supporting zoned ublk devices (with the loop and null targets).
>
>> - REQ_OP_ZONE_OPEN
>> - REQ_OP_ZONE_CLOSE
>> - REQ_OP_ZONE_FINISH
>> - REQ_OP_ZONE_RES
>
> REQ_OP_ZONE_RESET
Thanks!
>
>> The last patch adds support for REQ_OP_ZONE_APPEND.
>> v3 [2] -> v4 changes:
>> - Split up v3 patches
>> - Add zone append support
>> - Change order of variables in `ublk_report_zones`
>> Read/write and zone operations are tested with zenfs [3].
>> The zone append path is tested with fio -> zonefs -> ublk -> null_blk.
>> The implementation of zone append requires ublk user copy feature, and
>> therefore
>> the series is based on branch for-next (6afa337a3789) of [4].
>> [1]
>> https://github.com/metaspace/ubdsrv/commit/7de0d901c329fde7dc5a2e998952dd88bf5e668b
>> <https://github.com/metaspace/ubdsrv/commit/7de0d901c329fde7dc5a2e998952dd88bf5e668b>
>> [2]
>> https://lore.kernel.org/linux-block/[email protected]
>> <mailto:[email protected]>/
>> [3] https://github.com/westerndigitalcorporation/zenfs
>> <https://github.com/westerndigitalcorporation/zenfs>
>> [4] https://git.kernel.dk/linux.git <https://git.kernel.dk/linux.git>
>> Andreas Hindborg (4):
>> ublk: change ublk IO command defines to enum
>> ublk: move types to shared header file
>> ublk: enable zoned storage support
>> ublk: add zone append
>> MAINTAINERS | 2 +
>> drivers/block/Kconfig | 4 +
>> drivers/block/Makefile | 4 +-
>> drivers/block/ublk_drv-zoned.c | 155 +++++++++++++++++++++++++++++++++
>> drivers/block/ublk_drv.c | 150 +++++++++++++++++++------------
>> drivers/block/ublk_drv.h | 71 +++++++++++++++
>> include/uapi/linux/ublk_cmd.h | 38 ++++++--
>> 7 files changed, 363 insertions(+), 61 deletions(-)
>> create mode 100644 drivers/block/ublk_drv-zoned.c
>> create mode 100644 drivers/block/ublk_drv.h
>> base-commit: 3261ea42710e9665c9151006049411bd23b5411f
>
> Regards,
> Aravind
[email protected] writes:
>> From: Andreas Hindborg <[email protected]
>> <mailto:[email protected]>>
>> This change is in preparation for zoned storage support.
>> Signed-off-by: Andreas Hindborg <[email protected]
>> <mailto:[email protected]>>
>> ---
>> include/uapi/linux/ublk_cmd.h | 23 +++++++++++++++++------
>> 1 file changed, 17 insertions(+), 6 deletions(-)
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 4b8558db90e1..471b3b983045 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -229,12 +229,23 @@ struct ublksrv_ctrl_dev_info {
>> __u64 reserved2;
>> };
>> -#define UBLK_IO_OP_READ 0
>> -#define UBLK_IO_OP_WRITE 1
>> -#define UBLK_IO_OP_FLUSH 2
>> -#define UBLK_IO_OP_DISCARD 3
>> -#define UBLK_IO_OP_WRITE_SAME 4
>> -#define UBLK_IO_OP_WRITE_ZEROES 5
>> +enum ublk_op {
>> + UBLK_IO_OP_READ = 0,
>> + UBLK_IO_OP_WRITE = 1,
>> + UBLK_IO_OP_FLUSH = 2,
>> + UBLK_IO_OP_DISCARD = 3,
>> + UBLK_IO_OP_WRITE_SAME = 4,
>> + UBLK_IO_OP_WRITE_ZEROES = 5,
>> + UBLK_IO_OP_ZONE_OPEN = 10,
>> + UBLK_IO_OP_ZONE_CLOSE = 11,
>> + UBLK_IO_OP_ZONE_FINISH = 12,
>> + UBLK_IO_OP_ZONE_APPEND = 13,
>
> Curious to know if there is a reason to miss enum 14 here ?
> And if UBLK_IO_OP_ZONE_APPEND is defined along with other operations
> better to define that in patch 3 itself.
They are defined after REQ_OP_ZONE_*, and they have a hole at 14.
BR Andreas
>
>> + UBLK_IO_OP_ZONE_RESET = 15,
>> + __UBLK_IO_OP_DRV_IN_START = 32,
>> + __UBLK_IO_OP_DRV_IN_END = 96,
>> + __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
>> + __UBLK_IO_OP_DRV_OUT_END = 160,
>> +};
>> #define UBLK_IO_F_FAILFAST_DEV (1U << 8)
>> #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9)
>
> Regards,
> Aravind
[email protected] writes:
>> From: Andreas Hindborg <[email protected]
>> <mailto:[email protected]>>
>> Add zone append feature to ublk. This feature uses the `addr` field of `struct
>
> checkpatch.pl warns on the keeping the line within 75 characters.
> WARNING: Possible unwrapped commit description (prefer a maximum 75 chars per
> line)
Thanks, I will make sure to rerun the checker. I thought I ran it
already.
BR Andreas
>
>> ublksrv_io_cmd`. Therefore ublk must be used with the user copy
>> feature (UBLK_F_USER_COPY) for zone append to be available. Without this
>> feature, ublk will fail zone append requests.
>> Suggested-by: Ming Lei <[email protected] <mailto:[email protected]>>
>> Signed-off-by: Andreas Hindborg <[email protected]
>> <mailto:[email protected]>>
>> ---
>> drivers/block/ublk_drv-zoned.c | 11 +++++++++
>> drivers/block/ublk_drv.c | 43 ++++++++++++++++++++++++++++++----
>> drivers/block/ublk_drv.h | 1 +
>> include/uapi/linux/ublk_cmd.h | 3 ++-
>> 4 files changed, 52 insertions(+), 6 deletions(-)
>> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
>> index ea86bf4b3681..007e8fc7ff25 100644
>> --- a/drivers/block/ublk_drv-zoned.c
>> +++ b/drivers/block/ublk_drv-zoned.c
>> @@ -16,6 +16,16 @@ void ublk_set_nr_zones(struct ublk_device *ub)
>> ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>> }
>> +int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
>> +{
>> + const struct ublk_param_zoned *p = &ub->params.zoned;
>> +
>> + if (! p->max_zone_append_sectors)
>
> checkpatch.pl errors out here
> ERROR: space prohibited after that '!' (ctx:BxW)
> if (!p->max_zone_append_sectors)
>
>> + return -EINVAL;
>> +
>> + return 0;
>> +}
>> +
>> void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> {
>> const struct ublk_param_zoned *p = &ub->params.zoned;
>> @@ -23,6 +33,7 @@ void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> if (ub->dev_info.flags & UBLK_F_ZONED) {
>> disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>> disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>> + blk_queue_max_zone_append_sectors(ub->ub_disk->queue,
>> p->max_zone_append_sectors);
>> }
>> }
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index 88fa39853c61..6a949669b47e 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -107,6 +107,11 @@ struct ublk_uring_cmd_pdu {
>> */
>> #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
>> +/*
>> + * Set when IO is Zone Append
>> + */
>> +#define UBLK_IO_FLAG_ZONE_APPEND 0x10
>> +
>> struct ublk_io {
>> /* userspace buffer address from io cmd */
>> __u64 addr;
>> @@ -230,6 +235,8 @@ static void ublk_dev_param_discard_apply(struct
>> ublk_device *ub)
>> static int ublk_validate_params(const struct ublk_device *ub)
>> {
>> + int ret = 0;
>> +
>> /* basic param is the only one which must be set */
>> if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
>> const struct ublk_param_basic *p = &ub->params.basic;
>> @@ -260,6 +267,13 @@ static int ublk_validate_params(const struct
>> ublk_device *ub)
>> if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
>> return -EINVAL;
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> + (ub->params.types & UBLK_PARAM_TYPE_ZONED)) {
>> + ret = ublk_dev_param_zoned_validate(ub);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> return 0;
>> }
>> @@ -690,6 +704,11 @@ static blk_status_t ublk_setup_iod(struct
>> ublk_queue *ubq, struct request *req)
>> return BLK_STS_IOERR;
>> }
>> case REQ_OP_ZONE_APPEND:
>> + if (!(ubq->dev->dev_info.flags & UBLK_F_USER_COPY))
>> + return BLK_STS_IOERR;
>> + ublk_op = UBLK_IO_OP_ZONE_APPEND;
>> + io->flags |= UBLK_IO_FLAG_ZONE_APPEND;
>> + break;
>> case REQ_OP_ZONE_RESET_ALL:
>> case REQ_OP_DRV_OUT:
>> /* We do not support zone append or reset_all yet */
>> @@ -1112,6 +1131,12 @@ static void ublk_commit_completion(struct
>> ublk_device *ub,
>> /* find the io request and complete */
>> req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
>> + if (io->flags & UBLK_IO_FLAG_ZONE_APPEND)
>> + req->__sector = ub_cmd->addr;
>> + io->flags &= ~UBLK_IO_FLAG_ZONE_APPEND;
>> + }
>> +
>> if (req && likely(!blk_should_fake_timeout(req->q)))
>> ublk_put_req_ref(ubq, req);
>> }
>> @@ -1411,7 +1436,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>> ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
>> goto out;
>> - if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
>> + if (ublk_support_user_copy(ubq) &&
>> + !(io->flags & UBLK_IO_FLAG_ZONE_APPEND) && ub_cmd->addr) {
>> ret = -EINVAL;
>> goto out;
>> }
>> @@ -1534,11 +1560,14 @@ static inline bool ublk_check_ubuf_dir(const
>> struct request *req,
>> int ubuf_dir)
>> {
>> /* copy ubuf to request pages */
>> - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
>> + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
>> + ubuf_dir == ITER_SOURCE)
>> return true;
>> /* copy request pages to ubuf */
>> - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
>> + if ((req_op(req) == REQ_OP_WRITE ||
>> + req_op(req) == REQ_OP_ZONE_APPEND) &&
>> + ubuf_dir == ITER_DEST)
>> return true;
>> return false;
>> @@ -1867,6 +1896,12 @@ static int ublk_ctrl_start_dev(struct
>> ublk_device *ub, struct io_uring_cmd *cmd)
>> ub->dev_info.ublksrv_pid = ublksrv_pid;
>> ub->ub_disk = disk;
>> + ub->dev_info.state = UBLK_S_DEV_LIVE;
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> + ub->dev_info.flags & UBLK_F_ZONED) {
>> + disk_set_zoned(disk, BLK_ZONED_HM);
>> + }
>> +
>> ret = ublk_apply_params(ub);
>> if (ret)
>> goto out_put_disk;
>> @@ -1877,7 +1912,6 @@ static int ublk_ctrl_start_dev(struct
>> ublk_device *ub, struct io_uring_cmd *cmd)
>> if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> ub->dev_info.flags & UBLK_F_ZONED) {
>> - disk_set_zoned(disk, BLK_ZONED_HM);
>> blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
>> ret = ublk_revalidate_disk_zones(disk);
>> if (ret)
>> @@ -1885,7 +1919,6 @@ static int ublk_ctrl_start_dev(struct
>> ublk_device *ub, struct io_uring_cmd *cmd)
>> }
>> get_device(&ub->cdev_dev);
>> - ub->dev_info.state = UBLK_S_DEV_LIVE;
>> ret = add_disk(disk);
>> if (ret) {
>> /*
>> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
>> index 7242430fd6b9..f55e1c25531d 100644
>> --- a/drivers/block/ublk_drv.h
>> +++ b/drivers/block/ublk_drv.h
>> @@ -56,6 +56,7 @@ struct ublk_rq_data {
>> };
>> void ublk_set_nr_zones(struct ublk_device *ub);
>> +int ublk_dev_param_zoned_validate(const struct ublk_device *ub);
>> void ublk_dev_param_zoned_apply(struct ublk_device *ub);
>> int ublk_revalidate_disk_zones(struct gendisk *disk);
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 436525afffe8..4b6ad0b28598 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -351,7 +351,8 @@ struct ublk_param_devt {
>> struct ublk_param_zoned {
>> __u32 max_open_zones;
>> __u32 max_active_zones;
>> - __u8 reserved[24];
>> + __u32 max_zone_append_sectors;
>> + __u8 reserved[20];
>> };
>> struct ublk_params {
>
> Regards,
> Aravind
[email protected] writes:
>> From: Andreas Hindborg <[email protected]
>> <mailto:[email protected]>>
>> Add zoned storage support to ublk: report_zones and operations:
>> - REQ_OP_ZONE_OPEN
>> - REQ_OP_ZONE_CLOSE
>> - REQ_OP_ZONE_FINISH
>> - REQ_OP_ZONE_RESET
>> Note: This commit changes the ublk kernel module name from `ublk_drv.ko` to
>> `ublk.ko` in order to link multiple translation units into the module.
>> Signed-off-by: Andreas Hindborg <[email protected]
>> <mailto:[email protected]>>
>> ---
>> MAINTAINERS | 1 +
>> drivers/block/Kconfig | 4 +
>> drivers/block/Makefile | 4 +-
>> drivers/block/ublk_drv-zoned.c | 144 +++++++++++++++++++++++++++++++++
>> drivers/block/ublk_drv.c | 64 +++++++++++++--
>> drivers/block/ublk_drv.h | 15 ++++
>> include/uapi/linux/ublk_cmd.h | 14 ++++
>> 7 files changed, 239 insertions(+), 7 deletions(-)
>> create mode 100644 drivers/block/ublk_drv-zoned.c
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index ace71c90751c..db8a8deb5926 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -21555,6 +21555,7 @@ S: Maintained
>> F: Documentation/block/ublk.rst
>> F: drivers/block/ublk_drv.c
>> F: drivers/block/ublk_drv.h
>> +F: drivers/block/ublk_drv-zoned.c
>
> nitpick: checkpatch.pl gives a warning on the ordering of the files.
> WARNING: Misordered MAINTAINERS entry - list file patterns in alphabetic order
>
>> F: include/uapi/linux/ublk_cmd.h
>> UCLINUX (M68KNOMMU AND COLDFIRE)
>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>> index 5b9d4aaebb81..c58dfd035557 100644
>> --- a/drivers/block/Kconfig
>> +++ b/drivers/block/Kconfig
>> @@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
>> suggested to enable N if your application(ublk server) switches to
>> ioctl command encoding.
>> +config BLK_DEV_UBLK_ZONED
>> + def_bool y
>> + depends on BLK_DEV_UBLK && BLK_DEV_ZONED
>> +
>> source "drivers/block/rnbd/Kconfig"
>> endif # BLK_DEV
>> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
>> index 101612cba303..bc1649e20ec2 100644
>> --- a/drivers/block/Makefile
>> +++ b/drivers/block/Makefile
>> @@ -37,6 +37,8 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
>> obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
>> -obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
>> +obj-$(CONFIG_BLK_DEV_UBLK) += ublk.o
>> +ublk-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
>> +ublk-$(CONFIG_BLK_DEV_UBLK_ZONED) += ublk_drv-zoned.o
>> swim_mod-y := swim.o swim_asm.o
>> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
>> new file mode 100644
>> index 000000000000..ea86bf4b3681
>> --- /dev/null
>> +++ b/drivers/block/ublk_drv-zoned.c
>> @@ -0,0 +1,144 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright 2023 Andreas Hindborg <[email protected]
>> <mailto:[email protected]>>
>> + */
>> +#include <linux/blkzoned.h>
>> +#include <linux/ublk_cmd.h>
>> +#include <linux/vmalloc.h>
>> +
>> +#include "ublk_drv.h"
>> +
>> +void ublk_set_nr_zones(struct ublk_device *ub)
>> +{
>> + const struct ublk_param_basic *p = &ub->params.basic;
>> +
>> + if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
>> + ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>> +}
>> +
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> +{
>> + const struct ublk_param_zoned *p = &ub->params.zoned;
>> +
>> + if (ub->dev_info.flags & UBLK_F_ZONED) {
>> + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>> + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>> + }
>> +}
>> +
>> +int ublk_revalidate_disk_zones(struct gendisk *disk)
>> +{
>> + return blk_revalidate_disk_zones(disk, NULL);
>> +}
>> +
>> +/* Based on virtblk_alloc_report_buffer */
>> +static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
>> + unsigned int nr_zones,
>> + unsigned int zone_sectors, size_t *buflen)
>> +{
>> + struct request_queue *q = ublk->ub_disk->queue;
>> + size_t bufsize;
>> + void *buf;
>> +
>> + nr_zones = min_t(unsigned int, nr_zones,
>> + get_capacity(ublk->ub_disk) >> ilog2(zone_sectors));
>> +
>> + bufsize = nr_zones * sizeof(struct blk_zone);
>> + bufsize =
>> + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
>> + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
>> +
>> + while (bufsize >= sizeof(struct blk_zone)) {
>> + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>> + if (buf) {
>> + *buflen = bufsize;
>> + return buf;
>> + }
>> + bufsize >>= 1;
>> + }
>> +
>> + bufsize = 0;
>> + return NULL;
>> +}
>> +
>> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> + unsigned int nr_zones, report_zones_cb cb, void *data)
>> +{
>> + struct ublk_device *ub = disk->private_data;
>> + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
>> + unsigned int first_zone = sector >> ilog2(zone_size_sectors);
>> + unsigned int done_zones = 0;
>> + unsigned int max_zones_per_request;
>> + struct blk_zone *buffer;
>> + size_t buffer_length;
>> +
>> + if (!(ub->dev_info.flags & UBLK_F_ZONED))
>> + return -EOPNOTSUPP;
>> +
>> + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
>> + nr_zones);
>> +
>> + buffer = ublk_alloc_report_buffer(ub, nr_zones, zone_size_sectors,
>> + &buffer_length);
>> + if (!buffer)
>> + return -ENOMEM;
>> +
>> + max_zones_per_request = buffer_length / sizeof(struct blk_zone);
>> +
>> + while (done_zones < nr_zones) {
>> + unsigned int remaining_zones = nr_zones - done_zones;
>> + unsigned int zones_in_request = min_t(
>> + unsigned int, remaining_zones, max_zones_per_request);
>> + int err = 0;
>> + struct request *req;
>> + struct ublk_rq_data *pdu;
>> + blk_status_t status;
>> +
>> + memset(buffer, 0, buffer_length);
>> +
>> + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
>> + if (IS_ERR(req))
>> + return PTR_ERR(req);
>> +
>> + pdu = blk_mq_rq_to_pdu(req);
>> + pdu->operation = UBLK_IO_OP_REPORT_ZONES;
>> + pdu->sector = sector;
>> + pdu->nr_sectors = remaining_zones * zone_size_sectors;
>> +
>> + err = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
>> + GFP_KERNEL);
>> + if (err) {
>> + blk_mq_free_request(req);
>> + kvfree(buffer);
>> + return err;
>> + }
>> +
>> + status = blk_execute_rq(req, 0);
>> + err = blk_status_to_errno(status);
>> + blk_mq_free_request(req);
>> + if (err) {
>> + kvfree(buffer);
>> + return err;
>> + }
>> +
>> + for (unsigned int i = 0; i < zones_in_request; i++) {
>> + struct blk_zone *zone = buffer + i;
>> +
>> + err = cb(zone, i, data);
>> + if (err)
>> + return err;
>> +
>> + done_zones++;
>> + sector += zone_size_sectors;
>> +
>> + /* A zero length zone means don't ask for more zones */
>> + if (!zone->len) {
>> + kvfree(buffer);
>> + return done_zones;
>> + }
>> + }
>> + }
>> +
>> + kvfree(buffer);
>> + return done_zones;
>> +}
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index e519dc0d9fe7..88fa39853c61 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -57,12 +57,13 @@
>> | UBLK_F_USER_RECOVERY_REISSUE \
>> | UBLK_F_UNPRIVILEGED_DEV \
>> | UBLK_F_CMD_IOCTL_ENCODE \
>> - | UBLK_F_USER_COPY)
>> + | UBLK_F_USER_COPY \
>> + | UBLK_F_ZONED)
>> /* All UBLK_PARAM_TYPE_* should be included here */
>> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
>> - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
>> -
>> +#define UBLK_PARAM_TYPE_ALL \
>> + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
>> + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
>> struct ublk_uring_cmd_pdu {
>> struct ublk_queue *ubq;
>> @@ -209,6 +210,9 @@ static void ublk_dev_param_basic_apply(struct
>> ublk_device *ub)
>> set_disk_ro(ub->ub_disk, true);
>> set_capacity(ub->ub_disk, p->dev_sectors);
>> +
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> + ublk_set_nr_zones(ub);
>> }
>> static void ublk_dev_param_discard_apply(struct ublk_device *ub)
>> @@ -269,6 +273,9 @@ static int ublk_apply_params(struct ublk_device *ub)
>> if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
>> ublk_dev_param_discard_apply(ub);
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types &
>> UBLK_PARAM_TYPE_ZONED))
>> + ublk_dev_param_zoned_apply(ub);
>> +
>> return 0;
>> }
>> @@ -439,6 +446,7 @@ static const struct block_device_operations ub_fops = {
>> .owner = THIS_MODULE,
>> .open = ublk_open,
>> .free_disk = ublk_free_disk,
>> + .report_zones = ublk_report_zones,
>> };
>> #define UBLK_MAX_PIN_PAGES 32
>> @@ -553,7 +561,8 @@ static inline bool ublk_need_map_req(const struct
>> request *req)
>> static inline bool ublk_need_unmap_req(const struct request *req)
>> {
>> - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
>> + return ublk_rq_has_data(req) &&
>> + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
>> }
>> static int ublk_map_io(const struct ublk_queue *ubq, const struct request
>> *req,
>> @@ -637,6 +646,7 @@ static blk_status_t ublk_setup_iod(struct
>> ublk_queue *ubq, struct request *req)
>> {
>> struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
>> struct ublk_io *io = &ubq->ios[req->tag];
>> + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
>> u32 ublk_op;
>> switch (req_op(req)) {
>> @@ -655,6 +665,35 @@ static blk_status_t ublk_setup_iod(struct
>> ublk_queue *ubq, struct request *req)
>> case REQ_OP_WRITE_ZEROES:
>> ublk_op = UBLK_IO_OP_WRITE_ZEROES;
>> break;
>> + case REQ_OP_ZONE_OPEN:
>> + ublk_op = UBLK_IO_OP_ZONE_OPEN;
>> + break;
>> + case REQ_OP_ZONE_CLOSE:
>> + ublk_op = UBLK_IO_OP_ZONE_CLOSE;
>> + break;
>> + case REQ_OP_ZONE_FINISH:
>> + ublk_op = UBLK_IO_OP_ZONE_FINISH;
>> + break;
>> + case REQ_OP_ZONE_RESET:
>> + ublk_op = UBLK_IO_OP_ZONE_RESET;
>> + break;
>> + case REQ_OP_DRV_IN:
>> + ublk_op = pdu->operation;
>> + switch (ublk_op) {
>> + case UBLK_IO_OP_REPORT_ZONES:
>> + iod->op_flags = ublk_op | ublk_req_build_flags(req);
>> + iod->nr_sectors = pdu->nr_sectors;
>> + iod->start_sector = pdu->sector;
>> + iod->addr = io->addr;
>> + return BLK_STS_OK;
>> + default:
>> + return BLK_STS_IOERR;
>> + }
>> + case REQ_OP_ZONE_APPEND:
>> + case REQ_OP_ZONE_RESET_ALL:
>> + case REQ_OP_DRV_OUT:
>> + /* We do not support zone append or reset_all yet */
>> + fallthrough;
>> default:
>> return BLK_STS_IOERR;
>> }
>> @@ -708,7 +747,8 @@ static inline void __ublk_complete_rq(struct request *req)
>> *
>> * Both the two needn't unmap.
>> */
>> - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
>> + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
>> + req_op(req) != REQ_OP_DRV_IN)
>> goto exit;
>> /* for READ request, writing data in iod->addr to rq buffers */
>> @@ -1835,6 +1875,15 @@ static int ublk_ctrl_start_dev(struct
>> ublk_device *ub, struct io_uring_cmd *cmd)
>> if (ub->nr_privileged_daemon != ub->nr_queues_ready)
>> set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> + ub->dev_info.flags & UBLK_F_ZONED) {
>> + disk_set_zoned(disk, BLK_ZONED_HM);
>> + blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
>> + ret = ublk_revalidate_disk_zones(disk);
>> + if (ret)
>> + goto out_put_disk;
>> + }
>> +
>> get_device(&ub->cdev_dev);
>> ub->dev_info.state = UBLK_S_DEV_LIVE;
>> ret = add_disk(disk);
>> @@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>> if (ub->dev_info.flags & UBLK_F_USER_COPY)
>> ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
>> + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> + ub->dev_info.flags &= ~UBLK_F_ZONED;
>> +
>> /* We are not ready to support zero copy */
>> ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
>> index f81e62256456..7242430fd6b9 100644
>> --- a/drivers/block/ublk_drv.h
>> +++ b/drivers/block/ublk_drv.h
>> @@ -50,6 +50,21 @@ struct ublk_rq_data {
>> struct llist_node node;
>> struct kref ref;
>> + enum ublk_op operation;
>> + __u64 sector;
>> + __u32 nr_sectors;
>> };
>> +void ublk_set_nr_zones(struct ublk_device *ub);
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
>> +int ublk_revalidate_disk_zones(struct gendisk *disk);
>> +
>> +#ifdef CONFIG_BLK_DEV_UBLK_ZONED
>> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> + unsigned int nr_zones, report_zones_cb cb,
>> + void *data);
>> +#else
>> +#define ublk_report_zones NULL
>> +#endif
>> +
>> #endif
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 471b3b983045..436525afffe8 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -176,6 +176,11 @@
>> /* Copy between request and user buffer by pread()/pwrite() */
>> #define UBLK_F_USER_COPY (1UL << 7)
>> +/*
>> + * Enable zoned device support
>> + */
>> +#define UBLK_F_ZONED (1ULL << 8)
>> +
>> /* device state */
>> #define UBLK_S_DEV_DEAD 0
>> #define UBLK_S_DEV_LIVE 1
>> @@ -242,6 +247,7 @@ enum ublk_op {
>> UBLK_IO_OP_ZONE_APPEND = 13,
>> UBLK_IO_OP_ZONE_RESET = 15,
>> __UBLK_IO_OP_DRV_IN_START = 32,
>> + UBLK_IO_OP_REPORT_ZONES = __UBLK_IO_OP_DRV_IN_START,
>> __UBLK_IO_OP_DRV_IN_END = 96,
>> __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
>> __UBLK_IO_OP_DRV_OUT_END = 160,
>> @@ -342,6 +348,12 @@ struct ublk_param_devt {
>> __u32 disk_minor;
>> };
>> +struct ublk_param_zoned {
>> + __u32 max_open_zones;
>> + __u32 max_active_zones;
>> + __u8 reserved[24];
>> +};
>> +
>> struct ublk_params {
>> /*
>> * Total length of parameters, userspace has to set 'len' for both
>> @@ -353,11 +365,13 @@ struct ublk_params {
>> #define UBLK_PARAM_TYPE_BASIC (1 << 0)
>> #define UBLK_PARAM_TYPE_DISCARD (1 << 1)
>> #define UBLK_PARAM_TYPE_DEVT (1 << 2)
>> +#define UBLK_PARAM_TYPE_ZONED (1 << 3)
>> __u32 types; /* types of parameter included */
>> struct ublk_param_basic basic;
>> struct ublk_param_discard discard;
>> struct ublk_param_devt devt;
>> + struct ublk_param_zoned zoned;
>> };
>> #endif
> I applied the patches and verified it compiles with and without the
> CONFIG_BLK_DEV_ZONED flag.
> Working on creating a ublk device also.
For reference, here are my test scripts:
------------------------------------
test-append.sh:
#!/bin/bash
set -e
set -x
/mnt/nullblk.py
./ublk add -x --usercopy -t loop -f /dev/nullblk -q 8 -d 128
mkzonefs -f /dev/ublkb0
mount -t zonefs /dev/ublkb0 /zone
fio /mnt/append.fio
umount /zone
./ublk del 0
------------------------------------
nullblk.py:
#!/usr/bin/env python3
from subprocess import run
from pathlib import Path
block_size_bytes = 4096
zone_size_mib = 1024
num_zones = 32
config_path = Path("/sys/kernel/config/nullb")
# Remove all devices
for node in config_path.iterdir():
if node.is_dir():
node.rmdir()
device_capacity_mib = zone_size_mib * num_zones
control = config_path / "nullblk"
control.mkdir()
(control / "blocksize").write_text(str(block_size_bytes))
(control / "completion_nsec").write_text("0")
(control / "irqmode").write_text("0")
(control / "queue_mode").write_text("2")
(control / "hw_queue_depth").write_text("1024")
(control / "memory_backed").write_text("1")
(control / "size").write_text(str(device_capacity_mib))
(control / "zoned").write_text("1")
(control / "zone_size").write_text(str(zone_size_mib))
(control / "zone_nr_conv").write_text("0")
(control / "zone_max_active").write_text("14")
(control / "zone_max_open").write_text("14")
(control / "power").write_text("1")
------------------------------------
append.fio:
[global]
ioengine=psync
do_verify=1
verify=md5
random_generator=lfsr
[append0]
file_append=1
rw=write
bs=4k
direct=1
size=1024m
filename=/zone/seq/0
[append1]
file_append=1
rw=write
bs=4k
direct=1
size=1024m
filename=/zone/seq/1
[append2]
file_append=1
rw=write
bs=4k
direct=1
size=1024m
filename=/zone/seq/2
[append3]
file_append=1
rw=write
bs=4k
direct=1
size=1024m
filename=/zone/seq/3
[append4]
file_append=1
rw=write
bs=4k
direct=1
size=1024m
filename=/zone/seq/4
[append5]
file_append=1
rw=write
bs=4k
direct=1
size=1024m
filename=/zone/seq/5
...
Nothing fancy, but it will test the append path. I have been running the
ZenFS test suite as well, but that does not hit the append path. Maybe
if you run it against zonefs?
Best regards,
Andreas
Ming Lei <[email protected]> writes:
> On Wed, Jun 28, 2023 at 09:06:48PM +0200, Andreas Hindborg wrote:
>> From: Andreas Hindborg <[email protected]>
>>
>> Add zoned storage support to ublk: report_zones and operations:
>> - REQ_OP_ZONE_OPEN
>> - REQ_OP_ZONE_CLOSE
>> - REQ_OP_ZONE_FINISH
>> - REQ_OP_ZONE_RESET
>>
>> Note: This commit changes the ublk kernel module name from `ublk_drv.ko` to
>> `ublk.ko` in order to link multiple translation units into the module.
>>
>> Signed-off-by: Andreas Hindborg <[email protected]>
>> ---
>> MAINTAINERS | 1 +
>> drivers/block/Kconfig | 4 +
>> drivers/block/Makefile | 4 +-
>> drivers/block/ublk_drv-zoned.c | 144 +++++++++++++++++++++++++++++++++
>> drivers/block/ublk_drv.c | 64 +++++++++++++--
>> drivers/block/ublk_drv.h | 15 ++++
>> include/uapi/linux/ublk_cmd.h | 14 ++++
>> 7 files changed, 239 insertions(+), 7 deletions(-)
>> create mode 100644 drivers/block/ublk_drv-zoned.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index ace71c90751c..db8a8deb5926 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -21555,6 +21555,7 @@ S: Maintained
>> F: Documentation/block/ublk.rst
>> F: drivers/block/ublk_drv.c
>> F: drivers/block/ublk_drv.h
>> +F: drivers/block/ublk_drv-zoned.c
>> F: include/uapi/linux/ublk_cmd.h
>>
>> UCLINUX (M68KNOMMU AND COLDFIRE)
>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>> index 5b9d4aaebb81..c58dfd035557 100644
>> --- a/drivers/block/Kconfig
>> +++ b/drivers/block/Kconfig
>> @@ -402,6 +402,10 @@ config BLKDEV_UBLK_LEGACY_OPCODES
>> suggested to enable N if your application(ublk server) switches to
>> ioctl command encoding.
>>
>> +config BLK_DEV_UBLK_ZONED
>> + def_bool y
>> + depends on BLK_DEV_UBLK && BLK_DEV_ZONED
>> +
>> source "drivers/block/rnbd/Kconfig"
>>
>> endif # BLK_DEV
>> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
>> index 101612cba303..bc1649e20ec2 100644
>> --- a/drivers/block/Makefile
>> +++ b/drivers/block/Makefile
>> @@ -37,6 +37,8 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
>>
>> obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
>>
>> -obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
>> +obj-$(CONFIG_BLK_DEV_UBLK) += ublk.o
>> +ublk-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
>> +ublk-$(CONFIG_BLK_DEV_UBLK_ZONED) += ublk_drv-zoned.o
>>
>> swim_mod-y := swim.o swim_asm.o
>> diff --git a/drivers/block/ublk_drv-zoned.c b/drivers/block/ublk_drv-zoned.c
>> new file mode 100644
>> index 000000000000..ea86bf4b3681
>> --- /dev/null
>> +++ b/drivers/block/ublk_drv-zoned.c
>> @@ -0,0 +1,144 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright 2023 Andreas Hindborg <[email protected]>
>> + */
>> +#include <linux/blkzoned.h>
>> +#include <linux/ublk_cmd.h>
>> +#include <linux/vmalloc.h>
>> +
>> +#include "ublk_drv.h"
>> +
>> +void ublk_set_nr_zones(struct ublk_device *ub)
>> +{
>> + const struct ublk_param_basic *p = &ub->params.basic;
>> +
>> + if (ub->dev_info.flags & UBLK_F_ZONED && p->chunk_sectors)
>> + ub->ub_disk->nr_zones = p->dev_sectors / p->chunk_sectors;
>> +}
>> +
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub)
>> +{
>> + const struct ublk_param_zoned *p = &ub->params.zoned;
>> +
>> + if (ub->dev_info.flags & UBLK_F_ZONED) {
>> + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
>> + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
>> + }
>> +}
>> +
>> +int ublk_revalidate_disk_zones(struct gendisk *disk)
>> +{
>> + return blk_revalidate_disk_zones(disk, NULL);
>> +}
>> +
>> +/* Based on virtblk_alloc_report_buffer */
>> +static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
>> + unsigned int nr_zones,
>> + unsigned int zone_sectors, size_t *buflen)
>> +{
>> + struct request_queue *q = ublk->ub_disk->queue;
>> + size_t bufsize;
>> + void *buf;
>> +
>> + nr_zones = min_t(unsigned int, nr_zones,
>> + get_capacity(ublk->ub_disk) >> ilog2(zone_sectors));
>> +
>> + bufsize = nr_zones * sizeof(struct blk_zone);
>> + bufsize =
>> + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
>> + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
>
> This means 'blk_zone' (including the types of all its sub-fields) is reused as part of ublk's UAPI.
> That is smart, but please document it next to the definition of UBLK_IO_OP_REPORT_ZONES.
Will do.
>
>> +
>> + while (bufsize >= sizeof(struct blk_zone)) {
>> + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>> + if (buf) {
>> + *buflen = bufsize;
>> + return buf;
>> + }
>> + bufsize >>= 1;
>> + }
>> +
>> + bufsize = 0;
>> + return NULL;
>> +}
>> +
>> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> + unsigned int nr_zones, report_zones_cb cb, void *data)
>> +{
>> + struct ublk_device *ub = disk->private_data;
>> + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
>> + unsigned int first_zone = sector >> ilog2(zone_size_sectors);
>> + unsigned int done_zones = 0;
>> + unsigned int max_zones_per_request;
>> + struct blk_zone *buffer;
>> + size_t buffer_length;
>> +
>> + if (!(ub->dev_info.flags & UBLK_F_ZONED))
>> + return -EOPNOTSUPP;
>> +
>> + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
>> + nr_zones);
>> +
>> + buffer = ublk_alloc_report_buffer(ub, nr_zones, zone_size_sectors,
>> + &buffer_length);
>> + if (!buffer)
>> + return -ENOMEM;
>> +
>> + max_zones_per_request = buffer_length / sizeof(struct blk_zone);
>> +
>> + while (done_zones < nr_zones) {
>> + unsigned int remaining_zones = nr_zones - done_zones;
>> + unsigned int zones_in_request = min_t(
>> + unsigned int, remaining_zones, max_zones_per_request);
>> + int err = 0;
>> + struct request *req;
>> + struct ublk_rq_data *pdu;
>> + blk_status_t status;
>> +
>> + memset(buffer, 0, buffer_length);
>> +
>> + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
>> + if (IS_ERR(req))
>> + return PTR_ERR(req);
>> +
>> + pdu = blk_mq_rq_to_pdu(req);
>> + pdu->operation = UBLK_IO_OP_REPORT_ZONES;
>> + pdu->sector = sector;
>> + pdu->nr_sectors = remaining_zones * zone_size_sectors;
>> +
>> + err = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
>> + GFP_KERNEL);
>> + if (err) {
>> + blk_mq_free_request(req);
>> + kvfree(buffer);
>> + return err;
>> + }
>> +
>> + status = blk_execute_rq(req, 0);
>> + err = blk_status_to_errno(status);
>> + blk_mq_free_request(req);
>> + if (err) {
>> + kvfree(buffer);
>> + return err;
>> + }
>> +
>> + for (unsigned int i = 0; i < zones_in_request; i++) {
>> + struct blk_zone *zone = buffer + i;
>> +
>> + err = cb(zone, i, data);
>> + if (err)
>> + return err;
>> +
>> + done_zones++;
>> + sector += zone_size_sectors;
>> +
>> + /* A zero length zone means don't ask for more zones */
>> + if (!zone->len) {
>
> This convention is part of the UBLK_IO_OP_REPORT_ZONES UAPI, so please document it
> next to the definition of UBLK_IO_OP_REPORT_ZONES.
OK.
>
>> + kvfree(buffer);
>> + return done_zones;
>> + }
>> + }
>> + }
>> +
>> + kvfree(buffer);
>> + return done_zones;
>> +}
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index e519dc0d9fe7..88fa39853c61 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -57,12 +57,13 @@
>> | UBLK_F_USER_RECOVERY_REISSUE \
>> | UBLK_F_UNPRIVILEGED_DEV \
>> | UBLK_F_CMD_IOCTL_ENCODE \
>> - | UBLK_F_USER_COPY)
>> + | UBLK_F_USER_COPY \
>> + | UBLK_F_ZONED)
>>
>> /* All UBLK_PARAM_TYPE_* should be included here */
>> -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
>> - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
>> -
>> +#define UBLK_PARAM_TYPE_ALL \
>> + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
>> + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
>>
>> struct ublk_uring_cmd_pdu {
>> struct ublk_queue *ubq;
>> @@ -209,6 +210,9 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
>> set_disk_ro(ub->ub_disk, true);
>>
>> set_capacity(ub->ub_disk, p->dev_sectors);
>> +
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> + ublk_set_nr_zones(ub);
>> }
>>
>> static void ublk_dev_param_discard_apply(struct ublk_device *ub)
>> @@ -269,6 +273,9 @@ static int ublk_apply_params(struct ublk_device *ub)
>> if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
>> ublk_dev_param_discard_apply(ub);
>>
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (ub->params.types & UBLK_PARAM_TYPE_ZONED))
>> + ublk_dev_param_zoned_apply(ub);
>> +
>> return 0;
>> }
>>
>> @@ -439,6 +446,7 @@ static const struct block_device_operations ub_fops = {
>> .owner = THIS_MODULE,
>> .open = ublk_open,
>> .free_disk = ublk_free_disk,
>> + .report_zones = ublk_report_zones,
>> };
>>
>> #define UBLK_MAX_PIN_PAGES 32
>> @@ -553,7 +561,8 @@ static inline bool ublk_need_map_req(const struct request *req)
>>
>> static inline bool ublk_need_unmap_req(const struct request *req)
>> {
>> - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
>> + return ublk_rq_has_data(req) &&
>> + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
>> }
>>
>> static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
>> @@ -637,6 +646,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>> {
>> struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
>> struct ublk_io *io = &ubq->ios[req->tag];
>> + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
>> u32 ublk_op;
>>
>> switch (req_op(req)) {
>> @@ -655,6 +665,35 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
>> case REQ_OP_WRITE_ZEROES:
>> ublk_op = UBLK_IO_OP_WRITE_ZEROES;
>> break;
>> + case REQ_OP_ZONE_OPEN:
>> + ublk_op = UBLK_IO_OP_ZONE_OPEN;
>> + break;
>> + case REQ_OP_ZONE_CLOSE:
>> + ublk_op = UBLK_IO_OP_ZONE_CLOSE;
>> + break;
>> + case REQ_OP_ZONE_FINISH:
>> + ublk_op = UBLK_IO_OP_ZONE_FINISH;
>> + break;
>> + case REQ_OP_ZONE_RESET:
>> + ublk_op = UBLK_IO_OP_ZONE_RESET;
>> + break;
>> + case REQ_OP_DRV_IN:
>> + ublk_op = pdu->operation;
>> + switch (ublk_op) {
>> + case UBLK_IO_OP_REPORT_ZONES:
>> + iod->op_flags = ublk_op | ublk_req_build_flags(req);
>> + iod->nr_sectors = pdu->nr_sectors;
>> + iod->start_sector = pdu->sector;
>> + iod->addr = io->addr;
>
> As Damien commented, zone append needs to be moved into this patch,
> so zoned depends on user copy feature, and you can simply fail to
> add device if user copy isn't enabled.
>
> So the above 'iod->addr = io->addr' can be removed.
Thanks.
>
>> + return BLK_STS_OK;
>> + default:
>> + return BLK_STS_IOERR;
>> + }
>> + case REQ_OP_ZONE_APPEND:
>> + case REQ_OP_ZONE_RESET_ALL:
>> + case REQ_OP_DRV_OUT:
>> + /* We do not support zone append or reset_all yet */
>> + fallthrough;
>> default:
>> return BLK_STS_IOERR;
>> }
>> @@ -708,7 +747,8 @@ static inline void __ublk_complete_rq(struct request *req)
>> *
>> * Both the two needn't unmap.
>> */
>> - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
>> + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
>> + req_op(req) != REQ_OP_DRV_IN)
>> goto exit;
>>
>> /* for READ request, writing data in iod->addr to rq buffers */
>> @@ -1835,6 +1875,15 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
>> if (ub->nr_privileged_daemon != ub->nr_queues_ready)
>> set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
>>
>> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> + ub->dev_info.flags & UBLK_F_ZONED) {
>> + disk_set_zoned(disk, BLK_ZONED_HM);
>> + blk_queue_required_elevator_features(disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE);
>> + ret = ublk_revalidate_disk_zones(disk);
>> + if (ret)
>> + goto out_put_disk;
>> + }
>> +
>> get_device(&ub->cdev_dev);
>> ub->dev_info.state = UBLK_S_DEV_LIVE;
>> ret = add_disk(disk);
>> @@ -1997,6 +2046,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
>> if (ub->dev_info.flags & UBLK_F_USER_COPY)
>> ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
>>
>> + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
>> + ub->dev_info.flags &= ~UBLK_F_ZONED;
>> +
>> /* We are not ready to support zero copy */
>> ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
>>
>> diff --git a/drivers/block/ublk_drv.h b/drivers/block/ublk_drv.h
>> index f81e62256456..7242430fd6b9 100644
>> --- a/drivers/block/ublk_drv.h
>> +++ b/drivers/block/ublk_drv.h
>> @@ -50,6 +50,21 @@ struct ublk_rq_data {
>> struct llist_node node;
>>
>> struct kref ref;
>> + enum ublk_op operation;
>> + __u64 sector;
>> + __u32 nr_sectors;
>> };
>>
>> +void ublk_set_nr_zones(struct ublk_device *ub);
>> +void ublk_dev_param_zoned_apply(struct ublk_device *ub);
>> +int ublk_revalidate_disk_zones(struct gendisk *disk);
>> +
>> +#ifdef CONFIG_BLK_DEV_UBLK_ZONED
>> +int ublk_report_zones(struct gendisk *disk, sector_t sector,
>> + unsigned int nr_zones, report_zones_cb cb,
>> + void *data);
>> +#else
>> +#define ublk_report_zones NULL
>> +#endif
>> +
>> #endif
>> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
>> index 471b3b983045..436525afffe8 100644
>> --- a/include/uapi/linux/ublk_cmd.h
>> +++ b/include/uapi/linux/ublk_cmd.h
>> @@ -176,6 +176,11 @@
>> /* Copy between request and user buffer by pread()/pwrite() */
>> #define UBLK_F_USER_COPY (1UL << 7)
>>
>> +/*
>> + * Enable zoned device support
>> + */
>> +#define UBLK_F_ZONED (1ULL << 8)
>> +
>> /* device state */
>> #define UBLK_S_DEV_DEAD 0
>> #define UBLK_S_DEV_LIVE 1
>> @@ -242,6 +247,7 @@ enum ublk_op {
>> UBLK_IO_OP_ZONE_APPEND = 13,
>> UBLK_IO_OP_ZONE_RESET = 15,
>> __UBLK_IO_OP_DRV_IN_START = 32,
>> + UBLK_IO_OP_REPORT_ZONES = __UBLK_IO_OP_DRV_IN_START,
>> __UBLK_IO_OP_DRV_IN_END = 96,
>> __UBLK_IO_OP_DRV_OUT_START = __UBLK_IO_OP_DRV_IN_END,
>> __UBLK_IO_OP_DRV_OUT_END = 160,
>> @@ -342,6 +348,12 @@ struct ublk_param_devt {
>> __u32 disk_minor;
>> };
>>
>> +struct ublk_param_zoned {
>> + __u32 max_open_zones;
>> + __u32 max_active_zones;
>> + __u8 reserved[24];
>> +};
>
> Maybe you can simply borrow virtio_blk_zoned_characteristics definition
> here, and make several reserved words.
>
> But that is fine, given six u32 should be flexible enough compared with
> virtio_blk_zoned_characteristics.
I agree, this should be enough. One of the reserved fields will be used for the
max zone append size, which leaves 20 reserved bytes.
Best regards,
Andreas