This patchset tries to add support for TP4065a ("Simple Copy Command"),
v2020.05.04 ("Ratified")
The Specification can be found in following link.
https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
This is an RFC. Looking forward for any feedbacks or other alternate
designs for plumbing simple copy to IO stack.
Simple copy command is a copy offloading operation and is used to copy
multiple contiguous ranges (source_ranges) of LBA's to a single destination
LBA within the device reducing traffic between host and device.
This implementation accepts destination, no of sources and arrays of
source ranges from application and attach it as payload to the bio and
submits to the device.
Following limits are added to queue limits and are exposed in sysfs
to userspace
- *max_copy_sectors* limits the sum of all source_range length
- *max_copy_nr_ranges* limits the number of source ranges
- *max_copy_range_sectors* limit the maximum number of sectors
that can constitute a single source range.
Changes from v1:
1. Fix memory leak in __blkdev_issue_copy
2. Unmark blk_check_copy inline
3. Fix line break in blk_check_copy_eod
4. Remove p checks and made code more readable
5. Don't use bio_set_op_attrs and remove op and set
bi_opf directly
6. Use struct_size to calculate total_size
7. Fix partition remap of copy destination
8. Remove mcl,mssrl,msrc from nvme_ns
9. Initialize copy queue limits to 0 in nvme_config_copy
10. Remove return in QUEUE_FLAG_COPY check
11. Remove unused OCFS
SelvaKumar S (2):
block: add simple copy support
nvme: add simple copy support
block/blk-core.c | 94 ++++++++++++++++++++++++++---
block/blk-lib.c | 123 ++++++++++++++++++++++++++++++++++++++
block/blk-merge.c | 2 +
block/blk-settings.c | 11 ++++
block/blk-sysfs.c | 23 +++++++
block/blk-zoned.c | 1 +
block/bounce.c | 1 +
block/ioctl.c | 43 +++++++++++++
drivers/nvme/host/core.c | 87 +++++++++++++++++++++++++++
include/linux/bio.h | 1 +
include/linux/blk_types.h | 15 +++++
include/linux/blkdev.h | 15 +++++
include/linux/nvme.h | 43 ++++++++++++-
include/uapi/linux/fs.h | 13 ++++
14 files changed, 461 insertions(+), 11 deletions(-)
--
2.25.1
Add support for TP 4065a ("Simple Copy Command"), v2020.05.04
("Ratified")
The implementation uses the payload passed from the block layer
to form simple copy command. Set the device copy limits to queue
limits.
Signed-off-by: SelvaKumar S <[email protected]>
Signed-off-by: Kanchan Joshi <[email protected]>
Signed-off-by: Nitesh Shetty <[email protected]>
Signed-off-by: Javier González <[email protected]>
---
drivers/nvme/host/core.c | 87 ++++++++++++++++++++++++++++++++++++++++
include/linux/nvme.h | 43 ++++++++++++++++++--
2 files changed, 127 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9b6ebeb29cca..ff45e57223f0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -647,6 +647,65 @@ static inline void nvme_setup_flush(struct nvme_ns *ns,
cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}
+static inline blk_status_t nvme_setup_copy(struct nvme_ns *ns,
+ struct request *req, struct nvme_command *cmnd)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ struct nvme_copy_range *range = NULL;
+ struct blk_copy_payload *payload;
+ unsigned short nr_range = 0;
+ u16 control = 0, ssrl;
+ u32 dsmgmt = 0;
+ u64 slba;
+ int i;
+
+ payload = bio_data(req->bio);
+ nr_range = payload->copy_range;
+
+ if (req->cmd_flags & REQ_FUA)
+ control |= NVME_RW_FUA;
+
+ if (req->cmd_flags & REQ_FAILFAST_DEV)
+ control |= NVME_RW_LR;
+
+ cmnd->copy.opcode = nvme_cmd_copy;
+ cmnd->copy.nsid = cpu_to_le32(ns->head->ns_id);
+ cmnd->copy.sdlba = cpu_to_le64(blk_rq_pos(req) >> (ns->lba_shift - 9));
+
+ range = kmalloc_array(nr_range, sizeof(*range),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!range)
+ return BLK_STS_RESOURCE;
+
+ for (i = 0; i < nr_range; i++) {
+ slba = payload->range[i].src;
+ slba = slba >> (ns->lba_shift - 9);
+
+ ssrl = payload->range[i].len;
+ ssrl = ssrl >> (ns->lba_shift - 9);
+
+ range[i].slba = cpu_to_le64(slba);
+ range[i].nlb = cpu_to_le16(ssrl - 1);
+ }
+
+ cmnd->copy.nr_range = nr_range - 1;
+
+ req->special_vec.bv_page = virt_to_page(range);
+ req->special_vec.bv_offset = offset_in_page(range);
+ req->special_vec.bv_len = sizeof(*range) * nr_range;
+ req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+ if (ctrl->nr_streams)
+ nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
+
+ //TBD end-to-end
+
+ cmnd->rw.control = cpu_to_le16(control);
+ cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+
+ return BLK_STS_OK;
+}
+
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd)
{
@@ -829,6 +888,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd);
break;
+ case REQ_OP_COPY:
+ ret = nvme_setup_copy(ns, req, cmd);
+ break;
case REQ_OP_READ:
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
break;
@@ -1850,6 +1912,29 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
}
+static void nvme_config_copy(struct gendisk *disk, struct nvme_ns *ns,
+ struct nvme_id_ns *id)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ struct request_queue *queue = disk->queue;
+
+ if (!(ctrl->oncs & NVME_CTRL_ONCS_COPY)) {
+ queue->limits.max_copy_sectors = 0;
+ queue->limits.max_copy_range_sectors = 0;
+ queue->limits.max_copy_nr_ranges = 0;
+ blk_queue_flag_clear(QUEUE_FLAG_COPY, queue);
+ return;
+ }
+
+ /* setting copy limits */
+ blk_queue_flag_test_and_set(QUEUE_FLAG_COPY, queue);
+ queue->limits.max_copy_sectors = le64_to_cpu(id->mcl) *
+ (1 << (ns->lba_shift - 9));
+ queue->limits.max_copy_range_sectors = le32_to_cpu(id->mssrl) *
+ (1 << (ns->lba_shift - 9));
+ queue->limits.max_copy_nr_ranges = id->msrc + 1;
+}
+
static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
{
u64 max_blocks;
@@ -2045,6 +2130,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
set_capacity_and_notify(disk, capacity);
nvme_config_discard(disk, ns);
+ nvme_config_copy(disk, ns, id);
nvme_config_write_zeroes(disk, ns);
if (id->nsattr & NVME_NS_ATTR_RO)
@@ -4616,6 +4702,7 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_copy_command) != 64);
BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d92535997687..11ed72a2164d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -289,7 +289,7 @@ struct nvme_id_ctrl {
__u8 nvscc;
__u8 nwpc;
__le16 acwu;
- __u8 rsvd534[2];
+ __le16 ocfs;
__le32 sgls;
__le32 mnan;
__u8 rsvd544[224];
@@ -314,6 +314,7 @@ enum {
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_ONCS_RESERVATIONS = 1 << 5,
NVME_CTRL_ONCS_TIMESTAMP = 1 << 6,
+ NVME_CTRL_ONCS_COPY = 1 << 8,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
@@ -362,7 +363,10 @@ struct nvme_id_ns {
__le16 npdg;
__le16 npda;
__le16 nows;
- __u8 rsvd74[18];
+ __le16 mssrl;
+ __le32 mcl;
+ __u8 msrc;
+ __u8 rsvd91[11];
__le32 anagrpid;
__u8 rsvd96[3];
__u8 nsattr;
@@ -673,6 +677,7 @@ enum nvme_opcode {
nvme_cmd_resv_report = 0x0e,
nvme_cmd_resv_acquire = 0x11,
nvme_cmd_resv_release = 0x15,
+ nvme_cmd_copy = 0x19,
nvme_cmd_zone_mgmt_send = 0x79,
nvme_cmd_zone_mgmt_recv = 0x7a,
nvme_cmd_zone_append = 0x7d,
@@ -691,7 +696,8 @@ enum nvme_opcode {
nvme_opcode_name(nvme_cmd_resv_register), \
nvme_opcode_name(nvme_cmd_resv_report), \
nvme_opcode_name(nvme_cmd_resv_acquire), \
- nvme_opcode_name(nvme_cmd_resv_release))
+ nvme_opcode_name(nvme_cmd_resv_release), \
+ nvme_opcode_name(nvme_cmd_copy))
/*
@@ -863,6 +869,36 @@ struct nvme_dsm_range {
__le64 slba;
};
+struct nvme_copy_command {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __u64 rsvd2;
+ __le64 metadata;
+ union nvme_data_ptr dptr;
+ __le64 sdlba;
+ __u8 nr_range;
+ __u8 rsvd12;
+ __le16 control;
+ __le16 rsvd13;
+ __le16 dspec;
+ __le32 ilbrt;
+ __le16 lbat;
+ __le16 lbatm;
+};
+
+struct nvme_copy_range {
+ __le64 rsvd0;
+ __le64 slba;
+ __le16 nlb;
+ __le16 rsvd18;
+ __le32 rsvd20;
+ __le32 eilbrt;
+ __le16 elbat;
+ __le16 elbatm;
+};
+
struct nvme_write_zeroes_cmd {
__u8 opcode;
__u8 flags;
@@ -1400,6 +1436,7 @@ struct nvme_command {
struct nvme_download_firmware dlfw;
struct nvme_format_cmd format;
struct nvme_dsm_cmd dsm;
+ struct nvme_copy_command copy;
struct nvme_write_zeroes_cmd write_zeroes;
struct nvme_zone_mgmt_send_cmd zms;
struct nvme_zone_mgmt_recv_cmd zmr;
--
2.25.1
Add new BLKCOPY ioctl that offloads copying of multiple sources
to a destination to the device. Accept copy_ranges that contains
destination, no of sources and pointer to the array of source
ranges. Each range_entry contains start and length of source
ranges (in bytes).
Introduce REQ_OP_COPY, a no-merge copy offload operation. Create
bio with control information as payload and submit to the device.
REQ_OP_COPY(19) is a write op and takes zone_write_lock when submitted
to zoned device.
Introduce queue limits for simple copy and other helper functions.
Add device limits as sysfs entries.
- max_copy_sectors
- max_copy_ranges_sectors
- max_copy_nr_ranges
max_copy_sectors = 0 indicates the device doesn't support copy.
Signed-off-by: SelvaKumar S <[email protected]>
Signed-off-by: Kanchan Joshi <[email protected]>
Signed-off-by: Nitesh Shetty <[email protected]>
Signed-off-by: Javier González <[email protected]>
---
block/blk-core.c | 94 ++++++++++++++++++++++++++---
block/blk-lib.c | 123 ++++++++++++++++++++++++++++++++++++++
block/blk-merge.c | 2 +
block/blk-settings.c | 11 ++++
block/blk-sysfs.c | 23 +++++++
block/blk-zoned.c | 1 +
block/bounce.c | 1 +
block/ioctl.c | 43 +++++++++++++
include/linux/bio.h | 1 +
include/linux/blk_types.h | 15 +++++
include/linux/blkdev.h | 15 +++++
include/uapi/linux/fs.h | 13 ++++
12 files changed, 334 insertions(+), 8 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 2db8bda43b6e..07d64514e77b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -719,6 +719,17 @@ static noinline int should_fail_bio(struct bio *bio)
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+static inline int bio_check_copy_eod(struct bio *bio, sector_t start,
+ sector_t nr_sectors, sector_t maxsector)
+{
+ if (nr_sectors && maxsector &&
+ (nr_sectors > maxsector || start > maxsector - nr_sectors)) {
+ handle_bad_sector(bio, maxsector);
+ return -EIO;
+ }
+ return 0;
+}
+
/*
* Check whether this bio extends beyond the end of the device or partition.
* This may well happen - the kernel calls bread() without checking the size of
@@ -737,6 +748,65 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
return 0;
}
+/*
+ * Check for copy limits and remap source ranges if needed.
+ */
+static int blk_check_copy(struct bio *bio)
+{
+ struct hd_struct *p = NULL;
+ struct request_queue *q = bio->bi_disk->queue;
+ struct blk_copy_payload *payload;
+ int i, maxsector, start_sect = 0, ret = -EIO;
+ unsigned short nr_range;
+
+ rcu_read_lock();
+
+ if (bio->bi_partno) {
+ p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+ if (unlikely(!p))
+ goto out;
+ if (unlikely(bio_check_ro(bio, p)))
+ goto out;
+ maxsector = part_nr_sects_read(p);
+ start_sect = p->start_sect;
+ } else {
+ if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+ goto out;
+ maxsector = get_capacity(bio->bi_disk);
+ }
+
+ payload = bio_data(bio);
+ nr_range = payload->copy_range;
+
+ /* cannot handle copy crossing nr_ranges limit */
+ if (payload->copy_range > q->limits.max_copy_nr_ranges)
+ goto out;
+
+ /* cannot handle copy more than copy limits */
+ if (payload->copy_size > q->limits.max_copy_sectors)
+ goto out;
+
+ /* check if copy length crosses eod */
+ if (unlikely(bio_check_copy_eod(bio, bio->bi_iter.bi_sector,
+ payload->copy_size, maxsector)))
+ goto out;
+ bio->bi_iter.bi_sector += start_sect;
+
+ for (i = 0; i < nr_range; i++) {
+ if (unlikely(bio_check_copy_eod(bio, payload->range[i].src,
+ payload->range[i].len, maxsector)))
+ goto out;
+ payload->range[i].src += start_sect;
+ }
+
+ if (p)
+ bio->bi_partno = 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
@@ -825,14 +895,16 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
if (should_fail_bio(bio))
goto end_io;
- if (bio->bi_partno) {
- if (unlikely(blk_partition_remap(bio)))
- goto end_io;
- } else {
- if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
- goto end_io;
- if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
- goto end_io;
+ if (likely(!op_is_copy(bio->bi_opf))) {
+ if (bio->bi_partno) {
+ if (unlikely(blk_partition_remap(bio)))
+ goto end_io;
+ } else {
+ if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+ goto end_io;
+ if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
+ goto end_io;
+ }
}
/*
@@ -856,6 +928,12 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
if (!blk_queue_discard(q))
goto not_supported;
break;
+ case REQ_OP_COPY:
+ if (!blk_queue_copy(q))
+ goto not_supported;
+ if (unlikely(blk_check_copy(bio)))
+ goto end_io;
+ break;
case REQ_OP_SECURE_ERASE:
if (!blk_queue_secure_erase(q))
goto not_supported;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index e90614fd8d6a..96f727c4d0de 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -150,6 +150,129 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_discard);
+int __blkdev_issue_copy(struct block_device *bdev, sector_t dest,
+ sector_t nr_srcs, struct range_entry *rlist, gfp_t gfp_mask,
+ int flags, struct bio **biop)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct bio *bio;
+ struct blk_copy_payload *payload;
+ sector_t bs_mask;
+ sector_t src_sects, len = 0, total_len = 0;
+ int i, ret, total_size;
+
+ if (!q)
+ return -ENXIO;
+
+ if (!nr_srcs)
+ return -EINVAL;
+
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
+ if (!blk_queue_copy(q))
+ return -EOPNOTSUPP;
+
+ bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+ if (dest & bs_mask)
+ return -EINVAL;
+
+ total_size = struct_size(payload, range, nr_srcs);
+ payload = kmalloc(total_size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!payload)
+ return -ENOMEM;
+
+ bio = bio_alloc(gfp_mask, 1);
+ bio->bi_iter.bi_sector = dest;
+ bio->bi_opf = REQ_OP_COPY | REQ_NOMERGE;
+ bio_set_dev(bio, bdev);
+
+ payload->dest = dest;
+
+ for (i = 0; i < nr_srcs; i++) {
+ /* copy payload provided are in bytes */
+ src_sects = rlist[i].src;
+ if (src_sects & bs_mask) {
+ ret = -EINVAL;
+ goto err;
+ }
+ src_sects = src_sects >> SECTOR_SHIFT;
+
+ if (len & bs_mask) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ len = rlist[i].len >> SECTOR_SHIFT;
+ if (len > q->limits.max_copy_range_sectors) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ total_len += len;
+
+ WARN_ON_ONCE((src_sects << 9) > UINT_MAX);
+
+ payload->range[i].src = src_sects;
+ payload->range[i].len = len;
+ }
+
+ /* storing # of source ranges */
+ payload->copy_range = i;
+ /* storing copy len so far */
+ payload->copy_size = total_len;
+
+ ret = bio_add_page(bio, virt_to_page(payload), total_size,
+ offset_in_page(payload));
+ if (ret != total_size) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ *biop = bio;
+ return 0;
+err:
+ kfree(payload);
+ bio_put(bio);
+ return ret;
+}
+EXPORT_SYMBOL(__blkdev_issue_copy);
+
+/**
+ * blkdev_issue_copy - queue a copy
+ * @bdev: blockdev to issue copy for
+ * @dest: dest sector
+ * @nr_srcs: number of source ranges to copy
+ * @rlist: list of range entries
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_COPY_* flags to control behaviour //TODO
+ *
+ * Description:
+ * Issue a copy request for dest sector with source in rlist
+ */
+int blkdev_issue_copy(struct block_device *bdev, sector_t dest,
+ int nr_srcs, struct range_entry *rlist,
+ gfp_t gfp_mask, unsigned long flags)
+{
+ struct bio *bio = NULL;
+ int ret;
+
+ ret = __blkdev_issue_copy(bdev, dest, nr_srcs, rlist, gfp_mask, flags,
+ &bio);
+ if (!ret && bio) {
+ ret = submit_bio_wait(bio);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+
+ kfree(page_address(bio_first_bvec_all(bio)->bv_page) +
+ bio_first_bvec_all(bio)->bv_offset);
+ bio_put(bio);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
+
/**
* __blkdev_issue_write_same - generate number of bios with same page
* @bdev: target blockdev
diff --git a/block/blk-merge.c b/block/blk-merge.c
index bcf5e4580603..a16e7598d6ad 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -301,6 +301,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
struct bio *split = NULL;
switch (bio_op(*bio)) {
+ case REQ_OP_COPY:
+ break;
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9741d1d83e98..18e357939ed4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,6 +60,9 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->io_opt = 0;
lim->misaligned = 0;
lim->zoned = BLK_ZONED_NONE;
+ lim->max_copy_sectors = 0;
+ lim->max_copy_nr_ranges = 0;
+ lim->max_copy_range_sectors = 0;
}
EXPORT_SYMBOL(blk_set_default_limits);
@@ -549,6 +552,14 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
t->chunk_sectors = lcm_not_zero(t->chunk_sectors, b->chunk_sectors);
+ /* copy limits */
+ t->max_copy_sectors = min_not_zero(t->max_copy_sectors,
+ b->max_copy_sectors);
+ t->max_copy_range_sectors = min_not_zero(t->max_copy_range_sectors,
+ b->max_copy_range_sectors);
+ t->max_copy_nr_ranges = min_not_zero(t->max_copy_nr_ranges,
+ b->max_copy_nr_ranges);
+
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
t->physical_block_size = t->logical_block_size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..e5aabb6a3ac1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -166,6 +166,23 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
return queue_var_show(q->limits.discard_granularity, page);
}
+static ssize_t queue_max_copy_sectors_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(q->limits.max_copy_sectors, page);
+}
+
+static ssize_t queue_max_copy_range_sectors_show(struct request_queue *q,
+ char *page)
+{
+ return queue_var_show(q->limits.max_copy_range_sectors, page);
+}
+
+static ssize_t queue_max_copy_nr_ranges_show(struct request_queue *q,
+ char *page)
+{
+ return queue_var_show(q->limits.max_copy_nr_ranges, page);
+}
+
static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
{
@@ -590,6 +607,9 @@ QUEUE_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
+QUEUE_RO_ENTRY(queue_max_copy_sectors, "max_copy_sectors");
+QUEUE_RO_ENTRY(queue_max_copy_range_sectors, "max_copy_range_sectors");
+QUEUE_RO_ENTRY(queue_max_copy_nr_ranges, "max_copy_nr_ranges");
QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
@@ -636,6 +656,9 @@ static struct attribute *queue_attrs[] = {
&queue_discard_max_entry.attr,
&queue_discard_max_hw_entry.attr,
&queue_discard_zeroes_data_entry.attr,
+ &queue_max_copy_sectors_entry.attr,
+ &queue_max_copy_range_sectors_entry.attr,
+ &queue_max_copy_nr_ranges_entry.attr,
&queue_write_same_max_entry.attr,
&queue_write_zeroes_max_entry.attr,
&queue_zone_append_max_entry.attr,
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 6817a673e5ce..6e5fef3cc615 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -75,6 +75,7 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_WRITE:
+ case REQ_OP_COPY:
return blk_rq_zone_is_seq(rq);
default:
return false;
diff --git a/block/bounce.c b/block/bounce.c
index 162a6eee8999..7fbdc52decb3 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -254,6 +254,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
switch (bio_op(bio)) {
+ case REQ_OP_COPY:
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
diff --git a/block/ioctl.c b/block/ioctl.c
index 6b785181344f..a4a507d85e56 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -142,6 +142,47 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
GFP_KERNEL, flags);
}
+static int blk_ioctl_copy(struct block_device *bdev, fmode_t mode,
+ unsigned long arg, unsigned long flags)
+{
+ struct copy_range crange;
+ struct range_entry *rlist;
+ struct request_queue *q = bdev_get_queue(bdev);
+ sector_t dest;
+ int ret;
+
+ if (!(mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (!blk_queue_copy(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&crange, (void __user *)arg, sizeof(crange)))
+ return -EFAULT;
+
+ if (crange.dest & ((1 << SECTOR_SHIFT) - 1))
+ return -EFAULT;
+ dest = crange.dest >> SECTOR_SHIFT;
+
+ rlist = kmalloc_array(crange.nr_range, sizeof(*rlist),
+ GFP_ATOMIC | __GFP_NOWARN);
+
+ if (!rlist)
+ return -ENOMEM;
+
+ if (copy_from_user(rlist, (void __user *)crange.range_list,
+ sizeof(*rlist) * crange.nr_range)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = blkdev_issue_copy(bdev, dest, crange.nr_range,
+ rlist, GFP_KERNEL, flags);
+out:
+ kfree(rlist);
+ return ret;
+}
+
static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
unsigned long arg)
{
@@ -467,6 +508,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
case BLKSECDISCARD:
return blk_ioctl_discard(bdev, mode, arg,
BLKDEV_DISCARD_SECURE);
+ case BLKCOPY:
+ return blk_ioctl_copy(bdev, mode, arg, 0);
case BLKZEROOUT:
return blk_ioctl_zeroout(bdev, mode, arg);
case BLKREPORTZONE:
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ecf67108f091..7e40a37f0ee5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -71,6 +71,7 @@ static inline bool bio_has_data(struct bio *bio)
static inline bool bio_no_advance_iter(const struct bio *bio)
{
return bio_op(bio) == REQ_OP_DISCARD ||
+ bio_op(bio) == REQ_OP_COPY ||
bio_op(bio) == REQ_OP_SECURE_ERASE ||
bio_op(bio) == REQ_OP_WRITE_SAME ||
bio_op(bio) == REQ_OP_WRITE_ZEROES;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d9b69bbde5cc..4ecb9c16702d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -360,6 +360,8 @@ enum req_opf {
REQ_OP_ZONE_RESET = 15,
/* reset all the zone present on the device */
REQ_OP_ZONE_RESET_ALL = 17,
+ /* copy ranges within device */
+ REQ_OP_COPY = 19,
/* SCSI passthrough using struct scsi_request */
REQ_OP_SCSI_IN = 32,
@@ -486,6 +488,11 @@ static inline bool op_is_discard(unsigned int op)
return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
}
+static inline bool op_is_copy(unsigned int op)
+{
+ return (op & REQ_OP_MASK) == REQ_OP_COPY;
+}
+
/*
* Check if a bio or request operation is a zone management operation, with
* the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
@@ -545,4 +552,12 @@ struct blk_rq_stat {
u64 batch;
};
+struct blk_copy_payload {
+ sector_t dest;
+ int copy_range;
+ int copy_size;
+ int err;
+ struct range_entry range[];
+};
+
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 05b346a68c2e..dbeaeebf41c4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -340,10 +340,13 @@ struct queue_limits {
unsigned int max_zone_append_sectors;
unsigned int discard_granularity;
unsigned int discard_alignment;
+ unsigned int max_copy_sectors;
unsigned short max_segments;
unsigned short max_integrity_segments;
unsigned short max_discard_segments;
+ unsigned short max_copy_range_sectors;
+ unsigned short max_copy_nr_ranges;
unsigned char misaligned;
unsigned char discard_misaligned;
@@ -625,6 +628,7 @@ struct request_queue {
#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
+#define QUEUE_FLAG_COPY 30 /* supports copy */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_SAME_COMP) | \
@@ -647,6 +651,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
+#define blk_queue_copy(q) test_bit(QUEUE_FLAG_COPY, &(q)->queue_flags)
#define blk_queue_zone_resetall(q) \
test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
#define blk_queue_secure_erase(q) \
@@ -1059,6 +1064,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
return min(q->limits.max_discard_sectors,
UINT_MAX >> SECTOR_SHIFT);
+ if (unlikely(op == REQ_OP_COPY))
+ return q->limits.max_copy_sectors;
+
if (unlikely(op == REQ_OP_WRITE_SAME))
return q->limits.max_write_same_sectors;
@@ -1330,6 +1338,13 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop);
+extern int __blkdev_issue_copy(struct block_device *bdev, sector_t dest,
+ sector_t nr_srcs, struct range_entry *rlist, gfp_t gfp_mask,
+ int flags, struct bio **biop);
+extern int blkdev_issue_copy(struct block_device *bdev, sector_t dest,
+ int nr_srcs, struct range_entry *rlist,
+ gfp_t gfp_mask, unsigned long flags);
+
#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f44eb0a04afd..5cadb176317a 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -64,6 +64,18 @@ struct fstrim_range {
__u64 minlen;
};
+struct range_entry {
+ __u64 src;
+ __u64 len;
+};
+
+struct copy_range {
+ __u64 dest;
+ __u64 nr_range;
+ __u64 range_list;
+ __u64 rsvd;
+};
+
/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
#define FILE_DEDUPE_RANGE_SAME 0
#define FILE_DEDUPE_RANGE_DIFFERS 1
@@ -184,6 +196,7 @@ struct fsxattr {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+#define BLKCOPY _IOWR(0x12, 128, struct copy_range)
/*
* A jump here: 130-131 are reserved for zoned block devices
* (see uapi/linux/blkzoned.h)
--
2.25.1
On 2020/12/04 20:02, SelvaKumar S wrote:
> This patchset tries to add support for TP4065a ("Simple Copy Command"),
> v2020.05.04 ("Ratified")
>
> The Specification can be found in following link.
> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>
> This is an RFC. Looking forward for any feedbacks or other alternate
> designs for plumbing simple copy to IO stack.
>
> Simple copy command is a copy offloading operation and is used to copy
> multiple contiguous ranges (source_ranges) of LBA's to a single destination
> LBA within the device reducing traffic between host and device.
>
> This implementation accepts destination, no of sources and arrays of
> source ranges from application and attach it as payload to the bio and
> submits to the device.
>
> Following limits are added to queue limits and are exposed in sysfs
> to userspace
> - *max_copy_sectors* limits the sum of all source_range length
> - *max_copy_nr_ranges* limits the number of source ranges
> - *max_copy_range_sectors* limit the maximum number of sectors
> that can constitute a single source range.
Same comment as before. I think this is a good start, but for this to be really
useful to users and kernel components alike, this really needs copy emulation
for drives that do not have a native copy feature, similarly to what write zeros
handling for instance: if the drive does not have a copy command (simple copy
for NVMe or XCOPY for scsi), then the block layer should issue read/write
commands to seamlessly execute the copy. Otherwise, this will only serve a small
niche for users and will not be optimal for FS and DM drivers that could be
simplified with a generic block layer copy functionality.
This is my 10 cents though, others may differ about this.
>
> Changes from v1:
>
> 1. Fix memory leak in __blkdev_issue_copy
> 2. Unmark blk_check_copy inline
> 3. Fix line break in blk_check_copy_eod
> 4. Remove p checks and made code more readable
> 5. Don't use bio_set_op_attrs and remove op and set
> bi_opf directly
> 6. Use struct_size to calculate total_size
> 7. Fix partition remap of copy destination
> 8. Remove mcl,mssrl,msrc from nvme_ns
> 9. Initialize copy queue limits to 0 in nvme_config_copy
> 10. Remove return in QUEUE_FLAG_COPY check
> 11. Remove unused OCFS
>
> SelvaKumar S (2):
> block: add simple copy support
> nvme: add simple copy support
>
> block/blk-core.c | 94 ++++++++++++++++++++++++++---
> block/blk-lib.c | 123 ++++++++++++++++++++++++++++++++++++++
> block/blk-merge.c | 2 +
> block/blk-settings.c | 11 ++++
> block/blk-sysfs.c | 23 +++++++
> block/blk-zoned.c | 1 +
> block/bounce.c | 1 +
> block/ioctl.c | 43 +++++++++++++
> drivers/nvme/host/core.c | 87 +++++++++++++++++++++++++++
> include/linux/bio.h | 1 +
> include/linux/blk_types.h | 15 +++++
> include/linux/blkdev.h | 15 +++++
> include/linux/nvme.h | 43 ++++++++++++-
> include/uapi/linux/fs.h | 13 ++++
> 14 files changed, 461 insertions(+), 11 deletions(-)
>
--
Damien Le Moal
Western Digital Research
On Fri, Dec 04, 2020 at 11:25:12AM +0000, Damien Le Moal wrote:
> On 2020/12/04 20:02, SelvaKumar S wrote:
> > This patchset tries to add support for TP4065a ("Simple Copy Command"),
> > v2020.05.04 ("Ratified")
> >
> > The Specification can be found in following link.
> > https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
> >
> > This is an RFC. Looking forward for any feedbacks or other alternate
> > designs for plumbing simple copy to IO stack.
> >
> > Simple copy command is a copy offloading operation and is used to copy
> > multiple contiguous ranges (source_ranges) of LBA's to a single destination
> > LBA within the device reducing traffic between host and device.
> >
> > This implementation accepts destination, no of sources and arrays of
> > source ranges from application and attach it as payload to the bio and
> > submits to the device.
> >
> > Following limits are added to queue limits and are exposed in sysfs
> > to userspace
> > - *max_copy_sectors* limits the sum of all source_range length
> > - *max_copy_nr_ranges* limits the number of source ranges
> > - *max_copy_range_sectors* limit the maximum number of sectors
> > that can constitute a single source range.
>
> Same comment as before. I think this is a good start, but for this to be really
> useful to users and kernel components alike, this really needs copy emulation
> for drives that do not have a native copy feature, similarly to what write zeros
> handling for instance: if the drive does not have a copy command (simple copy
> for NVMe or XCOPY for scsi), then the block layer should issue read/write
> commands to seamlessly execute the copy. Otherwise, this will only serve a small
> niche for users and will not be optimal for FS and DM drivers that could be
> simplified with a generic block layer copy functionality.
>
> This is my 10 cents though, others may differ about this.
Yes, I agree that copy emulation support should be included with the
hardware enabled solution.
On 04.12.2020 23:40, Keith Busch wrote:
>On Fri, Dec 04, 2020 at 11:25:12AM +0000, Damien Le Moal wrote:
>> On 2020/12/04 20:02, SelvaKumar S wrote:
>> > This patchset tries to add support for TP4065a ("Simple Copy Command"),
>> > v2020.05.04 ("Ratified")
>> >
>> > The Specification can be found in following link.
>> > https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>> >
>> > This is an RFC. Looking forward for any feedbacks or other alternate
>> > designs for plumbing simple copy to IO stack.
>> >
>> > Simple copy command is a copy offloading operation and is used to copy
>> > multiple contiguous ranges (source_ranges) of LBA's to a single destination
>> > LBA within the device reducing traffic between host and device.
>> >
>> > This implementation accepts destination, no of sources and arrays of
>> > source ranges from application and attach it as payload to the bio and
>> > submits to the device.
>> >
>> > Following limits are added to queue limits and are exposed in sysfs
>> > to userspace
>> > - *max_copy_sectors* limits the sum of all source_range length
>> > - *max_copy_nr_ranges* limits the number of source ranges
>> > - *max_copy_range_sectors* limit the maximum number of sectors
>> > that can constitute a single source range.
>>
>> Same comment as before. I think this is a good start, but for this to be really
>> useful to users and kernel components alike, this really needs copy emulation
>> for drives that do not have a native copy feature, similarly to what write zeros
>> handling for instance: if the drive does not have a copy command (simple copy
>> for NVMe or XCOPY for scsi), then the block layer should issue read/write
>> commands to seamlessly execute the copy. Otherwise, this will only serve a small
>> niche for users and will not be optimal for FS and DM drivers that could be
>> simplified with a generic block layer copy functionality.
>>
>> This is my 10 cents though, others may differ about this.
>
>Yes, I agree that copy emulation support should be included with the
>hardware enabled solution.
Keith, Damien,
Can we do the block layer emulation with this patchset and then work in
follow-up patchses on (i) the FS interface with F2FS as a first user and
(ii) other HW accelerations such as XCOPY?
For XCOPY, I believe we need to have a separate discussion as much works
is already done that we should align to.
On 2020/12/07 16:46, [email protected] wrote:
> On 04.12.2020 23:40, Keith Busch wrote:
>> On Fri, Dec 04, 2020 at 11:25:12AM +0000, Damien Le Moal wrote:
>>> On 2020/12/04 20:02, SelvaKumar S wrote:
>>>> This patchset tries to add support for TP4065a ("Simple Copy Command"),
>>>> v2020.05.04 ("Ratified")
>>>>
>>>> The Specification can be found in following link.
>>>> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>>>>
>>>> This is an RFC. Looking forward for any feedbacks or other alternate
>>>> designs for plumbing simple copy to IO stack.
>>>>
>>>> Simple copy command is a copy offloading operation and is used to copy
>>>> multiple contiguous ranges (source_ranges) of LBA's to a single destination
>>>> LBA within the device reducing traffic between host and device.
>>>>
>>>> This implementation accepts destination, no of sources and arrays of
>>>> source ranges from application and attach it as payload to the bio and
>>>> submits to the device.
>>>>
>>>> Following limits are added to queue limits and are exposed in sysfs
>>>> to userspace
>>>> - *max_copy_sectors* limits the sum of all source_range length
>>>> - *max_copy_nr_ranges* limits the number of source ranges
>>>> - *max_copy_range_sectors* limit the maximum number of sectors
>>>> that can constitute a single source range.
>>>
>>> Same comment as before. I think this is a good start, but for this to be really
>>> useful to users and kernel components alike, this really needs copy emulation
>>> for drives that do not have a native copy feature, similarly to what write zeros
>>> handling for instance: if the drive does not have a copy command (simple copy
>>> for NVMe or XCOPY for scsi), then the block layer should issue read/write
>>> commands to seamlessly execute the copy. Otherwise, this will only serve a small
>>> niche for users and will not be optimal for FS and DM drivers that could be
>>> simplified with a generic block layer copy functionality.
>>>
>>> This is my 10 cents though, others may differ about this.
>>
>> Yes, I agree that copy emulation support should be included with the
>> hardware enabled solution.
>
> Keith, Damien,
>
> Can we do the block layer emulation with this patchset and then work in
> follow-up patchses on (i) the FS interface with F2FS as a first user and
> (ii) other HW accelerations such as XCOPY?
The initial patchset supporting NVMe simple copy and emulation copy, all under
an API that probably will be similar that of dm-kcopyd will cover all block
devices. Other hardware native support for copy functions such as scsi extended
copy can be added later under the hood without any API changes (or minimal changes).
I am not sure what you mean by "FS interface for F2FS": the block layer API for
this copy functionality will be what F2FS (and other FSes) will call. That is
the interface, no ?
> For XCOPY, I believe we need to have a separate discussion as much works
> is already done that we should align to.
I think Martin (added to this thread) and others have looked into it but I do
not think that anything made it into the kernel yet.
--
Damien Le Moal
Western Digital Research
On 07.12.2020 08:06, Damien Le Moal wrote:
>On 2020/12/07 16:46, [email protected] wrote:
>> On 04.12.2020 23:40, Keith Busch wrote:
>>> On Fri, Dec 04, 2020 at 11:25:12AM +0000, Damien Le Moal wrote:
>>>> On 2020/12/04 20:02, SelvaKumar S wrote:
>>>>> This patchset tries to add support for TP4065a ("Simple Copy Command"),
>>>>> v2020.05.04 ("Ratified")
>>>>>
>>>>> The Specification can be found in following link.
>>>>> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>>>>>
>>>>> This is an RFC. Looking forward for any feedbacks or other alternate
>>>>> designs for plumbing simple copy to IO stack.
>>>>>
>>>>> Simple copy command is a copy offloading operation and is used to copy
>>>>> multiple contiguous ranges (source_ranges) of LBA's to a single destination
>>>>> LBA within the device reducing traffic between host and device.
>>>>>
>>>>> This implementation accepts destination, no of sources and arrays of
>>>>> source ranges from application and attach it as payload to the bio and
>>>>> submits to the device.
>>>>>
>>>>> Following limits are added to queue limits and are exposed in sysfs
>>>>> to userspace
>>>>> - *max_copy_sectors* limits the sum of all source_range length
>>>>> - *max_copy_nr_ranges* limits the number of source ranges
>>>>> - *max_copy_range_sectors* limit the maximum number of sectors
>>>>> that can constitute a single source range.
>>>>
>>>> Same comment as before. I think this is a good start, but for this to be really
>>>> useful to users and kernel components alike, this really needs copy emulation
>>>> for drives that do not have a native copy feature, similarly to what write zeros
>>>> handling for instance: if the drive does not have a copy command (simple copy
>>>> for NVMe or XCOPY for scsi), then the block layer should issue read/write
>>>> commands to seamlessly execute the copy. Otherwise, this will only serve a small
>>>> niche for users and will not be optimal for FS and DM drivers that could be
>>>> simplified with a generic block layer copy functionality.
>>>>
>>>> This is my 10 cents though, others may differ about this.
>>>
>>> Yes, I agree that copy emulation support should be included with the
>>> hardware enabled solution.
>>
>> Keith, Damien,
>>
>> Can we do the block layer emulation with this patchset and then work in
>> follow-up patchses on (i) the FS interface with F2FS as a first user and
>> (ii) other HW accelerations such as XCOPY?
>
>The initial patchset supporting NVMe simple copy and emulation copy, all under
>an API that probably will be similar that of dm-kcopyd will cover all block
>devices. Other hardware native support for copy functions such as scsi extended
>copy can be added later under the hood without any API changes (or minimal changes).
Sounds good. That we can do. We will add a new patch for this.
>
>I am not sure what you mean by "FS interface for F2FS": the block layer API for
>this copy functionality will be what F2FS (and other FSes) will call. That is
>the interface, no ?
Essentially yes.. I mean adding the F2FS logic and potentially some
helpers to the block layer to aid GC.
>
>> For XCOPY, I believe we need to have a separate discussion as much works
>> is already done that we should align to.
>
>I think Martin (added to this thread) and others have looked into it but I do
>not think that anything made it into the kernel yet.
Exactly. Looking at some of the code posted through time and recalling
the discussions at LSF/MM, seems like there are a number of things we
are not addressing here that could be incorporated down the road, such
as dedicated syscalls / extensions, multi namespace / device support,
etc.
>
On 2020/12/07 17:16, [email protected] wrote:
> On 07.12.2020 08:06, Damien Le Moal wrote:
>> On 2020/12/07 16:46, [email protected] wrote:
>>> On 04.12.2020 23:40, Keith Busch wrote:
>>>> On Fri, Dec 04, 2020 at 11:25:12AM +0000, Damien Le Moal wrote:
>>>>> On 2020/12/04 20:02, SelvaKumar S wrote:
>>>>>> This patchset tries to add support for TP4065a ("Simple Copy Command"),
>>>>>> v2020.05.04 ("Ratified")
>>>>>>
>>>>>> The Specification can be found in following link.
>>>>>> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>>>>>>
>>>>>> This is an RFC. Looking forward for any feedbacks or other alternate
>>>>>> designs for plumbing simple copy to IO stack.
>>>>>>
>>>>>> Simple copy command is a copy offloading operation and is used to copy
>>>>>> multiple contiguous ranges (source_ranges) of LBA's to a single destination
>>>>>> LBA within the device reducing traffic between host and device.
>>>>>>
>>>>>> This implementation accepts destination, no of sources and arrays of
>>>>>> source ranges from application and attach it as payload to the bio and
>>>>>> submits to the device.
>>>>>>
>>>>>> Following limits are added to queue limits and are exposed in sysfs
>>>>>> to userspace
>>>>>> - *max_copy_sectors* limits the sum of all source_range length
>>>>>> - *max_copy_nr_ranges* limits the number of source ranges
>>>>>> - *max_copy_range_sectors* limit the maximum number of sectors
>>>>>> that can constitute a single source range.
>>>>>
>>>>> Same comment as before. I think this is a good start, but for this to be really
>>>>> useful to users and kernel components alike, this really needs copy emulation
>>>>> for drives that do not have a native copy feature, similarly to what write zeros
>>>>> handling for instance: if the drive does not have a copy command (simple copy
>>>>> for NVMe or XCOPY for scsi), then the block layer should issue read/write
>>>>> commands to seamlessly execute the copy. Otherwise, this will only serve a small
>>>>> niche for users and will not be optimal for FS and DM drivers that could be
>>>>> simplified with a generic block layer copy functionality.
>>>>>
>>>>> This is my 10 cents though, others may differ about this.
>>>>
>>>> Yes, I agree that copy emulation support should be included with the
>>>> hardware enabled solution.
>>>
>>> Keith, Damien,
>>>
>>> Can we do the block layer emulation with this patchset and then work in
>>> follow-up patchses on (i) the FS interface with F2FS as a first user and
>>> (ii) other HW accelerations such as XCOPY?
>>
>> The initial patchset supporting NVMe simple copy and emulation copy, all under
>> an API that probably will be similar that of dm-kcopyd will cover all block
>> devices. Other hardware native support for copy functions such as scsi extended
>> copy can be added later under the hood without any API changes (or minimal changes).
>
> Sounds good. That we can do. We will add a new patch for this.
>
>>
>> I am not sure what you mean by "FS interface for F2FS": the block layer API for
>> this copy functionality will be what F2FS (and other FSes) will call. That is
>> the interface, no ?
>
> Essentially yes.. I mean adding the F2FS logic and potentially some
> helpers to the block layer to aid GC.
GC is very much special to each FS. SO I do not think adding helpers to the
block layer will have value. We should stick to a pure block copy API for that
layer.
>
>>
>>> For XCOPY, I believe we need to have a separate discussion as much works
>>> is already done that we should align to.
>>
>> I think Martin (added to this thread) and others have looked into it but I do
>> not think that anything made it into the kernel yet.
>
> Exactly. Looking at some of the code posted through time and recalling
> the discussions at LSF/MM, seems like there are a number of things we
> are not addressing here that could be incorporated down the road, such
> as dedicated syscalls / extensions, multi namespace / device support,
> etc.
dm-kcopyd interface supports copy between multiple devices. That of course would
not enable NVMe simple copy use, but that makes the interface generic enough so
that we should not have any problem with other hardware copy functions.
>>
>
--
Damien Le Moal
Western Digital Research
So, I'm really worried about:
a) a good use case. GC in f2fs or btrfs seem like good use cases, as
does accelating dm-kcopyd. I agree with Damien that lifting dm-kcopyd
to common code would also be really nice. I'm not 100% sure it should
be a requirement, but it sure would be nice to have
I don't think just adding an ioctl is enough of a use case for complex
kernel infrastructure.
b) We had a bunch of different attempts at SCSI XCOPY support form IIRC
Martin, Bart and Mikulas. I think we need to pull them into this
discussion, and make sure whatever we do covers the SCSI needs.
On Fri, Dec 04, 2020 at 03:16:57PM +0530, SelvaKumar S wrote:
> This patchset tries to add support for TP4065a ("Simple Copy Command"),
> v2020.05.04 ("Ratified")
>
> The Specification can be found in following link.
> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>
> This is an RFC. Looking forward for any feedbacks or other alternate
> designs for plumbing simple copy to IO stack.
>
> Simple copy command is a copy offloading operation and is used to copy
> multiple contiguous ranges (source_ranges) of LBA's to a single destination
> LBA within the device reducing traffic between host and device.
>
> This implementation accepts destination, no of sources and arrays of
> source ranges from application and attach it as payload to the bio and
> submits to the device.
>
> Following limits are added to queue limits and are exposed in sysfs
> to userspace
> - *max_copy_sectors* limits the sum of all source_range length
> - *max_copy_nr_ranges* limits the number of source ranges
> - *max_copy_range_sectors* limit the maximum number of sectors
> that can constitute a single source range.
>
> Changes from v1:
>
> 1. Fix memory leak in __blkdev_issue_copy
> 2. Unmark blk_check_copy inline
> 3. Fix line break in blk_check_copy_eod
> 4. Remove p checks and made code more readable
> 5. Don't use bio_set_op_attrs and remove op and set
> bi_opf directly
> 6. Use struct_size to calculate total_size
> 7. Fix partition remap of copy destination
> 8. Remove mcl,mssrl,msrc from nvme_ns
> 9. Initialize copy queue limits to 0 in nvme_config_copy
> 10. Remove return in QUEUE_FLAG_COPY check
> 11. Remove unused OCFS
>
> SelvaKumar S (2):
> block: add simple copy support
> nvme: add simple copy support
>
> block/blk-core.c | 94 ++++++++++++++++++++++++++---
> block/blk-lib.c | 123 ++++++++++++++++++++++++++++++++++++++
> block/blk-merge.c | 2 +
> block/blk-settings.c | 11 ++++
> block/blk-sysfs.c | 23 +++++++
> block/blk-zoned.c | 1 +
> block/bounce.c | 1 +
> block/ioctl.c | 43 +++++++++++++
> drivers/nvme/host/core.c | 87 +++++++++++++++++++++++++++
> include/linux/bio.h | 1 +
> include/linux/blk_types.h | 15 +++++
> include/linux/blkdev.h | 15 +++++
> include/linux/nvme.h | 43 ++++++++++++-
> include/uapi/linux/fs.h | 13 ++++
> 14 files changed, 461 insertions(+), 11 deletions(-)
>
> --
> 2.25.1
---end quoted text---
On 12/7/20 3:11 PM, Christoph Hellwig wrote:
> So, I'm really worried about:
>
> a) a good use case. GC in f2fs or btrfs seem like good use cases, as
> does accelating dm-kcopyd. I agree with Damien that lifting dm-kcopyd
> to common code would also be really nice. I'm not 100% sure it should
> be a requirement, but it sure would be nice to have
> I don't think just adding an ioctl is enough of a use case for complex
> kernel infrastructure.
> b) We had a bunch of different attempts at SCSI XCOPY support form IIRC
> Martin, Bart and Mikulas. I think we need to pull them into this
> discussion, and make sure whatever we do covers the SCSI needs.
>
And we shouldn't forget that the main issue which killed all previous
implementations was a missing QoS guarantee.
It's nice to have simply copy, but if the implementation is _slower_
than doing it by hand from the OS there is very little point in even
attempting to do so.
I can't see any provisions for that in the TPAR, leading me to the
assumption that NVMe simple copy will suffer from the same issue.
So if we can't address this I guess this attempt will fail, too.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
On 07.12.2020 15:11, Christoph Hellwig wrote:
>So, I'm really worried about:
>
> a) a good use case. GC in f2fs or btrfs seem like good use cases, as
> does accelating dm-kcopyd. I agree with Damien that lifting dm-kcopyd
> to common code would also be really nice. I'm not 100% sure it should
> be a requirement, but it sure would be nice to have
> I don't think just adding an ioctl is enough of a use case for complex
> kernel infrastructure.
We are looking at dm-kcopyd. I would have liked to start with a very
specific use case and build from there, but I see Damien's and Keith's
point of having a default path. I believe we can add this to the next
version.
> b) We had a bunch of different attempts at SCSI XCOPY support form IIRC
> Martin, Bart and Mikulas. I think we need to pull them into this
> discussion, and make sure whatever we do covers the SCSI needs.
Agree. We discussed a lot about the scope and agreed that everything
outside of the specifics of Simple Copy requires the input from the ones
that have worked on XCOPY support in the past.
>
>On Fri, Dec 04, 2020 at 03:16:57PM +0530, SelvaKumar S wrote:
>> This patchset tries to add support for TP4065a ("Simple Copy Command"),
>> v2020.05.04 ("Ratified")
>>
>> The Specification can be found in following link.
>> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>>
>> This is an RFC. Looking forward for any feedbacks or other alternate
>> designs for plumbing simple copy to IO stack.
>>
>> Simple copy command is a copy offloading operation and is used to copy
>> multiple contiguous ranges (source_ranges) of LBA's to a single destination
>> LBA within the device reducing traffic between host and device.
>>
>> This implementation accepts destination, no of sources and arrays of
>> source ranges from application and attach it as payload to the bio and
>> submits to the device.
>>
>> Following limits are added to queue limits and are exposed in sysfs
>> to userspace
>> - *max_copy_sectors* limits the sum of all source_range length
>> - *max_copy_nr_ranges* limits the number of source ranges
>> - *max_copy_range_sectors* limit the maximum number of sectors
>> that can constitute a single source range.
>>
>> Changes from v1:
>>
>> 1. Fix memory leak in __blkdev_issue_copy
>> 2. Unmark blk_check_copy inline
>> 3. Fix line break in blk_check_copy_eod
>> 4. Remove p checks and made code more readable
>> 5. Don't use bio_set_op_attrs and remove op and set
>> bi_opf directly
>> 6. Use struct_size to calculate total_size
>> 7. Fix partition remap of copy destination
>> 8. Remove mcl,mssrl,msrc from nvme_ns
>> 9. Initialize copy queue limits to 0 in nvme_config_copy
>> 10. Remove return in QUEUE_FLAG_COPY check
>> 11. Remove unused OCFS
>>
>> SelvaKumar S (2):
>> block: add simple copy support
>> nvme: add simple copy support
>>
>> block/blk-core.c | 94 ++++++++++++++++++++++++++---
>> block/blk-lib.c | 123 ++++++++++++++++++++++++++++++++++++++
>> block/blk-merge.c | 2 +
>> block/blk-settings.c | 11 ++++
>> block/blk-sysfs.c | 23 +++++++
>> block/blk-zoned.c | 1 +
>> block/bounce.c | 1 +
>> block/ioctl.c | 43 +++++++++++++
>> drivers/nvme/host/core.c | 87 +++++++++++++++++++++++++++
>> include/linux/bio.h | 1 +
>> include/linux/blk_types.h | 15 +++++
>> include/linux/blkdev.h | 15 +++++
>> include/linux/nvme.h | 43 ++++++++++++-
>> include/uapi/linux/fs.h | 13 ++++
>> 14 files changed, 461 insertions(+), 11 deletions(-)
>>
>> --
>> 2.25.1
>---end quoted text---
On 07.12.2020 15:56, Hannes Reinecke wrote:
>On 12/7/20 3:11 PM, Christoph Hellwig wrote:
>>So, I'm really worried about:
>>
>> a) a good use case. GC in f2fs or btrfs seem like good use cases, as
>> does accelating dm-kcopyd. I agree with Damien that lifting dm-kcopyd
>> to common code would also be really nice. I'm not 100% sure it should
>> be a requirement, but it sure would be nice to have
>> I don't think just adding an ioctl is enough of a use case for complex
>> kernel infrastructure.
>> b) We had a bunch of different attempts at SCSI XCOPY support form IIRC
>> Martin, Bart and Mikulas. I think we need to pull them into this
>> discussion, and make sure whatever we do covers the SCSI needs.
>>
>And we shouldn't forget that the main issue which killed all previous
>implementations was a missing QoS guarantee.
>It's nice to have simply copy, but if the implementation is _slower_
>than doing it by hand from the OS there is very little point in even
>attempting to do so.
>I can't see any provisions for that in the TPAR, leading me to the
>assumption that NVMe simple copy will suffer from the same issue.
>
>So if we can't address this I guess this attempt will fail, too.
Good point. We can share some performance data on how Simple Copy scales
in terms of bw / latency and the CPU usage. Do you have anything else in
mind?
On 2020-12-07 9:56 a.m., Hannes Reinecke wrote:
> On 12/7/20 3:11 PM, Christoph Hellwig wrote:
>> So, I'm really worried about:
>>
>> a) a good use case. GC in f2fs or btrfs seem like good use cases, as
>> does accelating dm-kcopyd. I agree with Damien that lifting dm-kcopyd
>> to common code would also be really nice. I'm not 100% sure it should
>> be a requirement, but it sure would be nice to have
>> I don't think just adding an ioctl is enough of a use case for complex
>> kernel infrastructure.
>> b) We had a bunch of different attempts at SCSI XCOPY support form IIRC
>> Martin, Bart and Mikulas. I think we need to pull them into this
>> discussion, and make sure whatever we do covers the SCSI needs.
>>
> And we shouldn't forget that the main issue which killed all previous
> implementations was a missing QoS guarantee.
> It's nice to have simply copy, but if the implementation is _slower_ than doing
> it by hand from the OS there is very little point in even attempting to do so.
> I can't see any provisions for that in the TPAR, leading me to the assumption
> that NVMe simple copy will suffer from the same issue.
>
> So if we can't address this I guess this attempt will fail, too.
I have been doing quite a lot of work and testing in my sg driver rewrite
in the copy and compare area. The baselines for performance are dd and
io_uring-cp (in liburing). There are lots of ways to improve on them. Here
are some:
- the user data need never pass through the user space (could
mmap it out during the READ if there is a good reason). Only the
metadata (e.g. NVMe or SCSI commands) needs to come from the user
space and errors, if any, reported back to the user space.
- break a large copy (or compare) into segments, with each segment
a "comfortable" size for the OS to handle, say 256 KB
- there is one constraint: the READ in each segment must complete
before its paired WRITE can commence
- extra constraint for some zoned disks: WRITEs must be
issued in order (assuming they are applied in that order, if
not, need to wait until each WRITE completes)
- arrange for READ WRITE pair in each segment to share the same bio
- have multiple slots each holding a segment (i.e. a bio and
metadata to process a READ-WRITE pair)
- re-use each slot's bio for the following READ-WRITE pair
- issue the READs in each slot asynchronously and do an interleaved
(io)poll for completion. Then issue the paired WRITE
asynchronously
- the above "slot" algorithm runs in one thread, so there can be
multiple threads doing the same algorithm. Segment manager needs
to be locked (or use an atomics) so that each segment (identified
by its starting LBAs) is issued once and only once when the
next thread wants a segment to copy
Running multiple threads gives diminishing or even worsening returns.
Runtime metrics on lock contention and storage bus capacity may help
choosing the number of threads. A simpler approach might be add more
threads until the combined throughput increase is less than 10% say.
The 'compare' that I mention is based on the SCSI VERIFY(BYTCHK=1) command
(or NVMe NVM Compare command). Using dd logic, a disk to disk compare can
be implemented with not much more work than changing the WRITE to a VERIFY
command. This is a different approach to the Linux cmp utility which
READs in both sides and does a memcmp() type operation. Using ramdisks
(from the scsi_debug driver) the compare operation (max ~ 10 GB/s) was
actually faster than the copy (max ~ 7 GB/s). I put this down to WRITE
operations taking a write lock over the store while the VERIFY only
needs a read lock so many VERIFY operations can co-exist on the same
store. Unfortunately on real SAS and NVMe SSDs that I tested the
performance of the VERIFY and NVM Compare commands is underwhelming.
For comparison, using scsi_debug ramdisks, dd copy throughput was
< 1 GB/s and io_uring-cp was around 2-3 GB/s. The system was Ryzen
3600 based.
Doug Gilbert
On 12/7/20 11:12 PM, Douglas Gilbert wrote:
> On 2020-12-07 9:56 a.m., Hannes Reinecke wrote:
>> On 12/7/20 3:11 PM, Christoph Hellwig wrote:
>>> So, I'm really worried about:
>>>
>>> a) a good use case. GC in f2fs or btrfs seem like good use cases, as
>>> does accelating dm-kcopyd. I agree with Damien that lifting
>>> dm-kcopyd
>>> to common code would also be really nice. I'm not 100% sure it
>>> should
>>> be a requirement, but it sure would be nice to have
>>> I don't think just adding an ioctl is enough of a use case for
>>> complex
>>> kernel infrastructure.
>>> b) We had a bunch of different attempts at SCSI XCOPY support form
>>> IIRC
>>> Martin, Bart and Mikulas. I think we need to pull them into this
>>> discussion, and make sure whatever we do covers the SCSI needs.
>>>
>> And we shouldn't forget that the main issue which killed all previous
>> implementations was a missing QoS guarantee.
>> It's nice to have simply copy, but if the implementation is _slower_
>> than doing it by hand from the OS there is very little point in even
>> attempting to do so.
>> I can't see any provisions for that in the TPAR, leading me to the
>> assumption that NVMe simple copy will suffer from the same issue.
>>
>> So if we can't address this I guess this attempt will fail, too.
>
> I have been doing quite a lot of work and testing in my sg driver rewrite
> in the copy and compare area. The baselines for performance are dd and
> io_uring-cp (in liburing). There are lots of ways to improve on them. Here
> are some:
> - the user data need never pass through the user space (could
> mmap it out during the READ if there is a good reason). Only the
> metadata (e.g. NVMe or SCSI commands) needs to come from the user
> space and errors, if any, reported back to the user space.
> - break a large copy (or compare) into segments, with each segment
> a "comfortable" size for the OS to handle, say 256 KB
> - there is one constraint: the READ in each segment must complete
> before its paired WRITE can commence
> - extra constraint for some zoned disks: WRITEs must be
> issued in order (assuming they are applied in that order, if
> not, need to wait until each WRITE completes)
> - arrange for READ WRITE pair in each segment to share the same bio
> - have multiple slots each holding a segment (i.e. a bio and
> metadata to process a READ-WRITE pair)
> - re-use each slot's bio for the following READ-WRITE pair
> - issue the READs in each slot asynchronously and do an interleaved
> (io)poll for completion. Then issue the paired WRITE
> asynchronously
> - the above "slot" algorithm runs in one thread, so there can be
> multiple threads doing the same algorithm. Segment manager needs
> to be locked (or use an atomics) so that each segment (identified
> by its starting LBAs) is issued once and only once when the
> next thread wants a segment to copy
>
> Running multiple threads gives diminishing or even worsening returns.
> Runtime metrics on lock contention and storage bus capacity may help
> choosing the number of threads. A simpler approach might be add more
> threads until the combined throughput increase is less than 10% say.
>
>
> The 'compare' that I mention is based on the SCSI VERIFY(BYTCHK=1) command
> (or NVMe NVM Compare command). Using dd logic, a disk to disk compare can
> be implemented with not much more work than changing the WRITE to a VERIFY
> command. This is a different approach to the Linux cmp utility which
> READs in both sides and does a memcmp() type operation. Using ramdisks
> (from the scsi_debug driver) the compare operation (max ~ 10 GB/s) was
> actually faster than the copy (max ~ 7 GB/s). I put this down to WRITE
> operations taking a write lock over the store while the VERIFY only
> needs a read lock so many VERIFY operations can co-exist on the same
> store. Unfortunately on real SAS and NVMe SSDs that I tested the
> performance of the VERIFY and NVM Compare commands is underwhelming.
> For comparison, using scsi_debug ramdisks, dd copy throughput was
> < 1 GB/s and io_uring-cp was around 2-3 GB/s. The system was Ryzen
> 3600 based.
>
Which is precisely my concern.
Simple copy might be efficient for one particular implementation, but it
might be completely off the board for others.
But both will be claiming to support it, and us having no idea whether
choosing simple copy will speed up matters or not.
Without having a programmatic way to figure out the speed of the
implementation we have to detect the performance ourselves, like the
benchmarking loop RAID5 does.
I was hoping to avoid that, and just ask the device/controller; but that
turned out to be in vain.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
On 07/12/2020 20:27, Javier Gonz?lez wrote:
> Good point. We can share some performance data on how Simple Copy scales
> in terms of bw / latency and the CPU usage. Do you have anything else in
> mind?
>
With an emulation in the kernel, we could make the usd "backend"
implementation configurable. So if the emulation is faster, users can select
the emulation, if the device is faster then the device.
Kind of what the crypto and raid code do as well.
I'm really interested in this work, as BTRFS relocation/balance will have
potential benefits, but we need to get it right.
On 08.12.2020 07:44, Hannes Reinecke wrote:
>On 12/7/20 11:12 PM, Douglas Gilbert wrote:
>>On 2020-12-07 9:56 a.m., Hannes Reinecke wrote:
>>>On 12/7/20 3:11 PM, Christoph Hellwig wrote:
>>>>So, I'm really worried about:
>>>>
>>>> a) a good use case. GC in f2fs or btrfs seem like good use cases, as
>>>> does accelating dm-kcopyd. I agree with Damien that
>>>>lifting dm-kcopyd
>>>> to common code would also be really nice. I'm not 100%
>>>>sure it should
>>>> be a requirement, but it sure would be nice to have
>>>> I don't think just adding an ioctl is enough of a use case
>>>>for complex
>>>> kernel infrastructure.
>>>> b) We had a bunch of different attempts at SCSI XCOPY support
>>>>form IIRC
>>>> Martin, Bart and Mikulas. I think we need to pull them into this
>>>> discussion, and make sure whatever we do covers the SCSI needs.
>>>>
>>>And we shouldn't forget that the main issue which killed all
>>>previous implementations was a missing QoS guarantee.
>>>It's nice to have simply copy, but if the implementation is
>>>_slower_ than doing it by hand from the OS there is very little
>>>point in even attempting to do so.
>>>I can't see any provisions for that in the TPAR, leading me to the
>>>assumption that NVMe simple copy will suffer from the same issue.
>>>
>>>So if we can't address this I guess this attempt will fail, too.
>>
>>I have been doing quite a lot of work and testing in my sg driver rewrite
>>in the copy and compare area. The baselines for performance are dd and
>>io_uring-cp (in liburing). There are lots of ways to improve on them. Here
>>are some:
>> - the user data need never pass through the user space (could
>> mmap it out during the READ if there is a good reason). Only the
>> metadata (e.g. NVMe or SCSI commands) needs to come from the user
>> space and errors, if any, reported back to the user space.
>> - break a large copy (or compare) into segments, with each segment
>> a "comfortable" size for the OS to handle, say 256 KB
>> - there is one constraint: the READ in each segment must complete
>> before its paired WRITE can commence
>> - extra constraint for some zoned disks: WRITEs must be
>> issued in order (assuming they are applied in that order, if
>> not, need to wait until each WRITE completes)
>> - arrange for READ WRITE pair in each segment to share the same bio
>> - have multiple slots each holding a segment (i.e. a bio and
>> metadata to process a READ-WRITE pair)
>> - re-use each slot's bio for the following READ-WRITE pair
>> - issue the READs in each slot asynchronously and do an interleaved
>> (io)poll for completion. Then issue the paired WRITE
>> asynchronously
>> - the above "slot" algorithm runs in one thread, so there can be
>> multiple threads doing the same algorithm. Segment manager needs
>> to be locked (or use an atomics) so that each segment (identified
>> by its starting LBAs) is issued once and only once when the
>> next thread wants a segment to copy
>>
>>Running multiple threads gives diminishing or even worsening returns.
>>Runtime metrics on lock contention and storage bus capacity may help
>>choosing the number of threads. A simpler approach might be add more
>>threads until the combined throughput increase is less than 10% say.
>>
>>
>>The 'compare' that I mention is based on the SCSI VERIFY(BYTCHK=1) command
>>(or NVMe NVM Compare command). Using dd logic, a disk to disk compare can
>>be implemented with not much more work than changing the WRITE to a VERIFY
>>command. This is a different approach to the Linux cmp utility which
>>READs in both sides and does a memcmp() type operation. Using ramdisks
>>(from the scsi_debug driver) the compare operation (max ~ 10 GB/s) was
>>actually faster than the copy (max ~ 7 GB/s). I put this down to WRITE
>>operations taking a write lock over the store while the VERIFY only
>>needs a read lock so many VERIFY operations can co-exist on the same
>>store. Unfortunately on real SAS and NVMe SSDs that I tested the
>>performance of the VERIFY and NVM Compare commands is underwhelming.
>>For comparison, using scsi_debug ramdisks, dd copy throughput was
>>< 1 GB/s and io_uring-cp was around 2-3 GB/s. The system was Ryzen
>>3600 based.
>>
>Which is precisely my concern.
>Simple copy might be efficient for one particular implementation, but
>it might be completely off the board for others.
>But both will be claiming to support it, and us having no idea whether
>choosing simple copy will speed up matters or not.
>Without having a programmatic way to figure out the speed of the
>implementation we have to detect the performance ourselves, like the
>benchmarking loop RAID5 does.
>I was hoping to avoid that, and just ask the device/controller; but
>that turned out to be in vain.
I believe it makes sense to do extensive characterization to understand
how the host and device implementation behave. However, I do not believe
we will get far if the requirement is that any acceleration has to
outperform the legacy path under all circumstances and implementations.
At this moment in time, this is a feature very much targeted to
eliminating the extra read/write traffic generated by ZNS host GC.
This said, we do see the value in aligning with existing efforts to
offload copy under other use cases, so if you have a set of tests we can
run to speak the same language, we would be happy to take them and adapt
them to the fio extensions we have posted for testing this too.
On 08.12.2020 08:40, Johannes Thumshirn wrote:
>On 07/12/2020 20:27, Javier González wrote:
>> Good point. We can share some performance data on how Simple Copy scales
>> in terms of bw / latency and the CPU usage. Do you have anything else in
>> mind?
>>
>
>With an emulation in the kernel, we could make the usd "backend"
>implementation configurable. So if the emulation is faster, users can select
>the emulation, if the device is faster then the device.
>
>Kind of what the crypto and raid code do as well.
Good idea. Are you thinking of a sysfs entry to select the backend?
>
>I'm really interested in this work, as BTRFS relocation/balance will have
>potential benefits, but we need to get it right.
Agree. We will post a V3 with emulation and addressing other comments.
We can take it from there. If you have comments on V2, please send them
our way and we will take them in.
On 08/12/2020 13:22, Javier Gonz?lez wrote:
> Good idea. Are you thinking of a sysfs entry to select the backend?
Not sure on this one, initially I thought of a sysfs file, but then
how would you do it. One "global" sysfs entry is probably a bad idea.
Having one per block device to select native vs emulation maybe? And
a good way to benchmark.
The other idea would be a benchmark loop on boot like the raid library
does.
Then on the other hand, there might be workloads that run faster with
the emulation and some that run faster with the hardware acceleration.
I think these points are the reason the last attempts got stuck.
On 08.12.2020 12:37, Johannes Thumshirn wrote:
>On 08/12/2020 13:22, Javier González wrote:
>> Good idea. Are you thinking of a sysfs entry to select the backend?
>
>Not sure on this one, initially I thought of a sysfs file, but then
>how would you do it. One "global" sysfs entry is probably a bad idea.
>Having one per block device to select native vs emulation maybe? And
>a good way to benchmark.
I was thinking a per block device to target the use case where a certain
implementation / workload is better one way or the other.
>
>The other idea would be a benchmark loop on boot like the raid library
>does.
>
>Then on the other hand, there might be workloads that run faster with
>the emulation and some that run faster with the hardware acceleration.
>
>I think these points are the reason the last attempts got stuck.
Yes. I believe that any benchmark we run would be biased in a certain
way. If we can move forward with a sysfs entry and default to legacy
path, we would not alter current behavior and enable NVMe copy offload
(for now) for those that want to use it. We can then build on top of it.
Does this sound like a reasonable approach?
On 08/12/2020 14:13, Javier Gonz?lez wrote:
> On 08.12.2020 12:37, Johannes Thumshirn wrote:
>> On 08/12/2020 13:22, Javier Gonz?lez wrote:
>>> Good idea. Are you thinking of a sysfs entry to select the backend?
>>
>> Not sure on this one, initially I thought of a sysfs file, but then
>> how would you do it. One "global" sysfs entry is probably a bad idea.
>> Having one per block device to select native vs emulation maybe? And
>> a good way to benchmark.
>
> I was thinking a per block device to target the use case where a certain
> implementation / workload is better one way or the other.
Yes something along those lines.
>>
>> The other idea would be a benchmark loop on boot like the raid library
>> does.
>>
>> Then on the other hand, there might be workloads that run faster with
>> the emulation and some that run faster with the hardware acceleration.
>>
>> I think these points are the reason the last attempts got stuck.
>
> Yes. I believe that any benchmark we run would be biased in a certain
> way. If we can move forward with a sysfs entry and default to legacy
> path, we would not alter current behavior and enable NVMe copy offload
> (for now) for those that want to use it. We can then build on top of it.
>
> Does this sound like a reasonable approach?
>
Yes this sounds like a reasonable approach to me.
Hannes,
[Sorry I'm late to the discussion here, had a few fires going in
addition to the end of the kernel cycle]
> And we shouldn't forget that the main issue which killed all previous
> implementations was a missing QoS guarantee.
That and the fact that our arbitrary stacking situation was hard to
resolve.
The QoS guarantee was somewhat addressed by Fred in T10. But I agree we
need some sort of toggle.
--
Martin K. Petersen Oracle Linux Engineering
SelvaKumar,
> Add new BLKCOPY ioctl that offloads copying of multiple sources
> to a destination to the device.
Your patches are limited in scope to what is currently possible with
NVMe. I.e. multiple source ranges to a single destination within the
same device. That's fine, I think the garbage collection use case is
valid and worth pursuing.
I just wanted to go over what the pain points were for the various
attempts in SCSI over the years.
The main headache was due the stacking situation with DM and MD.
Restricting offload to raw SCSI disks would have been simple but not
really a good fit for most real world developments that often use DM or
MD to provision the storage.
Things are simple for DM/MD with reads and writes because you have one
bio as parent that may get split into many clones that complete
individually prior to the parent being marked as completed.
In the copy offload scenario things quickly become complex once both
source and destination ranges have to be split into multiple commands
for potentially multiple devices. And these clones then need to be
correctly paired at the bottom of the stack. There's also no guarantee
that a 1MB source range maps to a single 1MB destination range. So you
could end up with an M:N relationship to resolve.
After a few failed attempts we focused on single source range/single
destination range. Just to simplify the slicing and dicing. That worked
reasonably well. However, then came along the token-based commands in
SCSI and those threw a wrench in the gears. Now the block layer plumbing
had to support two completely different semantic approaches.
Inspired by a combination of Mikulas' efforts with pointer matching and
the token-based approach in SCSI I switched the block layer
implementation from a single operation (REQ_COPY) to something similar
to the SCSI token approach with a REQ_COPY_IN and a REQ_COPY_OUT.
The premise being that you would send a command to the source device and
"get" the data. In the EXTENDED COPY scenario, the data wasn't really
anything but a confirmation from the SCSI disk driver that the I/O had
reached the bottom of the stack without being split by DM/MD. And once
completion of the REQ_COPY_IN reached blk-lib, a REQ_COPY_OUT would be
issued and, if that arrived unchanged in the disk driver, get turned
into an EXTENDED COPY sent to the destination.
In the token-based scenario the same thing happened except POPULATE
TOKEN was sent all the way out to the device to receive a cookie
representing the source block ranges. Upon completion, that cookie was
used by blk-lib to issue a REQ_COPY_OUT command which was then sent to
the destination device. Again only if the REQ_COPY_OUT I/O hadn't been
split traversing the stack.
The idea was to subsequently leverage the separation of REQ_COPY_IN and
REQ_COPY_OUT to permit a DM/MD iterative approach to both stages of the
operation. That seemed to me like the only reasonable way to approach
the M:N splitting problem (if at all)...
--
Martin K. Petersen Oracle Linux Engineering
On 08.12.2020 13:24, Johannes Thumshirn wrote:
>On 08/12/2020 14:13, Javier González wrote:
>> On 08.12.2020 12:37, Johannes Thumshirn wrote:
>>> On 08/12/2020 13:22, Javier González wrote:
>>>> Good idea. Are you thinking of a sysfs entry to select the backend?
>>>
>>> Not sure on this one, initially I thought of a sysfs file, but then
>>> how would you do it. One "global" sysfs entry is probably a bad idea.
>>> Having one per block device to select native vs emulation maybe? And
>>> a good way to benchmark.
>>
>> I was thinking a per block device to target the use case where a certain
>> implementation / workload is better one way or the other.
>
>Yes something along those lines.
>
>>>
>>> The other idea would be a benchmark loop on boot like the raid library
>>> does.
>>>
>>> Then on the other hand, there might be workloads that run faster with
>>> the emulation and some that run faster with the hardware acceleration.
>>>
>>> I think these points are the reason the last attempts got stuck.
>>
>> Yes. I believe that any benchmark we run would be biased in a certain
>> way. If we can move forward with a sysfs entry and default to legacy
>> path, we would not alter current behavior and enable NVMe copy offload
>> (for now) for those that want to use it. We can then build on top of it.
>>
>> Does this sound like a reasonable approach?
>>
>
>Yes this sounds like a reasonable approach to me.
Cool. We will add this to the V3 then.
Thanks Johannes!
Hi!
> > Good idea. Are you thinking of a sysfs entry to select the backend?
>
> Not sure on this one, initially I thought of a sysfs file, but then
> how would you do it. One "global" sysfs entry is probably a bad idea.
> Having one per block device to select native vs emulation maybe? And
> a good way to benchmark.
>
> The other idea would be a benchmark loop on boot like the raid library
> does.
Doing copy benchmarking would require writes on the media, right?
Kernel should not do such stuff without userspace requesting it...
Best regards,
Pavel
--
http://www.livejournal.com/~pavelmachek