Hi,
This patch series covers iteration 6 of adding support for block
provisioning requests.
Changes from v5:
- Remove explicit supports_provision from dm devices.
- Move provision sectors io hint to pool_io_hint. Other devices
will derive the provisioning limits from the stack.
- Remove artifact from v4 to omit cell_defer_no_holder for
REQ_OP_PROVISION.
- Fix blkdev_fallocate() called with invalid fallocate
modes to propagate errors correctly.
Sarthak Kukreti (5):
block: Don't invalidate pagecache for invalid falloc modes
block: Introduce provisioning primitives
dm: Add block provisioning support
dm-thin: Add REQ_OP_PROVISION support
loop: Add support for provision requests
block/blk-core.c | 5 +++
block/blk-lib.c | 53 ++++++++++++++++++++++++++
block/blk-merge.c | 18 +++++++++
block/blk-settings.c | 19 ++++++++++
block/blk-sysfs.c | 9 +++++
block/bounce.c | 1 +
block/fops.c | 31 +++++++++++++---
drivers/block/loop.c | 42 +++++++++++++++++++++
drivers/md/dm-crypt.c | 4 +-
drivers/md/dm-linear.c | 1 +
drivers/md/dm-snap.c | 7 ++++
drivers/md/dm-table.c | 23 ++++++++++++
drivers/md/dm-thin.c | 70 +++++++++++++++++++++++++++++++++--
drivers/md/dm.c | 6 +++
include/linux/bio.h | 6 ++-
include/linux/blk_types.h | 5 ++-
include/linux/blkdev.h | 16 ++++++++
include/linux/device-mapper.h | 17 +++++++++
18 files changed, 319 insertions(+), 14 deletions(-)
--
2.40.1.521.gf1e218fcd8-goog
Only call truncate_bdev_range() if the fallocate mode is
supported. This fixes a bug where data in the pagecache
could be invalidated if fallocate() was called on the
block device with an invalid mode.
Fixes: 25f4c41415e5 ("block: implement (some of) fallocate for block devices")
Cc: [email protected]
Reported-by: Darrick J. Wong <[email protected]>
Signed-off-by: Sarthak Kukreti <[email protected]>
---
block/fops.c | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index d2e6be4e3d1c..4c70fdc546e7 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -648,24 +648,35 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
filemap_invalidate_lock(inode->i_mapping);
- /* Invalidate the page cache, including dirty pages. */
- error = truncate_bdev_range(bdev, file->f_mode, start, end);
- if (error)
- goto fail;
-
+ /*
+ * Invalidate the page cache, including dirty pages, for valid
+ * de-allocate mode calls to fallocate().
+ */
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ goto fail;
+
error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOUNMAP);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ goto fail;
+
error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOFALLBACK);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ goto fail;
+
error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
len >> SECTOR_SHIFT, GFP_KERNEL);
break;
--
2.40.1.521.gf1e218fcd8-goog
Introduce block request REQ_OP_PROVISION. The intent of this request
is to request underlying storage to preallocate disk space for the given
block range. Block devices that support this capability will export
a provision limit within their request queues.
This patch also adds the capability to call fallocate() in mode 0
on block devices, which will send REQ_OP_PROVISION to the block
device for the specified range.
Signed-off-by: Sarthak Kukreti <[email protected]>
---
block/blk-core.c | 5 ++++
block/blk-lib.c | 53 +++++++++++++++++++++++++++++++++++++++
block/blk-merge.c | 18 +++++++++++++
block/blk-settings.c | 19 ++++++++++++++
block/blk-sysfs.c | 9 +++++++
block/bounce.c | 1 +
block/fops.c | 10 +++++++-
include/linux/bio.h | 6 +++--
include/linux/blk_types.h | 5 +++-
include/linux/blkdev.h | 16 ++++++++++++
10 files changed, 138 insertions(+), 4 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 42926e6cb83c..4a2342ba3a8b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -123,6 +123,7 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
+ REQ_OP_NAME(PROVISION)
};
#undef REQ_OP_NAME
@@ -798,6 +799,10 @@ void submit_bio_noacct(struct bio *bio)
if (!q->limits.max_write_zeroes_sectors)
goto not_supported;
break;
+ case REQ_OP_PROVISION:
+ if (!q->limits.max_provision_sectors)
+ goto not_supported;
+ break;
default:
break;
}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index e59c3069e835..647b6451660b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -343,3 +343,56 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
return ret;
}
EXPORT_SYMBOL(blkdev_issue_secure_erase);
+
+/**
+ * blkdev_issue_provision - provision a block range
+ * @bdev: blockdev to provision
+ * @sector: start sector
+ * @nr_sects: number of sectors to provision
+ * @gfp: memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ * Issues a provision request to the block device for the range of sectors.
+ * For thinly provisioned block devices, this acts as a signal for the
+ * underlying storage pool to allocate space for this block range.
+ */
+int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp)
+{
+ sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+ unsigned int max_sectors = bdev_max_provision_sectors(bdev);
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int ret = 0;
+
+ if (max_sectors == 0)
+ return -EOPNOTSUPP;
+ if ((sector | nr_sects) & bs_mask)
+ return -EINVAL;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
+ blk_start_plug(&plug);
+ for (;;) {
+ unsigned int req_sects = min_t(sector_t, nr_sects, max_sectors);
+
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_PROVISION, gfp);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_size = req_sects << SECTOR_SHIFT;
+
+ sector += req_sects;
+ nr_sects -= req_sects;
+ if (!nr_sects) {
+ ret = submit_bio_wait(bio);
+ if (ret == -EOPNOTSUPP)
+ ret = 0; /* -EOPNOTSUPP from submit_bio_wait() is reported as success */
+ bio_put(bio);
+ break;
+ }
+ cond_resched();
+ }
+ blk_finish_plug(&plug);
+
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_provision);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6460abdb2426..a3ffebb97a1d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -158,6 +158,21 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
+static struct bio *bio_split_provision(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nsegs, struct bio_set *bs)
+{
+ *nsegs = 0;
+
+ if (!lim->max_provision_sectors)
+ return NULL;
+
+ if (bio_sectors(bio) <= lim->max_provision_sectors)
+ return NULL;
+
+ return bio_split(bio, lim->max_provision_sectors, GFP_NOIO, bs);
+}
+
/*
* Return the maximum number of sectors from the start of a bio that may be
* submitted as a single request to a block device. If enough sectors remain,
@@ -366,6 +381,9 @@ struct bio *__bio_split_to_limits(struct bio *bio,
case REQ_OP_WRITE_ZEROES:
split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
break;
+ case REQ_OP_PROVISION:
+ split = bio_split_provision(bio, lim, nr_segs, bs);
+ break;
default:
split = bio_split_rw(bio, lim, nr_segs, bs,
get_max_io_size(bio, lim) << SECTOR_SHIFT);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 896b4654ab00..d303e6614c36 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -59,6 +59,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->zoned = BLK_ZONED_NONE;
lim->zone_write_granularity = 0;
lim->dma_alignment = 511;
+ lim->max_provision_sectors = 0;
}
/**
@@ -82,6 +83,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
+ lim->max_provision_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -208,6 +210,20 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
}
EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+/**
+ * blk_queue_max_provision_sectors - set max sectors for a single provision
+ *
+ * @q: the request queue for the device
+ * @max_provision_sectors: maximum number of sectors to provision per command
+ **/
+
+void blk_queue_max_provision_sectors(struct request_queue *q,
+ unsigned int max_provision_sectors)
+{
+ q->limits.max_provision_sectors = max_provision_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_provision_sectors);
+
/**
* blk_queue_max_zone_append_sectors - set max sectors for a single zone append
* @q: the request queue for the device
@@ -578,6 +594,9 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
+ t->max_provision_sectors = min_not_zero(t->max_provision_sectors,
+ b->max_provision_sectors);
+
t->misaligned |= b->misaligned;
alignment = queue_limit_alignment_offset(b, start);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f1fce1c7fa44..0a3165211c66 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -213,6 +213,13 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
return queue_var_show(0, page);
}
+static ssize_t queue_provision_max_show(struct request_queue *q,
+ char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)q->limits.max_provision_sectors << 9);
+}
+
static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
{
return queue_var_show(0, page);
@@ -604,6 +611,7 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
+QUEUE_RO_ENTRY(queue_provision_max, "provision_max_bytes");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
@@ -661,6 +669,7 @@ static struct attribute *queue_attrs[] = {
&queue_discard_max_entry.attr,
&queue_discard_max_hw_entry.attr,
&queue_discard_zeroes_data_entry.attr,
+ &queue_provision_max_entry.attr,
&queue_write_same_max_entry.attr,
&queue_write_zeroes_max_entry.attr,
&queue_zone_append_max_entry.attr,
diff --git a/block/bounce.c b/block/bounce.c
index 7cfcb242f9a1..ab9d8723ae64 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -176,6 +176,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_PROVISION:
break;
default:
bio_for_each_segment(bv, bio_src, iter)
diff --git a/block/fops.c b/block/fops.c
index 4c70fdc546e7..be2e41f160bf 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -613,7 +613,8 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
#define BLKDEV_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE | \
+ FALLOC_FL_UNSHARE_RANGE)
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
@@ -653,6 +654,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
* de-allocate mode calls to fallocate().
*/
switch (mode) {
+ case 0:
+ case FALLOC_FL_UNSHARE_RANGE:
+ case FALLOC_FL_KEEP_SIZE:
+ case FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE:
+ error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL);
+ break;
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = truncate_bdev_range(bdev, file->f_mode, start, end);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d766be7152e1..9820b3b039f2 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -57,7 +57,8 @@ static inline bool bio_has_data(struct bio *bio)
bio->bi_iter.bi_size &&
bio_op(bio) != REQ_OP_DISCARD &&
bio_op(bio) != REQ_OP_SECURE_ERASE &&
- bio_op(bio) != REQ_OP_WRITE_ZEROES)
+ bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+ bio_op(bio) != REQ_OP_PROVISION)
return true;
return false;
@@ -67,7 +68,8 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
{
return bio_op(bio) == REQ_OP_DISCARD ||
bio_op(bio) == REQ_OP_SECURE_ERASE ||
- bio_op(bio) == REQ_OP_WRITE_ZEROES;
+ bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+ bio_op(bio) == REQ_OP_PROVISION;
}
static inline void *bio_data(struct bio *bio)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 99be590f952f..27bdf88f541c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -385,7 +385,10 @@ enum req_op {
REQ_OP_DRV_IN = (__force blk_opf_t)34,
REQ_OP_DRV_OUT = (__force blk_opf_t)35,
- REQ_OP_LAST = (__force blk_opf_t)36,
+ /* request device to provision block */
+ REQ_OP_PROVISION = (__force blk_opf_t)37,
+
+ REQ_OP_LAST = (__force blk_opf_t)38,
};
enum req_flag_bits {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 941304f17492..239e2f418b6e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -303,6 +303,7 @@ struct queue_limits {
unsigned int discard_granularity;
unsigned int discard_alignment;
unsigned int zone_write_granularity;
+ unsigned int max_provision_sectors;
unsigned short max_segments;
unsigned short max_integrity_segments;
@@ -921,6 +922,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors);
extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
unsigned int max_write_same_sectors);
+extern void blk_queue_max_provision_sectors(struct request_queue *q,
+ unsigned int max_provision_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors);
@@ -1060,6 +1063,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp);
+extern int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask);
+
#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
@@ -1139,6 +1145,11 @@ static inline unsigned short queue_max_discard_segments(const struct request_que
return q->limits.max_discard_segments;
}
+static inline unsigned int queue_max_provision_sectors(const struct request_queue *q)
+{
+ return q->limits.max_provision_sectors; /* unsigned int limit; must not be narrowed */
+}
+
static inline unsigned int queue_max_segment_size(const struct request_queue *q)
{
return q->limits.max_segment_size;
@@ -1281,6 +1292,11 @@ static inline bool bdev_nowait(struct block_device *bdev)
return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
}
+static inline unsigned int bdev_max_provision_sectors(struct block_device *bdev)
+{
+ return bdev_get_queue(bdev)->limits.max_provision_sectors;
+}
+
static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
{
return blk_queue_zoned_model(bdev_get_queue(bdev));
--
2.40.1.521.gf1e218fcd8-goog
Add block provisioning support for device-mapper targets.
dm-crypt, dm-snap and dm-linear will, by default, passthrough
REQ_OP_PROVISION requests to the underlying device, if
supported.
Signed-off-by: Sarthak Kukreti <[email protected]>
---
drivers/md/dm-crypt.c | 4 +++-
drivers/md/dm-linear.c | 1 +
drivers/md/dm-snap.c | 7 +++++++
drivers/md/dm-table.c | 23 +++++++++++++++++++++++
drivers/md/dm.c | 6 ++++++
include/linux/device-mapper.h | 17 +++++++++++++++++
6 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8b47b913ee83..5a7c475ce6fc 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3336,6 +3336,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->tag_pool_max_sectors <<= cc->sector_shift;
}
+ ti->num_provision_bios = 1;
+
ret = -ENOMEM;
cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname);
if (!cc->io_queue) {
@@ -3390,7 +3392,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
* - for REQ_OP_DISCARD caller must use flush if IO ordering matters
*/
if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
- bio_op(bio) == REQ_OP_DISCARD)) {
+ bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_PROVISION)) {
bio_set_dev(bio, cc->dev->bdev);
if (bio_sectors(bio))
bio->bi_iter.bi_sector = cc->start +
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index f4448d520ee9..74ee27ca551a 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_discard_bios = 1;
ti->num_secure_erase_bios = 1;
ti->num_write_zeroes_bios = 1;
+ ti->num_provision_bios = 1;
ti->private = lc;
return 0;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9c49f53760d0..0dfda50ac4e0 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1358,6 +1358,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (s->discard_zeroes_cow)
ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
+ ti->num_provision_bios = 1;
/* Add snapshot to the list of snapshots for this origin */
/* Exceptions aren't triggered till snapshot_resume() is called */
@@ -2003,6 +2004,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
/* If the block is already remapped - use that, else remap it */
e = dm_lookup_exception(&s->complete, chunk);
if (e) {
+ if (unlikely(bio_op(bio) == REQ_OP_PROVISION)) {
+ bio_endio(bio);
+ r = DM_MAPIO_SUBMITTED;
+ goto out_unlock;
+ }
remap_exception(s, e, bio, chunk);
if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
io_overlaps_chunk(s, bio)) {
@@ -2413,6 +2419,7 @@ static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
/* All discards are split on chunk_size boundary */
limits->discard_granularity = snap->store->chunk_size;
limits->max_discard_sectors = snap->store->chunk_size;
+ limits->max_provision_sectors = snap->store->chunk_size;
up_read(&_origins_lock);
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 119db5e01080..282c530b0685 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1854,6 +1854,26 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t)
return true;
}
+static int device_provision_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ return bdev_max_provision_sectors(dev->bdev);
+}
+
+static bool dm_table_supports_provision(struct dm_table *t)
+{
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (ti->provision_supported ||
+ (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, device_provision_capable, NULL)))
+ return true;
+ }
+
+ return false;
+}
+
static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1987,6 +2007,9 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_write_zeroes(t))
q->limits.max_write_zeroes_sectors = 0;
+ if (!dm_table_supports_provision(t))
+ q->limits.max_provision_sectors = 0;
+
dm_table_verify_integrity(t);
/*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3b694ba3a106..9b94121b8d38 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1609,6 +1609,7 @@ static bool is_abnormal_io(struct bio *bio)
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_PROVISION:
return true;
default:
break;
@@ -1641,6 +1642,11 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
if (ti->max_write_zeroes_granularity)
max_granularity = limits->max_write_zeroes_sectors;
break;
+ case REQ_OP_PROVISION:
+ num_bios = ti->num_provision_bios;
+ if (ti->max_provision_granularity)
+ max_granularity = limits->max_provision_sectors;
+ break;
default:
break;
}
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index a52d2b9a6846..9981378457d2 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -334,6 +334,12 @@ struct dm_target {
*/
unsigned int num_write_zeroes_bios;
+ /*
+ * The number of PROVISION bios that will be submitted to the target.
+ * The bio number can be accessed with dm_bio_get_target_bio_nr.
+ */
+ unsigned int num_provision_bios;
+
/*
* The minimum number of extra bytes allocated in each io for the
* target to use.
@@ -358,6 +364,11 @@ struct dm_target {
*/
bool discards_supported:1;
+ /* Set if this target needs to receive provision requests regardless of
+ * whether or not its underlying devices have support.
+ */
+ bool provision_supported:1;
+
/*
* Set if this target requires that discards be split on
* 'max_discard_sectors' boundaries.
@@ -376,6 +387,12 @@ struct dm_target {
*/
bool max_write_zeroes_granularity:1;
+ /*
+ * Set if this target requires that provisions be split on
+ * 'max_provision_sectors' boundaries.
+ */
+ bool max_provision_granularity:1;
+
/*
* Set if we need to limit the number of in-flight bios when swapping.
*/
--
2.40.1.521.gf1e218fcd8-goog
dm-thinpool uses the provision request to provision
blocks for a dm-thin device. dm-thinpool currently does not
pass through REQ_OP_PROVISION to underlying devices.
For shared blocks, provision requests will break sharing and copy the
contents of the entire block. Additionally, if 'skip_block_zeroing'
is not set, dm-thin will opt to zero out the entire range as a part
of provisioning.
Signed-off-by: Sarthak Kukreti <[email protected]>
---
drivers/md/dm-thin.c | 70 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 66 insertions(+), 4 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2b13c949bd72..3f94f53ac956 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -274,6 +274,7 @@ struct pool {
process_bio_fn process_bio;
process_bio_fn process_discard;
+ process_bio_fn process_provision;
process_cell_fn process_cell;
process_cell_fn process_discard_cell;
@@ -913,7 +914,8 @@ static void __inc_remap_and_issue_cell(void *context,
struct bio *bio;
while ((bio = bio_list_pop(&cell->bios))) {
- if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
+ if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD ||
+ bio_op(bio) == REQ_OP_PROVISION)
bio_list_add(&info->defer_bios, bio);
else {
inc_all_io_entry(info->tc->pool, bio);
@@ -1245,8 +1247,8 @@ static int io_overlaps_block(struct pool *pool, struct bio *bio)
static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
- return (bio_data_dir(bio) == WRITE) &&
- io_overlaps_block(pool, bio);
+ return (bio_data_dir(bio) == WRITE) && io_overlaps_block(pool, bio) &&
+ bio_op(bio) != REQ_OP_PROVISION;
}
static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -1953,6 +1955,51 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
}
}
+static void process_provision_bio(struct thin_c *tc, struct bio *bio)
+{
+ int r;
+ struct pool *pool = tc->pool;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_bio_prison_cell *cell;
+ struct dm_cell_key key;
+ struct dm_thin_lookup_result lookup_result;
+
+ /*
+ * If cell is already occupied, then the block is already
+ * being provisioned so we have nothing further to do here.
+ */
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(pool, &key, bio, &cell))
+ return;
+
+ if (tc->requeue_mode) {
+ cell_requeue(pool, cell);
+ return;
+ }
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ if (lookup_result.shared) {
+ process_shared_bio(tc, bio, block, &lookup_result, cell);
+ } else {
+ bio_endio(bio);
+ cell_defer_no_holder(tc, cell);
+ }
+ break;
+ case -ENODATA:
+ provision_block(tc, bio, block, cell);
+ break;
+
+ default:
+ DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+ __func__, r);
+ cell_defer_no_holder(tc, cell);
+ bio_io_error(bio);
+ break;
+ }
+}
+
static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
int r;
@@ -2228,6 +2275,8 @@ static void process_thin_deferred_bios(struct thin_c *tc)
if (bio_op(bio) == REQ_OP_DISCARD)
pool->process_discard(tc, bio);
+ else if (bio_op(bio) == REQ_OP_PROVISION)
+ pool->process_provision(tc, bio);
else
pool->process_bio(tc, bio);
@@ -2579,6 +2628,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_fail;
pool->process_discard = process_bio_fail;
+ pool->process_provision = process_bio_fail;
pool->process_cell = process_cell_fail;
pool->process_discard_cell = process_cell_fail;
pool->process_prepared_mapping = process_prepared_mapping_fail;
@@ -2592,6 +2642,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_bio_success;
+ pool->process_provision = process_bio_fail;
pool->process_cell = process_cell_read_only;
pool->process_discard_cell = process_cell_success;
pool->process_prepared_mapping = process_prepared_mapping_fail;
@@ -2612,6 +2663,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->out_of_data_space = true;
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
+ pool->process_provision = process_bio_fail;
pool->process_cell = process_cell_read_only;
pool->process_prepared_mapping = process_prepared_mapping;
set_discard_callbacks(pool);
@@ -2628,6 +2680,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
pool->process_discard = process_discard_bio;
+ pool->process_provision = process_provision_bio;
pool->process_cell = process_cell;
pool->process_prepared_mapping = process_prepared_mapping;
set_discard_callbacks(pool);
@@ -2749,7 +2802,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
- if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
+ if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD ||
+ bio_op(bio) == REQ_OP_PROVISION) {
thin_defer_bio_with_throttle(tc, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -3396,6 +3450,9 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
pt->adjusted_pf = pt->requested_pf = pf;
ti->num_flush_bios = 1;
ti->limit_swap_bios = true;
+ ti->num_provision_bios = 1;
+ ti->provision_supported = true;
+ ti->max_provision_granularity = true;
/*
* Only need to enable discards if the pool should pass
@@ -4114,6 +4171,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
* The pool uses the same discard limits as the underlying data
* device. DM core has already set this up.
*/
+
+ limits->max_provision_sectors = pool->sectors_per_block;
}
static struct target_type pool_target = {
@@ -4288,6 +4347,9 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->max_discard_granularity = true;
}
+ ti->num_provision_bios = 1;
+ ti->provision_supported = true;
+
mutex_unlock(&dm_thin_pool_table.mutex);
spin_lock_irq(&tc->pool->lock);
--
2.40.1.521.gf1e218fcd8-goog
Add support for provision requests to loopback devices.
Loop devices will configure provision support based on
whether the underlying block device/file can support
the provision request and upon receiving a provision bio,
will map it to the backing device/storage. For loop devices
over files, a REQ_OP_PROVISION request will translate to
an fallocate mode 0 call on the backing file.
Signed-off-by: Sarthak Kukreti <[email protected]>
---
drivers/block/loop.c | 42 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bc31bb7072a2..13c4b4f8b9c1 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -327,6 +327,24 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
return ret;
}
+static int lo_req_provision(struct loop_device *lo, struct request *rq, loff_t pos)
+{
+ struct file *file = lo->lo_backing_file;
+ struct request_queue *q = lo->lo_queue;
+ int ret;
+
+ if (!q->limits.max_provision_sectors) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = file->f_op->fallocate(file, 0, pos, blk_rq_bytes(rq));
+ if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
+ ret = -EIO;
+ out:
+ return ret;
+}
+
static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
int ret = vfs_fsync(lo->lo_backing_file, 0);
@@ -488,6 +506,8 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
FALLOC_FL_PUNCH_HOLE);
case REQ_OP_DISCARD:
return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
+ case REQ_OP_PROVISION:
+ return lo_req_provision(lo, rq, pos);
case REQ_OP_WRITE:
if (cmd->use_aio)
return lo_rw_aio(lo, cmd, pos, ITER_SOURCE);
@@ -754,6 +774,25 @@ static void loop_sysfs_exit(struct loop_device *lo)
&loop_attribute_group);
}
+static void loop_config_provision(struct loop_device *lo)
+{
+ struct file *file = lo->lo_backing_file;
+ struct inode *inode = file->f_mapping->host;
+
+ /*
+ * If the backing device is a block device, mirror its provisioning
+ * capability.
+ */
+ if (S_ISBLK(inode->i_mode)) {
+ blk_queue_max_provision_sectors(lo->lo_queue,
+ bdev_max_provision_sectors(I_BDEV(inode)));
+ } else if (file->f_op->fallocate) {
+ blk_queue_max_provision_sectors(lo->lo_queue, UINT_MAX >> 9);
+ } else {
+ blk_queue_max_provision_sectors(lo->lo_queue, 0);
+ }
+}
+
static void loop_config_discard(struct loop_device *lo)
{
struct file *file = lo->lo_backing_file;
@@ -1092,6 +1131,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
blk_queue_io_min(lo->lo_queue, bsize);
loop_config_discard(lo);
+ loop_config_provision(lo);
loop_update_rotational(lo);
loop_update_dio(lo);
loop_sysfs_init(lo);
@@ -1304,6 +1344,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
}
loop_config_discard(lo);
+ loop_config_provision(lo);
/* update dio if lo_offset or transfer is changed */
__loop_update_dio(lo, lo->use_dio);
@@ -1830,6 +1871,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
case REQ_OP_FLUSH:
case REQ_OP_DISCARD:
case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_PROVISION:
cmd->use_aio = false;
break;
default:
--
2.40.1.521.gf1e218fcd8-goog
On Sat, May 06 2023 at 2:29P -0400,
Sarthak Kukreti <[email protected]> wrote:
> Only call truncate_bdev_range() if the fallocate mode is
> supported. This fixes a bug where data in the pagecache
> could be invalidated if the fallocate() was called on the
> block device with an invalid mode.
>
> Fixes: 25f4c41415e5 ("block: implement (some of) fallocate for block devices")
> Cc: [email protected]
> Reported-by: Darrick J. Wong <[email protected]>
> Signed-off-by: Sarthak Kukreti <[email protected]>
Reviewed-by: Mike Snitzer <[email protected]>
On Sat, May 06 2023 at 2:29P -0400,
Sarthak Kukreti <[email protected]> wrote:
> Introduce block request REQ_OP_PROVISION. The intent of this request
> is to request underlying storage to preallocate disk space for the given
> block range. Block devices that support this capability will export
> a provision limit within their request queues.
>
> This patch also adds the capability to call fallocate() in mode 0
> on block devices, which will send REQ_OP_PROVISION to the block
> device for the specified range.
>
> Signed-off-by: Sarthak Kukreti <[email protected]>
Reviewed-by: Mike Snitzer <[email protected]>
On Sat, May 06 2023 at 2:29P -0400,
Sarthak Kukreti <[email protected]> wrote:
> Add block provisioning support for device-mapper targets.
> dm-crypt, dm-snap and dm-linear will, by default, passthrough
> REQ_OP_PROVISION requests to the underlying device, if
> supported.
>
> Signed-off-by: Sarthak Kukreti <[email protected]>
Reviewed-by: Mike Snitzer <[email protected]>
On Sat, May 06 2023 at 2:29P -0400,
Sarthak Kukreti <[email protected]> wrote:
> dm-thinpool uses the provision request to provision
> blocks for a dm-thin device. dm-thinpool currently does not
> pass through REQ_OP_PROVISION to underlying devices.
>
> For shared blocks, provision requests will break sharing and copy the
> contents of the entire block. Additionally, if 'skip_block_zeroing'
> is not set, dm-thin will opt to zero out the entire range as a part
> of provisioning.
>
> Signed-off-by: Sarthak Kukreti <[email protected]>
> ---
> drivers/md/dm-thin.c | 70 +++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 66 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> index 2b13c949bd72..3f94f53ac956 100644
> --- a/drivers/md/dm-thin.c
> +++ b/drivers/md/dm-thin.c
> @@ -274,6 +274,7 @@ struct pool {
>
> process_bio_fn process_bio;
> process_bio_fn process_discard;
> + process_bio_fn process_provision;
>
> process_cell_fn process_cell;
> process_cell_fn process_discard_cell;
> @@ -913,7 +914,8 @@ static void __inc_remap_and_issue_cell(void *context,
> struct bio *bio;
>
> while ((bio = bio_list_pop(&cell->bios))) {
> - if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
> + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD ||
> + bio_op(bio) == REQ_OP_PROVISION)
> bio_list_add(&info->defer_bios, bio);
> else {
> inc_all_io_entry(info->tc->pool, bio);
> @@ -1245,8 +1247,8 @@ static int io_overlaps_block(struct pool *pool, struct bio *bio)
>
> static int io_overwrites_block(struct pool *pool, struct bio *bio)
> {
> - return (bio_data_dir(bio) == WRITE) &&
> - io_overlaps_block(pool, bio);
> + return (bio_data_dir(bio) == WRITE) && io_overlaps_block(pool, bio) &&
> + bio_op(bio) != REQ_OP_PROVISION;
> }
>
> static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
> @@ -1953,6 +1955,51 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
> }
> }
>
> +static void process_provision_bio(struct thin_c *tc, struct bio *bio)
> +{
> + int r;
> + struct pool *pool = tc->pool;
> + dm_block_t block = get_bio_block(tc, bio);
> + struct dm_bio_prison_cell *cell;
> + struct dm_cell_key key;
> + struct dm_thin_lookup_result lookup_result;
> +
> + /*
> + * If cell is already occupied, then the block is already
> + * being provisioned so we have nothing further to do here.
> + */
> + build_virtual_key(tc->td, block, &key);
> + if (bio_detain(pool, &key, bio, &cell))
> + return;
> +
> + if (tc->requeue_mode) {
> + cell_requeue(pool, cell);
> + return;
> + }
> +
> + r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
> + switch (r) {
> + case 0:
> + if (lookup_result.shared) {
> + process_shared_bio(tc, bio, block, &lookup_result, cell);
> + } else {
> + bio_endio(bio);
> + cell_defer_no_holder(tc, cell);
> + }
> + break;
> + case -ENODATA:
> + provision_block(tc, bio, block, cell);
> + break;
> +
> + default:
> + DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
> + __func__, r);
> + cell_defer_no_holder(tc, cell);
> + bio_io_error(bio);
> + break;
> + }
> +}
> +
> static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
> {
> int r;
> @@ -2228,6 +2275,8 @@ static void process_thin_deferred_bios(struct thin_c *tc)
>
> if (bio_op(bio) == REQ_OP_DISCARD)
> pool->process_discard(tc, bio);
> + else if (bio_op(bio) == REQ_OP_PROVISION)
> + pool->process_provision(tc, bio);
> else
> pool->process_bio(tc, bio);
>
> @@ -2579,6 +2628,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
> dm_pool_metadata_read_only(pool->pmd);
> pool->process_bio = process_bio_fail;
> pool->process_discard = process_bio_fail;
> + pool->process_provision = process_bio_fail;
> pool->process_cell = process_cell_fail;
> pool->process_discard_cell = process_cell_fail;
> pool->process_prepared_mapping = process_prepared_mapping_fail;
> @@ -2592,6 +2642,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
> dm_pool_metadata_read_only(pool->pmd);
> pool->process_bio = process_bio_read_only;
> pool->process_discard = process_bio_success;
> + pool->process_provision = process_bio_fail;
> pool->process_cell = process_cell_read_only;
> pool->process_discard_cell = process_cell_success;
> pool->process_prepared_mapping = process_prepared_mapping_fail;
> @@ -2612,6 +2663,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
> pool->out_of_data_space = true;
> pool->process_bio = process_bio_read_only;
> pool->process_discard = process_discard_bio;
> + pool->process_provision = process_bio_fail;
> pool->process_cell = process_cell_read_only;
> pool->process_prepared_mapping = process_prepared_mapping;
> set_discard_callbacks(pool);
> @@ -2628,6 +2680,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
> dm_pool_metadata_read_write(pool->pmd);
> pool->process_bio = process_bio;
> pool->process_discard = process_discard_bio;
> + pool->process_provision = process_provision_bio;
> pool->process_cell = process_cell;
> pool->process_prepared_mapping = process_prepared_mapping;
> set_discard_callbacks(pool);
> @@ -2749,7 +2802,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
> return DM_MAPIO_SUBMITTED;
> }
>
> - if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
> + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD ||
> + bio_op(bio) == REQ_OP_PROVISION) {
> thin_defer_bio_with_throttle(tc, bio);
> return DM_MAPIO_SUBMITTED;
> }
> @@ -3396,6 +3450,9 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> pt->adjusted_pf = pt->requested_pf = pf;
> ti->num_flush_bios = 1;
> ti->limit_swap_bios = true;
> + ti->num_provision_bios = 1;
> + ti->provision_supported = true;
> + ti->max_provision_granularity = true;
>
> /*
> * Only need to enable discards if the pool should pass
> @@ -4114,6 +4171,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
> * The pool uses the same discard limits as the underlying data
> * device. DM core has already set this up.
> */
> +
> + limits->max_provision_sectors = pool->sectors_per_block;
> }
>
> static struct target_type pool_target = {
> @@ -4288,6 +4347,9 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> ti->max_discard_granularity = true;
> }
>
> + ti->num_provision_bios = 1;
> + ti->provision_supported = true;
> +
We need this in thin_ctr: ti->max_provision_granularity = true;
It is needed more in the thin target than in thin-pool; otherwise provision
bios issued to thin devices won't be split appropriately. But I do think
it's fine to set it in both thin_ctr and pool_ctr.
Otherwise, looks good.
Thanks,
Mike
On Tue, May 9, 2023 at 9:58 AM Mike Snitzer <[email protected]> wrote:
>
> On Sat, May 06 2023 at 2:29P -0400,
> Sarthak Kukreti <[email protected]> wrote:
>
> > dm-thinpool uses the provision request to provision
> > blocks for a dm-thin device. dm-thinpool currently does not
> > pass through REQ_OP_PROVISION to underlying devices.
> >
> > For shared blocks, provision requests will break sharing and copy the
> > contents of the entire block. Additionally, if 'skip_block_zeroing'
> > is not set, dm-thin will opt to zero out the entire range as a part
> > of provisioning.
> >
> > Signed-off-by: Sarthak Kukreti <[email protected]>
> > ---
> > drivers/md/dm-thin.c | 70 +++++++++++++++++++++++++++++++++++++++++---
> > 1 file changed, 66 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> > index 2b13c949bd72..3f94f53ac956 100644
> > --- a/drivers/md/dm-thin.c
> > +++ b/drivers/md/dm-thin.c
> > @@ -274,6 +274,7 @@ struct pool {
> >
> > process_bio_fn process_bio;
> > process_bio_fn process_discard;
> > + process_bio_fn process_provision;
> >
> > process_cell_fn process_cell;
> > process_cell_fn process_discard_cell;
> > @@ -913,7 +914,8 @@ static void __inc_remap_and_issue_cell(void *context,
> > struct bio *bio;
> >
> > while ((bio = bio_list_pop(&cell->bios))) {
> > - if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
> > + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD ||
> > + bio_op(bio) == REQ_OP_PROVISION)
> > bio_list_add(&info->defer_bios, bio);
> > else {
> > inc_all_io_entry(info->tc->pool, bio);
> > @@ -1245,8 +1247,8 @@ static int io_overlaps_block(struct pool *pool, struct bio *bio)
> >
> > static int io_overwrites_block(struct pool *pool, struct bio *bio)
> > {
> > - return (bio_data_dir(bio) == WRITE) &&
> > - io_overlaps_block(pool, bio);
> > + return (bio_data_dir(bio) == WRITE) && io_overlaps_block(pool, bio) &&
> > + bio_op(bio) != REQ_OP_PROVISION;
> > }
> >
> > static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
> > @@ -1953,6 +1955,51 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
> > }
> > }
> >
> > +static void process_provision_bio(struct thin_c *tc, struct bio *bio)
> > +{
> > + int r;
> > + struct pool *pool = tc->pool;
> > + dm_block_t block = get_bio_block(tc, bio);
> > + struct dm_bio_prison_cell *cell;
> > + struct dm_cell_key key;
> > + struct dm_thin_lookup_result lookup_result;
> > +
> > + /*
> > + * If cell is already occupied, then the block is already
> > + * being provisioned so we have nothing further to do here.
> > + */
> > + build_virtual_key(tc->td, block, &key);
> > + if (bio_detain(pool, &key, bio, &cell))
> > + return;
> > +
> > + if (tc->requeue_mode) {
> > + cell_requeue(pool, cell);
> > + return;
> > + }
> > +
> > + r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
> > + switch (r) {
> > + case 0:
> > + if (lookup_result.shared) {
> > + process_shared_bio(tc, bio, block, &lookup_result, cell);
> > + } else {
> > + bio_endio(bio);
> > + cell_defer_no_holder(tc, cell);
> > + }
> > + break;
> > + case -ENODATA:
> > + provision_block(tc, bio, block, cell);
> > + break;
> > +
> > + default:
> > + DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
> > + __func__, r);
> > + cell_defer_no_holder(tc, cell);
> > + bio_io_error(bio);
> > + break;
> > + }
> > +}
> > +
> > static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
> > {
> > int r;
> > @@ -2228,6 +2275,8 @@ static void process_thin_deferred_bios(struct thin_c *tc)
> >
> > if (bio_op(bio) == REQ_OP_DISCARD)
> > pool->process_discard(tc, bio);
> > + else if (bio_op(bio) == REQ_OP_PROVISION)
> > + pool->process_provision(tc, bio);
> > else
> > pool->process_bio(tc, bio);
> >
> > @@ -2579,6 +2628,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
> > dm_pool_metadata_read_only(pool->pmd);
> > pool->process_bio = process_bio_fail;
> > pool->process_discard = process_bio_fail;
> > + pool->process_provision = process_bio_fail;
> > pool->process_cell = process_cell_fail;
> > pool->process_discard_cell = process_cell_fail;
> > pool->process_prepared_mapping = process_prepared_mapping_fail;
> > @@ -2592,6 +2642,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
> > dm_pool_metadata_read_only(pool->pmd);
> > pool->process_bio = process_bio_read_only;
> > pool->process_discard = process_bio_success;
> > + pool->process_provision = process_bio_fail;
> > pool->process_cell = process_cell_read_only;
> > pool->process_discard_cell = process_cell_success;
> > pool->process_prepared_mapping = process_prepared_mapping_fail;
> > @@ -2612,6 +2663,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
> > pool->out_of_data_space = true;
> > pool->process_bio = process_bio_read_only;
> > pool->process_discard = process_discard_bio;
> > + pool->process_provision = process_bio_fail;
> > pool->process_cell = process_cell_read_only;
> > pool->process_prepared_mapping = process_prepared_mapping;
> > set_discard_callbacks(pool);
> > @@ -2628,6 +2680,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
> > dm_pool_metadata_read_write(pool->pmd);
> > pool->process_bio = process_bio;
> > pool->process_discard = process_discard_bio;
> > + pool->process_provision = process_provision_bio;
> > pool->process_cell = process_cell;
> > pool->process_prepared_mapping = process_prepared_mapping;
> > set_discard_callbacks(pool);
> > @@ -2749,7 +2802,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
> > return DM_MAPIO_SUBMITTED;
> > }
> >
> > - if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
> > + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD ||
> > + bio_op(bio) == REQ_OP_PROVISION) {
> > thin_defer_bio_with_throttle(tc, bio);
> > return DM_MAPIO_SUBMITTED;
> > }
> > @@ -3396,6 +3450,9 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> > pt->adjusted_pf = pt->requested_pf = pf;
> > ti->num_flush_bios = 1;
> > ti->limit_swap_bios = true;
> > + ti->num_provision_bios = 1;
> > + ti->provision_supported = true;
> > + ti->max_provision_granularity = true;
> >
> > /*
> > * Only need to enable discards if the pool should pass
> > @@ -4114,6 +4171,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
> > * The pool uses the same discard limits as the underlying data
> > * device. DM core has already set this up.
> > */
> > +
> > + limits->max_provision_sectors = pool->sectors_per_block;
> > }
> >
> > static struct target_type pool_target = {
> > @@ -4288,6 +4347,9 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> > ti->max_discard_granularity = true;
> > }
> >
> > + ti->num_provision_bios = 1;
> > + ti->provision_supported = true;
> > +
>
> We need this in thin_ctr: ti->max_provision_granularity = true;
>
> More needed in the thin target than thin-pool; otherwise provision bio
> issued to thin devices won't be split appropriately. But I do think
> its fine to set in both thin_ctr and pool_ctr.
>
> Otherwise, looks good.
>
Thanks! I'll add it to the next iteration (in addition to any other
feedback that's added to v6).
Given that this series covers multiple subsystems, would there be a
preferred way of queueing this for merge?
Best
Sarthak
> Thanks,
> Mike
On Sat, May 06 2023 at 2:29P -0400,
Sarthak Kukreti <[email protected]> wrote:
> dm-thinpool uses the provision request to provision
> blocks for a dm-thin device. dm-thinpool currently does not
> pass through REQ_OP_PROVISION to underlying devices.
>
> For shared blocks, provision requests will break sharing and copy the
> contents of the entire block. Additionally, if 'skip_block_zeroing'
> is not set, dm-thin will opt to zero out the entire range as a part
> of provisioning.
>
> Signed-off-by: Sarthak Kukreti <[email protected]>
> ---
> drivers/md/dm-thin.c | 70 +++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 66 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> index 2b13c949bd72..3f94f53ac956 100644
> --- a/drivers/md/dm-thin.c
> +++ b/drivers/md/dm-thin.c
...
> @@ -4114,6 +4171,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
> * The pool uses the same discard limits as the underlying data
> * device. DM core has already set this up.
> */
> +
> + limits->max_provision_sectors = pool->sectors_per_block;
Just noticed that setting limits->max_provision_sectors needs to move
above the pool_io_hints() code that sets up discards -- otherwise the early
return taken when !pt->adjusted_pf.discard_enabled will cause the
max_provision_sectors assignment to be skipped.
Here is a roll up of the fixes that need to be folded into this patch:
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 3f94f53ac956..90c8e36cb327 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -4151,6 +4151,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
}
+ limits->max_provision_sectors = pool->sectors_per_block;
+
/*
* pt->adjusted_pf is a staging area for the actual features to use.
* They get transferred to the live pool in bind_control_target()
@@ -4171,8 +4173,6 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
* The pool uses the same discard limits as the underlying data
* device. DM core has already set this up.
*/
-
- limits->max_provision_sectors = pool->sectors_per_block;
}
static struct target_type pool_target = {
@@ -4349,6 +4349,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_provision_bios = 1;
ti->provision_supported = true;
+ ti->max_provision_granularity = true;
mutex_unlock(&dm_thin_pool_table.mutex);
On Fri, May 05, 2023 at 11:29:05PM -0700, Sarthak Kukreti wrote:
> Only call truncate_bdev_range() if the fallocate mode is
> supported. This fixes a bug where data in the pagecache
> could be invalidated if the fallocate() was called on the
> block device with an invalid mode.
>
> Fixes: 25f4c41415e5 ("block: implement (some of) fallocate for block devices")
> Cc: [email protected]
> Reported-by: Darrick J. Wong <[email protected]>
> Signed-off-by: Sarthak Kukreti <[email protected]>
Ideally you'd only take filemap_invalidate_lock for valid modes, but eh
who cares about efficiency for the EOPNOTSUPP case, let's move on. :)
Reviewed-by: Darrick J. Wong <[email protected]>
--D
> ---
> block/fops.c | 21 ++++++++++++++++-----
> 1 file changed, 16 insertions(+), 5 deletions(-)
>
> diff --git a/block/fops.c b/block/fops.c
> index d2e6be4e3d1c..4c70fdc546e7 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -648,24 +648,35 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
>
> filemap_invalidate_lock(inode->i_mapping);
>
> - /* Invalidate the page cache, including dirty pages. */
> - error = truncate_bdev_range(bdev, file->f_mode, start, end);
> - if (error)
> - goto fail;
> -
> + /*
> + * Invalidate the page cache, including dirty pages, for valid
> + * de-allocate mode calls to fallocate().
> + */
> switch (mode) {
> case FALLOC_FL_ZERO_RANGE:
> case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
> + error = truncate_bdev_range(bdev, file->f_mode, start, end);
> + if (error)
> + goto fail;
> +
> error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
> len >> SECTOR_SHIFT, GFP_KERNEL,
> BLKDEV_ZERO_NOUNMAP);
> break;
> case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
> + error = truncate_bdev_range(bdev, file->f_mode, start, end);
> + if (error)
> + goto fail;
> +
> error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
> len >> SECTOR_SHIFT, GFP_KERNEL,
> BLKDEV_ZERO_NOFALLBACK);
> break;
> case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
> + error = truncate_bdev_range(bdev, file->f_mode, start, end);
> + if (error)
> + goto fail;
> +
> error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
> len >> SECTOR_SHIFT, GFP_KERNEL);
> break;
> --
> 2.40.1.521.gf1e218fcd8-goog
>
On Fri, May 05, 2023 at 11:29:06PM -0700, Sarthak Kukreti wrote:
> Introduce block request REQ_OP_PROVISION. The intent of this request
> is to request underlying storage to preallocate disk space for the given
> block range. Block devices that support this capability will export
> a provision limit within their request queues.
>
> This patch also adds the capability to call fallocate() in mode 0
> on block devices, which will send REQ_OP_PROVISION to the block
> device for the specified range.
>
> Signed-off-by: Sarthak Kukreti <[email protected]>
> ---
> block/blk-core.c | 5 ++++
> block/blk-lib.c | 53 +++++++++++++++++++++++++++++++++++++++
> block/blk-merge.c | 18 +++++++++++++
> block/blk-settings.c | 19 ++++++++++++++
> block/blk-sysfs.c | 9 +++++++
> block/bounce.c | 1 +
> block/fops.c | 10 +++++++-
> include/linux/bio.h | 6 +++--
> include/linux/blk_types.h | 5 +++-
> include/linux/blkdev.h | 16 ++++++++++++
> 10 files changed, 138 insertions(+), 4 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 42926e6cb83c..4a2342ba3a8b 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -123,6 +123,7 @@ static const char *const blk_op_name[] = {
> REQ_OP_NAME(WRITE_ZEROES),
> REQ_OP_NAME(DRV_IN),
> REQ_OP_NAME(DRV_OUT),
> + REQ_OP_NAME(PROVISION)
> };
> #undef REQ_OP_NAME
>
> @@ -798,6 +799,10 @@ void submit_bio_noacct(struct bio *bio)
> if (!q->limits.max_write_zeroes_sectors)
> goto not_supported;
> break;
> + case REQ_OP_PROVISION:
> + if (!q->limits.max_provision_sectors)
> + goto not_supported;
> + break;
> default:
> break;
> }
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index e59c3069e835..647b6451660b 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -343,3 +343,56 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
> return ret;
> }
> EXPORT_SYMBOL(blkdev_issue_secure_erase);
> +
> +/**
> + * blkdev_issue_provision - provision a block range
> + * @bdev: blockdev to write
> + * @sector: start sector
> + * @nr_sects: number of sectors to provision
> + * @gfp_mask: memory allocation flags (for bio_alloc)
> + *
> + * Description:
> + * Issues a provision request to the block device for the range of sectors.
> + * For thinly provisioned block devices, this acts as a signal for the
> + * underlying storage pool to allocate space for this block range.
> + */
> +int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
> + sector_t nr_sects, gfp_t gfp)
> +{
> + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
> + unsigned int max_sectors = bdev_max_provision_sectors(bdev);
> + struct bio *bio = NULL;
> + struct blk_plug plug;
> + int ret = 0;
> +
> + if (max_sectors == 0)
> + return -EOPNOTSUPP;
> + if ((sector | nr_sects) & bs_mask)
> + return -EINVAL;
> + if (bdev_read_only(bdev))
> + return -EPERM;
> +
> + blk_start_plug(&plug);
> + for (;;) {
> + unsigned int req_sects = min_t(sector_t, nr_sects, max_sectors);
> +
> + bio = blk_next_bio(bio, bdev, 0, REQ_OP_PROVISION, gfp);
> + bio->bi_iter.bi_sector = sector;
> + bio->bi_iter.bi_size = req_sects << SECTOR_SHIFT;
> +
> + sector += req_sects;
> + nr_sects -= req_sects;
> + if (!nr_sects) {
> + ret = submit_bio_wait(bio);
> + if (ret == -EOPNOTSUPP)
> + ret = 0;
Why do we convert EOPNOTSUPP to success here? If the device suddenly
forgets how to provision space, wouldn't we want to pass that up to the
caller?
(I'm not sure when this would happen -- perhaps the bdev has the general
provisioning capability but not for the specific range requested?)
The rest of the patch looks ok to me.
--D
> + bio_put(bio);
> + break;
> + }
> + cond_resched();
> + }
> + blk_finish_plug(&plug);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(blkdev_issue_provision);
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 6460abdb2426..a3ffebb97a1d 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -158,6 +158,21 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
> return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
> }
>
> +static struct bio *bio_split_provision(struct bio *bio,
> + const struct queue_limits *lim,
> + unsigned int *nsegs, struct bio_set *bs)
> +{
> + *nsegs = 0;
> +
> + if (!lim->max_provision_sectors)
> + return NULL;
> +
> + if (bio_sectors(bio) <= lim->max_provision_sectors)
> + return NULL;
> +
> + return bio_split(bio, lim->max_provision_sectors, GFP_NOIO, bs);
> +}
> +
> /*
> * Return the maximum number of sectors from the start of a bio that may be
> * submitted as a single request to a block device. If enough sectors remain,
> @@ -366,6 +381,9 @@ struct bio *__bio_split_to_limits(struct bio *bio,
> case REQ_OP_WRITE_ZEROES:
> split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
> break;
> + case REQ_OP_PROVISION:
> + split = bio_split_provision(bio, lim, nr_segs, bs);
> + break;
> default:
> split = bio_split_rw(bio, lim, nr_segs, bs,
> get_max_io_size(bio, lim) << SECTOR_SHIFT);
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 896b4654ab00..d303e6614c36 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -59,6 +59,7 @@ void blk_set_default_limits(struct queue_limits *lim)
> lim->zoned = BLK_ZONED_NONE;
> lim->zone_write_granularity = 0;
> lim->dma_alignment = 511;
> + lim->max_provision_sectors = 0;
> }
>
> /**
> @@ -82,6 +83,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
> lim->max_dev_sectors = UINT_MAX;
> lim->max_write_zeroes_sectors = UINT_MAX;
> lim->max_zone_append_sectors = UINT_MAX;
> + lim->max_provision_sectors = UINT_MAX;
> }
> EXPORT_SYMBOL(blk_set_stacking_limits);
>
> @@ -208,6 +210,20 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
> }
> EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
>
> +/**
> + * blk_queue_max_provision_sectors - set max sectors for a single provision
> + *
> + * @q: the request queue for the device
> + * @max_provision_sectors: maximum number of sectors to provision per command
> + **/
> +
> +void blk_queue_max_provision_sectors(struct request_queue *q,
> + unsigned int max_provision_sectors)
> +{
> + q->limits.max_provision_sectors = max_provision_sectors;
> +}
> +EXPORT_SYMBOL(blk_queue_max_provision_sectors);
> +
> /**
> * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
> * @q: the request queue for the device
> @@ -578,6 +594,9 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
> t->max_segment_size = min_not_zero(t->max_segment_size,
> b->max_segment_size);
>
> + t->max_provision_sectors = min_not_zero(t->max_provision_sectors,
> + b->max_provision_sectors);
> +
> t->misaligned |= b->misaligned;
>
> alignment = queue_limit_alignment_offset(b, start);
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index f1fce1c7fa44..0a3165211c66 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -213,6 +213,13 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
> return queue_var_show(0, page);
> }
>
> +static ssize_t queue_provision_max_show(struct request_queue *q,
> + char *page)
> +{
> + return sprintf(page, "%llu\n",
> + (unsigned long long)q->limits.max_provision_sectors << 9);
> +}
> +
> static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
> {
> return queue_var_show(0, page);
> @@ -604,6 +611,7 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
> QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
> QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
>
> +QUEUE_RO_ENTRY(queue_provision_max, "provision_max_bytes");
> QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
> QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
> QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
> @@ -661,6 +669,7 @@ static struct attribute *queue_attrs[] = {
> &queue_discard_max_entry.attr,
> &queue_discard_max_hw_entry.attr,
> &queue_discard_zeroes_data_entry.attr,
> + &queue_provision_max_entry.attr,
> &queue_write_same_max_entry.attr,
> &queue_write_zeroes_max_entry.attr,
> &queue_zone_append_max_entry.attr,
> diff --git a/block/bounce.c b/block/bounce.c
> index 7cfcb242f9a1..ab9d8723ae64 100644
> --- a/block/bounce.c
> +++ b/block/bounce.c
> @@ -176,6 +176,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
> case REQ_OP_DISCARD:
> case REQ_OP_SECURE_ERASE:
> case REQ_OP_WRITE_ZEROES:
> + case REQ_OP_PROVISION:
> break;
> default:
> bio_for_each_segment(bv, bio_src, iter)
> diff --git a/block/fops.c b/block/fops.c
> index 4c70fdc546e7..be2e41f160bf 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -613,7 +613,8 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>
> #define BLKDEV_FALLOC_FL_SUPPORTED \
> (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
> - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
> + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE | \
> + FALLOC_FL_UNSHARE_RANGE)
>
> static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> loff_t len)
> @@ -653,6 +654,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> * de-allocate mode calls to fallocate().
> */
> switch (mode) {
> + case 0:
> + case FALLOC_FL_UNSHARE_RANGE:
> + case FALLOC_FL_KEEP_SIZE:
> + case FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE:
> + error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> + len >> SECTOR_SHIFT, GFP_KERNEL);
> + break;
> case FALLOC_FL_ZERO_RANGE:
> case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
> error = truncate_bdev_range(bdev, file->f_mode, start, end);
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index d766be7152e1..9820b3b039f2 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -57,7 +57,8 @@ static inline bool bio_has_data(struct bio *bio)
> bio->bi_iter.bi_size &&
> bio_op(bio) != REQ_OP_DISCARD &&
> bio_op(bio) != REQ_OP_SECURE_ERASE &&
> - bio_op(bio) != REQ_OP_WRITE_ZEROES)
> + bio_op(bio) != REQ_OP_WRITE_ZEROES &&
> + bio_op(bio) != REQ_OP_PROVISION)
> return true;
>
> return false;
> @@ -67,7 +68,8 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
> {
> return bio_op(bio) == REQ_OP_DISCARD ||
> bio_op(bio) == REQ_OP_SECURE_ERASE ||
> - bio_op(bio) == REQ_OP_WRITE_ZEROES;
> + bio_op(bio) == REQ_OP_WRITE_ZEROES ||
> + bio_op(bio) == REQ_OP_PROVISION;
> }
>
> static inline void *bio_data(struct bio *bio)
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 99be590f952f..27bdf88f541c 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -385,7 +385,10 @@ enum req_op {
> REQ_OP_DRV_IN = (__force blk_opf_t)34,
> REQ_OP_DRV_OUT = (__force blk_opf_t)35,
>
> - REQ_OP_LAST = (__force blk_opf_t)36,
> + /* request device to provision block */
> + REQ_OP_PROVISION = (__force blk_opf_t)37,
> +
> + REQ_OP_LAST = (__force blk_opf_t)38,
> };
>
> enum req_flag_bits {
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 941304f17492..239e2f418b6e 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -303,6 +303,7 @@ struct queue_limits {
> unsigned int discard_granularity;
> unsigned int discard_alignment;
> unsigned int zone_write_granularity;
> + unsigned int max_provision_sectors;
>
> unsigned short max_segments;
> unsigned short max_integrity_segments;
> @@ -921,6 +922,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
> unsigned int max_discard_sectors);
> extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
> unsigned int max_write_same_sectors);
> +extern void blk_queue_max_provision_sectors(struct request_queue *q,
> + unsigned int max_provision_sectors);
> extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
> extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
> unsigned int max_zone_append_sectors);
> @@ -1060,6 +1063,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
> sector_t nr_sects, gfp_t gfp);
>
> +extern int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
> + sector_t nr_sects, gfp_t gfp_mask);
> +
> #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
> #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
>
> @@ -1139,6 +1145,11 @@ static inline unsigned short queue_max_discard_segments(const struct request_que
> return q->limits.max_discard_segments;
> }
>
> +static inline unsigned short queue_max_provision_sectors(const struct request_queue *q)
> +{
> + return q->limits.max_provision_sectors;
> +}
> +
> static inline unsigned int queue_max_segment_size(const struct request_queue *q)
> {
> return q->limits.max_segment_size;
> @@ -1281,6 +1292,11 @@ static inline bool bdev_nowait(struct block_device *bdev)
> return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
> }
>
> +static inline unsigned int bdev_max_provision_sectors(struct block_device *bdev)
> +{
> + return bdev_get_queue(bdev)->limits.max_provision_sectors;
> +}
> +
> static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
> {
> return blk_queue_zoned_model(bdev_get_queue(bdev));
> --
> 2.40.1.521.gf1e218fcd8-goog
>
On Fri, May 05, 2023 at 11:29:09PM -0700, Sarthak Kukreti wrote:
> Add support for provision requests to loopback devices.
> Loop devices will configure provision support based on
> whether the underlying block device/file can support
> the provision request and upon receiving a provision bio,
> will map it to the backing device/storage. For loop devices
> over files, a REQ_OP_PROVISION request will translate to
> an fallocate mode 0 call on the backing file.
>
> Signed-off-by: Sarthak Kukreti <[email protected]>
> ---
> drivers/block/loop.c | 42 ++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 42 insertions(+)
>
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index bc31bb7072a2..13c4b4f8b9c1 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -327,6 +327,24 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
> return ret;
> }
>
> +static int lo_req_provision(struct loop_device *lo, struct request *rq, loff_t pos)
> +{
> + struct file *file = lo->lo_backing_file;
> + struct request_queue *q = lo->lo_queue;
> + int ret;
> +
> + if (!q->limits.max_provision_sectors) {
> + ret = -EOPNOTSUPP;
> + goto out;
> + }
> +
> + ret = file->f_op->fallocate(file, 0, pos, blk_rq_bytes(rq));
> + if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
> + ret = -EIO;
> + out:
> + return ret;
> +}
> +
> static int lo_req_flush(struct loop_device *lo, struct request *rq)
> {
> int ret = vfs_fsync(lo->lo_backing_file, 0);
> @@ -488,6 +506,8 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
> FALLOC_FL_PUNCH_HOLE);
> case REQ_OP_DISCARD:
> return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
> + case REQ_OP_PROVISION:
> + return lo_req_provision(lo, rq, pos);
Hi Sarthak,
The only thing that stands out to me is the separate lo_req_provision()
helper here. It seems it might be a little cleaner to extend and reuse
lo_fallocate() instead. But that's not something I feel strongly about, so
this all looks pretty good to me either way, FWIW.
Brian
> case REQ_OP_WRITE:
> if (cmd->use_aio)
> return lo_rw_aio(lo, cmd, pos, ITER_SOURCE);
> @@ -754,6 +774,25 @@ static void loop_sysfs_exit(struct loop_device *lo)
> &loop_attribute_group);
> }
>
> +static void loop_config_provision(struct loop_device *lo)
> +{
> + struct file *file = lo->lo_backing_file;
> + struct inode *inode = file->f_mapping->host;
> +
> + /*
> + * If the backing device is a block device, mirror its provisioning
> + * capability.
> + */
> + if (S_ISBLK(inode->i_mode)) {
> + blk_queue_max_provision_sectors(lo->lo_queue,
> + bdev_max_provision_sectors(I_BDEV(inode)));
> + } else if (file->f_op->fallocate) {
> + blk_queue_max_provision_sectors(lo->lo_queue, UINT_MAX >> 9);
> + } else {
> + blk_queue_max_provision_sectors(lo->lo_queue, 0);
> + }
> +}
> +
> static void loop_config_discard(struct loop_device *lo)
> {
> struct file *file = lo->lo_backing_file;
> @@ -1092,6 +1131,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
> blk_queue_io_min(lo->lo_queue, bsize);
>
> loop_config_discard(lo);
> + loop_config_provision(lo);
> loop_update_rotational(lo);
> loop_update_dio(lo);
> loop_sysfs_init(lo);
> @@ -1304,6 +1344,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
> }
>
> loop_config_discard(lo);
> + loop_config_provision(lo);
>
> /* update dio if lo_offset or transfer is changed */
> __loop_update_dio(lo, lo->use_dio);
> @@ -1830,6 +1871,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
> case REQ_OP_FLUSH:
> case REQ_OP_DISCARD:
> case REQ_OP_WRITE_ZEROES:
> + case REQ_OP_PROVISION:
> cmd->use_aio = false;
> break;
> default:
> --
> 2.40.1.521.gf1e218fcd8-goog
>
On Fri, May 12, 2023 at 10:32 AM Mike Snitzer <[email protected]> wrote:
>
> On Sat, May 06 2023 at 2:29P -0400,
> Sarthak Kukreti <[email protected]> wrote:
>
> > dm-thinpool uses the provision request to provision
> > blocks for a dm-thin device. dm-thinpool currently does not
> > pass through REQ_OP_PROVISION to underlying devices.
> >
> > For shared blocks, provision requests will break sharing and copy the
> > contents of the entire block. Additionally, if 'skip_block_zeroing'
> > is not set, dm-thin will opt to zero out the entire range as a part
> > of provisioning.
> >
> > Signed-off-by: Sarthak Kukreti <[email protected]>
> > ---
> > drivers/md/dm-thin.c | 70 +++++++++++++++++++++++++++++++++++++++++---
> > 1 file changed, 66 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> > index 2b13c949bd72..3f94f53ac956 100644
> > --- a/drivers/md/dm-thin.c
> > +++ b/drivers/md/dm-thin.c
> ...
> > @@ -4114,6 +4171,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
> > * The pool uses the same discard limits as the underlying data
> > * device. DM core has already set this up.
> > */
> > +
> > + limits->max_provision_sectors = pool->sectors_per_block;
>
> Just noticed that setting limits->max_provision_sectors needs to move
> above the pool_io_hints() code that sets up discards -- otherwise the
> early return taken when !pt->adjusted_pf.discard_enabled will cause
> max_provision_sectors to never be set.
>
> Here is a roll up of the fixes that need to be folded into this patch:
>
Ah right, thanks for pointing that out! I'll fold this into v7.
Best
Sarthak
> diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
> index 3f94f53ac956..90c8e36cb327 100644
> --- a/drivers/md/dm-thin.c
> +++ b/drivers/md/dm-thin.c
> @@ -4151,6 +4151,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
> blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
> }
>
> + limits->max_provision_sectors = pool->sectors_per_block;
> +
> /*
> * pt->adjusted_pf is a staging area for the actual features to use.
> * They get transferred to the live pool in bind_control_target()
> @@ -4171,8 +4173,6 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
> * The pool uses the same discard limits as the underlying data
> * device. DM core has already set this up.
> */
> -
> - limits->max_provision_sectors = pool->sectors_per_block;
> }
>
> static struct target_type pool_target = {
> @@ -4349,6 +4349,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>
> ti->num_provision_bios = 1;
> ti->provision_supported = true;
> + ti->max_provision_granularity = true;
>
> mutex_unlock(&dm_thin_pool_table.mutex);
>
On Fri, May 12, 2023 at 11:37 AM Darrick J. Wong <[email protected]> wrote:
>
> On Fri, May 05, 2023 at 11:29:06PM -0700, Sarthak Kukreti wrote:
> > Introduce block request REQ_OP_PROVISION. The intent of this request
> > is to request underlying storage to preallocate disk space for the given
> > block range. Block devices that support this capability will export
> > a provision limit within their request queues.
> >
> > This patch also adds the capability to call fallocate() in mode 0
> > on block devices, which will send REQ_OP_PROVISION to the block
> > device for the specified range.
> >
> > Signed-off-by: Sarthak Kukreti <[email protected]>
> > ---
> > block/blk-core.c | 5 ++++
> > block/blk-lib.c | 53 +++++++++++++++++++++++++++++++++++++++
> > block/blk-merge.c | 18 +++++++++++++
> > block/blk-settings.c | 19 ++++++++++++++
> > block/blk-sysfs.c | 9 +++++++
> > block/bounce.c | 1 +
> > block/fops.c | 10 +++++++-
> > include/linux/bio.h | 6 +++--
> > include/linux/blk_types.h | 5 +++-
> > include/linux/blkdev.h | 16 ++++++++++++
> > 10 files changed, 138 insertions(+), 4 deletions(-)
> >
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 42926e6cb83c..4a2342ba3a8b 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -123,6 +123,7 @@ static const char *const blk_op_name[] = {
> > REQ_OP_NAME(WRITE_ZEROES),
> > REQ_OP_NAME(DRV_IN),
> > REQ_OP_NAME(DRV_OUT),
> > + REQ_OP_NAME(PROVISION)
> > };
> > #undef REQ_OP_NAME
> >
> > @@ -798,6 +799,10 @@ void submit_bio_noacct(struct bio *bio)
> > if (!q->limits.max_write_zeroes_sectors)
> > goto not_supported;
> > break;
> > + case REQ_OP_PROVISION:
> > + if (!q->limits.max_provision_sectors)
> > + goto not_supported;
> > + break;
> > default:
> > break;
> > }
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index e59c3069e835..647b6451660b 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -343,3 +343,56 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
> > return ret;
> > }
> > EXPORT_SYMBOL(blkdev_issue_secure_erase);
> > +
> > +/**
> > + * blkdev_issue_provision - provision a block range
> > + * @bdev: blockdev to provision
> > + * @sector: start sector
> > + * @nr_sects: number of sectors to provision
> > + * @gfp: memory allocation flags (for bio_alloc)
> > + *
> > + * Description:
> > + * Issues a provision request to the block device for the range of sectors.
> > + * For thinly provisioned block devices, this acts as a signal for the
> > + * underlying storage pool to allocate space for this block range.
> > + */
> > +int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
> > + sector_t nr_sects, gfp_t gfp)
> > +{
> > + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
> > + unsigned int max_sectors = bdev_max_provision_sectors(bdev);
> > + struct bio *bio = NULL;
> > + struct blk_plug plug;
> > + int ret = 0;
> > +
> > + if (max_sectors == 0)
> > + return -EOPNOTSUPP;
> > + if ((sector | nr_sects) & bs_mask)
> > + return -EINVAL;
> > + if (bdev_read_only(bdev))
> > + return -EPERM;
> > +
> > + blk_start_plug(&plug);
> > + for (;;) {
> > + unsigned int req_sects = min_t(sector_t, nr_sects, max_sectors);
> > +
> > + bio = blk_next_bio(bio, bdev, 0, REQ_OP_PROVISION, gfp);
> > + bio->bi_iter.bi_sector = sector;
> > + bio->bi_iter.bi_size = req_sects << SECTOR_SHIFT;
> > +
> > + sector += req_sects;
> > + nr_sects -= req_sects;
> > + if (!nr_sects) {
> > + ret = submit_bio_wait(bio);
> > + if (ret == -EOPNOTSUPP)
> > + ret = 0;
>
> Why do we convert EOPNOTSUPP to success here? If the device suddenly
> forgets how to provision space, wouldn't we want to pass that up to the
> caller?
>
> (I'm not sure when this would happen -- perhaps the bdev has the general
> provisioning capability but not for the specific range requested?)
>
Ah good catch, I initially wired it up to be less noisy in the kernel
logs but left it behind accidentally. The error should definitely be
passed through: one case where this can happen is if the device-mapper
table comprises several underlying targets but only a few of them
support provision. I'll fix this in v7.
Best
Sarthak
> The rest of the patch looks ok to me.
>
> --D
>
> > + bio_put(bio);
> > + break;
> > + }
> > + cond_resched();
> > + }
> > + blk_finish_plug(&plug);
> > +
> > + return ret;
> > +}
> > +EXPORT_SYMBOL(blkdev_issue_provision);
> > diff --git a/block/blk-merge.c b/block/blk-merge.c
> > index 6460abdb2426..a3ffebb97a1d 100644
> > --- a/block/blk-merge.c
> > +++ b/block/blk-merge.c
> > @@ -158,6 +158,21 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
> > return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
> > }
> >
> > +static struct bio *bio_split_provision(struct bio *bio,
> > + const struct queue_limits *lim,
> > + unsigned int *nsegs, struct bio_set *bs)
> > +{
> > + *nsegs = 0;
> > +
> > + if (!lim->max_provision_sectors)
> > + return NULL;
> > +
> > + if (bio_sectors(bio) <= lim->max_provision_sectors)
> > + return NULL;
> > +
> > + return bio_split(bio, lim->max_provision_sectors, GFP_NOIO, bs);
> > +}
> > +
> > /*
> > * Return the maximum number of sectors from the start of a bio that may be
> > * submitted as a single request to a block device. If enough sectors remain,
> > @@ -366,6 +381,9 @@ struct bio *__bio_split_to_limits(struct bio *bio,
> > case REQ_OP_WRITE_ZEROES:
> > split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
> > break;
> > + case REQ_OP_PROVISION:
> > + split = bio_split_provision(bio, lim, nr_segs, bs);
> > + break;
> > default:
> > split = bio_split_rw(bio, lim, nr_segs, bs,
> > get_max_io_size(bio, lim) << SECTOR_SHIFT);
> > diff --git a/block/blk-settings.c b/block/blk-settings.c
> > index 896b4654ab00..d303e6614c36 100644
> > --- a/block/blk-settings.c
> > +++ b/block/blk-settings.c
> > @@ -59,6 +59,7 @@ void blk_set_default_limits(struct queue_limits *lim)
> > lim->zoned = BLK_ZONED_NONE;
> > lim->zone_write_granularity = 0;
> > lim->dma_alignment = 511;
> > + lim->max_provision_sectors = 0;
> > }
> >
> > /**
> > @@ -82,6 +83,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
> > lim->max_dev_sectors = UINT_MAX;
> > lim->max_write_zeroes_sectors = UINT_MAX;
> > lim->max_zone_append_sectors = UINT_MAX;
> > + lim->max_provision_sectors = UINT_MAX;
> > }
> > EXPORT_SYMBOL(blk_set_stacking_limits);
> >
> > @@ -208,6 +210,20 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
> > }
> > EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
> >
> > +/**
> > + * blk_queue_max_provision_sectors - set max sectors for a single provision
> > + *
> > + * @q: the request queue for the device
> > + * @max_provision_sectors: maximum number of sectors to provision per command
> > + **/
> > +
> > +void blk_queue_max_provision_sectors(struct request_queue *q,
> > + unsigned int max_provision_sectors)
> > +{
> > + q->limits.max_provision_sectors = max_provision_sectors;
> > +}
> > +EXPORT_SYMBOL(blk_queue_max_provision_sectors);
> > +
> > /**
> > * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
> > * @q: the request queue for the device
> > @@ -578,6 +594,9 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
> > t->max_segment_size = min_not_zero(t->max_segment_size,
> > b->max_segment_size);
> >
> > + t->max_provision_sectors = min_not_zero(t->max_provision_sectors,
> > + b->max_provision_sectors);
> > +
> > t->misaligned |= b->misaligned;
> >
> > alignment = queue_limit_alignment_offset(b, start);
> > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > index f1fce1c7fa44..0a3165211c66 100644
> > --- a/block/blk-sysfs.c
> > +++ b/block/blk-sysfs.c
> > @@ -213,6 +213,13 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
> > return queue_var_show(0, page);
> > }
> >
> > +static ssize_t queue_provision_max_show(struct request_queue *q,
> > + char *page)
> > +{
> > + return sprintf(page, "%llu\n",
> > + (unsigned long long)q->limits.max_provision_sectors << 9);
> > +}
> > +
> > static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
> > {
> > return queue_var_show(0, page);
> > @@ -604,6 +611,7 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
> > QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
> > QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
> >
> > +QUEUE_RO_ENTRY(queue_provision_max, "provision_max_bytes");
> > QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
> > QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
> > QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
> > @@ -661,6 +669,7 @@ static struct attribute *queue_attrs[] = {
> > &queue_discard_max_entry.attr,
> > &queue_discard_max_hw_entry.attr,
> > &queue_discard_zeroes_data_entry.attr,
> > + &queue_provision_max_entry.attr,
> > &queue_write_same_max_entry.attr,
> > &queue_write_zeroes_max_entry.attr,
> > &queue_zone_append_max_entry.attr,
> > diff --git a/block/bounce.c b/block/bounce.c
> > index 7cfcb242f9a1..ab9d8723ae64 100644
> > --- a/block/bounce.c
> > +++ b/block/bounce.c
> > @@ -176,6 +176,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
> > case REQ_OP_DISCARD:
> > case REQ_OP_SECURE_ERASE:
> > case REQ_OP_WRITE_ZEROES:
> > + case REQ_OP_PROVISION:
> > break;
> > default:
> > bio_for_each_segment(bv, bio_src, iter)
> > diff --git a/block/fops.c b/block/fops.c
> > index 4c70fdc546e7..be2e41f160bf 100644
> > --- a/block/fops.c
> > +++ b/block/fops.c
> > @@ -613,7 +613,8 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >
> > #define BLKDEV_FALLOC_FL_SUPPORTED \
> > (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
> > - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
> > + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE | \
> > + FALLOC_FL_UNSHARE_RANGE)
> >
> > static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> > loff_t len)
> > @@ -653,6 +654,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> > * de-allocate mode calls to fallocate().
> > */
> > switch (mode) {
> > + case 0:
> > + case FALLOC_FL_UNSHARE_RANGE:
> > + case FALLOC_FL_KEEP_SIZE:
> > + case FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE:
> > + error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> > + len >> SECTOR_SHIFT, GFP_KERNEL);
> > + break;
> > case FALLOC_FL_ZERO_RANGE:
> > case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
> > error = truncate_bdev_range(bdev, file->f_mode, start, end);
> > diff --git a/include/linux/bio.h b/include/linux/bio.h
> > index d766be7152e1..9820b3b039f2 100644
> > --- a/include/linux/bio.h
> > +++ b/include/linux/bio.h
> > @@ -57,7 +57,8 @@ static inline bool bio_has_data(struct bio *bio)
> > bio->bi_iter.bi_size &&
> > bio_op(bio) != REQ_OP_DISCARD &&
> > bio_op(bio) != REQ_OP_SECURE_ERASE &&
> > - bio_op(bio) != REQ_OP_WRITE_ZEROES)
> > + bio_op(bio) != REQ_OP_WRITE_ZEROES &&
> > + bio_op(bio) != REQ_OP_PROVISION)
> > return true;
> >
> > return false;
> > @@ -67,7 +68,8 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
> > {
> > return bio_op(bio) == REQ_OP_DISCARD ||
> > bio_op(bio) == REQ_OP_SECURE_ERASE ||
> > - bio_op(bio) == REQ_OP_WRITE_ZEROES;
> > + bio_op(bio) == REQ_OP_WRITE_ZEROES ||
> > + bio_op(bio) == REQ_OP_PROVISION;
> > }
> >
> > static inline void *bio_data(struct bio *bio)
> > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> > index 99be590f952f..27bdf88f541c 100644
> > --- a/include/linux/blk_types.h
> > +++ b/include/linux/blk_types.h
> > @@ -385,7 +385,10 @@ enum req_op {
> > REQ_OP_DRV_IN = (__force blk_opf_t)34,
> > REQ_OP_DRV_OUT = (__force blk_opf_t)35,
> >
> > - REQ_OP_LAST = (__force blk_opf_t)36,
> > + /* request device to provision block */
> > + REQ_OP_PROVISION = (__force blk_opf_t)37,
> > +
> > + REQ_OP_LAST = (__force blk_opf_t)38,
> > };
> >
> > enum req_flag_bits {
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 941304f17492..239e2f418b6e 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -303,6 +303,7 @@ struct queue_limits {
> > unsigned int discard_granularity;
> > unsigned int discard_alignment;
> > unsigned int zone_write_granularity;
> > + unsigned int max_provision_sectors;
> >
> > unsigned short max_segments;
> > unsigned short max_integrity_segments;
> > @@ -921,6 +922,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
> > unsigned int max_discard_sectors);
> > extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
> > unsigned int max_write_same_sectors);
> > +extern void blk_queue_max_provision_sectors(struct request_queue *q,
> > + unsigned int max_provision_sectors);
> > extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
> > extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
> > unsigned int max_zone_append_sectors);
> > @@ -1060,6 +1063,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> > int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
> > sector_t nr_sects, gfp_t gfp);
> >
> > +extern int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
> > + sector_t nr_sects, gfp_t gfp_mask);
> > +
> > #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
> > #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
> >
> > @@ -1139,6 +1145,11 @@ static inline unsigned short queue_max_discard_segments(const struct request_que
> > return q->limits.max_discard_segments;
> > }
> >
> > +static inline unsigned short queue_max_provision_sectors(const struct request_queue *q)
> > +{
> > + return q->limits.max_provision_sectors;
> > +}
> > +
> > static inline unsigned int queue_max_segment_size(const struct request_queue *q)
> > {
> > return q->limits.max_segment_size;
> > @@ -1281,6 +1292,11 @@ static inline bool bdev_nowait(struct block_device *bdev)
> > return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
> > }
> >
> > +static inline unsigned int bdev_max_provision_sectors(struct block_device *bdev)
> > +{
> > + return bdev_get_queue(bdev)->limits.max_provision_sectors;
> > +}
> > +
> > static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
> > {
> > return blk_queue_zoned_model(bdev_get_queue(bdev));
> > --
> > 2.40.1.521.gf1e218fcd8-goog
> >