Provides the operation of block devices of snapshot images. Read and
write operations are redirected to the regions of difference blocks for
block device (struct diff_area).
Signed-off-by: Sergei Shtepa <[email protected]>
---
drivers/block/blksnap/snapimage.c | 275 ++++++++++++++++++++++++++++++
drivers/block/blksnap/snapimage.h | 69 ++++++++
2 files changed, 344 insertions(+)
create mode 100644 drivers/block/blksnap/snapimage.c
create mode 100644 drivers/block/blksnap/snapimage.h
diff --git a/drivers/block/blksnap/snapimage.c b/drivers/block/blksnap/snapimage.c
new file mode 100644
index 000000000000..d7fd1a12e3a9
--- /dev/null
+++ b/drivers/block/blksnap/snapimage.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME "-snapimage: " fmt
+
+#include <linux/slab.h>
+#include <linux/cdrom.h>
+#include <linux/blk-mq.h>
+#include <uapi/linux/blksnap.h>
+#include "snapimage.h"
+#include "diff_area.h"
+#include "chunk.h"
+#include "cbt_map.h"
+
+#define NR_SNAPIMAGE_DEVT (1 << MINORBITS)
+
+static unsigned int _major;
+static DEFINE_IDA(snapimage_devt_ida);
+
+static int snapimage_kthread_worker_fn(void *param);
+
+static inline void snapimage_stop_worker(struct snapimage *snapimage)
+{
+ kthread_stop(snapimage->worker);
+ put_task_struct(snapimage->worker);
+}
+
+static inline int snapimage_start_worker(struct snapimage *snapimage)
+{
+ struct task_struct *task;
+
+ spin_lock_init(&snapimage->queue_lock);
+ bio_list_init(&snapimage->queue);
+
+ task = kthread_create(snapimage_kthread_worker_fn,
+ snapimage,
+ BLK_SNAP_IMAGE_NAME "%d",
+ MINOR(snapimage->image_dev_id));
+ if (IS_ERR(task))
+ return -ENOMEM;
+
+ snapimage->worker = get_task_struct(task);
+ set_user_nice(task, MAX_NICE);
+ task->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+ wake_up_process(task);
+
+ return 0;
+}
+
+static void snapimage_process_bio(struct snapimage *snapimage, struct bio *bio)
+{
+
+ struct diff_area_image_ctx io_ctx;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ sector_t pos = bio->bi_iter.bi_sector;
+
+ diff_area_throttling_io(snapimage->diff_area);
+ diff_area_image_ctx_init(&io_ctx, snapimage->diff_area,
+ op_is_write(bio_op(bio)));
+ bio_for_each_segment(bvec, bio, iter) {
+ blk_status_t st;
+
+ st = diff_area_image_io(&io_ctx, &bvec, &pos);
+ if (unlikely(st != BLK_STS_OK))
+ break;
+ }
+ diff_area_image_ctx_done(&io_ctx);
+ bio_endio(bio);
+}
+
+static inline struct bio *get_bio_from_queue(struct snapimage *snapimage)
+{
+ struct bio *bio;
+
+ spin_lock(&snapimage->queue_lock);
+ bio = bio_list_pop(&snapimage->queue);
+ spin_unlock(&snapimage->queue_lock);
+
+ return bio;
+}
+
+static int snapimage_kthread_worker_fn(void *param)
+{
+ struct snapimage *snapimage = param;
+ struct bio *bio;
+ int ret = 0;
+
+ while (!kthread_should_stop()) {
+ bio = get_bio_from_queue(snapimage);
+ if (!bio) {
+ schedule_timeout_interruptible(HZ / 100);
+ continue;
+ }
+
+ snapimage_process_bio(snapimage, bio);
+ }
+
+ while ((bio = get_bio_from_queue(snapimage)))
+ snapimage_process_bio(snapimage, bio);
+
+ return ret;
+}
+
+static void snapimage_submit_bio(struct bio *bio)
+{
+ struct snapimage *snapimage = bio->bi_bdev->bd_disk->private_data;
+ gfp_t gfp = GFP_NOIO;
+
+ if (bio->bi_opf & REQ_NOWAIT)
+ gfp |= GFP_NOWAIT;
+ if (snapimage->is_ready) {
+ spin_lock(&snapimage->queue_lock);
+ bio_list_add(&snapimage->queue, bio);
+ spin_unlock(&snapimage->queue_lock);
+
+ wake_up_process(snapimage->worker);
+ } else
+ bio_io_error(bio);
+}
+
+const struct block_device_operations bd_ops = {
+ .owner = THIS_MODULE,
+ .submit_bio = snapimage_submit_bio
+};
+
+void snapimage_free(struct snapimage *snapimage)
+{
+ pr_info("Snapshot image disk [%u:%u] delete\n",
+ MAJOR(snapimage->image_dev_id), MINOR(snapimage->image_dev_id));
+
+ blk_mq_freeze_queue(snapimage->disk->queue);
+ snapimage->is_ready = false;
+ blk_mq_unfreeze_queue(snapimage->disk->queue);
+
+ snapimage_stop_worker(snapimage);
+
+ del_gendisk(snapimage->disk);
+ put_disk(snapimage->disk);
+
+ diff_area_put(snapimage->diff_area);
+ cbt_map_put(snapimage->cbt_map);
+
+ ida_free(&snapimage_devt_ida, MINOR(snapimage->image_dev_id));
+ kfree(snapimage);
+}
+
+struct snapimage *snapimage_create(struct diff_area *diff_area,
+ struct cbt_map *cbt_map)
+{
+ int ret = 0;
+ int minor;
+ struct snapimage *snapimage = NULL;
+ struct gendisk *disk;
+
+ snapimage = kzalloc(sizeof(struct snapimage), GFP_KERNEL);
+ if (snapimage == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ minor = ida_alloc_range(&snapimage_devt_ida, 0, NR_SNAPIMAGE_DEVT - 1,
+ GFP_KERNEL);
+ if (minor < 0) {
+ ret = minor;
+ pr_err("Failed to allocate minor for snapshot image device. errno=%d\n",
+ abs(ret));
+ goto fail_free_image;
+ }
+
+ snapimage->is_ready = true;
+ snapimage->capacity = cbt_map->device_capacity;
+ snapimage->image_dev_id = MKDEV(_major, minor);
+ pr_info("Create snapshot image device [%u:%u] for original device [%u:%u]\n",
+ MAJOR(snapimage->image_dev_id),
+ MINOR(snapimage->image_dev_id),
+ MAJOR(diff_area->orig_bdev->bd_dev),
+ MINOR(diff_area->orig_bdev->bd_dev));
+
+ ret = snapimage_start_worker(snapimage);
+ if (ret) {
+ pr_err("Failed to start worker thread. errno=%d\n", abs(ret));
+ goto fail_free_minor;
+ }
+
+ disk = blk_alloc_disk(NUMA_NO_NODE);
+ if (!disk) {
+ pr_err("Failed to allocate disk\n");
+ ret = -ENOMEM;
+ goto fail_free_worker;
+ }
+ snapimage->disk = disk;
+
+ blk_queue_max_hw_sectors(disk->queue, BLK_DEF_MAX_SECTORS);
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue);
+
+ if (snprintf(disk->disk_name, DISK_NAME_LEN, "%s%d",
+ BLK_SNAP_IMAGE_NAME, minor) < 0) {
+ pr_err("Unable to set disk name for snapshot image device: invalid minor %u\n",
+ minor);
+ ret = -EINVAL;
+ goto fail_cleanup_disk;
+ }
+ pr_debug("Snapshot image disk name [%s]\n", disk->disk_name);
+
+ disk->flags = 0;
+#ifdef GENHD_FL_NO_PART_SCAN
+ disk->flags |= GENHD_FL_NO_PART_SCAN;
+#else
+ disk->flags |= GENHD_FL_NO_PART;
+#endif
+ disk->major = _major;
+ disk->first_minor = minor;
+ disk->minors = 1; /* One disk has only one partition */
+
+ disk->fops = &bd_ops;
+ disk->private_data = snapimage;
+
+ set_capacity(disk, snapimage->capacity);
+ pr_debug("Snapshot image device capacity %lld bytes\n",
+ (u64)(snapimage->capacity << SECTOR_SHIFT));
+
+ diff_area_get(diff_area);
+ snapimage->diff_area = diff_area;
+ cbt_map_get(cbt_map);
+ snapimage->cbt_map = cbt_map;
+
+ pr_debug("Add device [%d:%d]",
+ MAJOR(snapimage->image_dev_id), MINOR(snapimage->image_dev_id));
+ ret = add_disk(disk);
+ if (ret) {
+ pr_err("Failed to add disk [%s] for snapshot image device\n",
+ disk->disk_name);
+ goto fail_cleanup_disk;
+ }
+
+ wake_up_process(snapimage->worker);
+
+ return snapimage;
+
+fail_cleanup_disk:
+ put_disk(disk);
+fail_free_worker:
+ snapimage_stop_worker(snapimage);
+fail_free_minor:
+ ida_free(&snapimage_devt_ida, minor);
+fail_free_image:
+ kfree(snapimage);
+
+ return ERR_PTR(ret);
+}
+
+int snapimage_init(void)
+{
+ int ret = 0;
+
+ ret = register_blkdev(0, BLK_SNAP_IMAGE_NAME);
+ if (ret < 0) {
+ pr_err("Failed to register snapshot image block device\n");
+ return ret;
+ }
+
+ _major = ret;
+ pr_info("Snapshot image block device major %d was registered\n",
+ _major);
+
+ return 0;
+}
+
+void snapimage_done(void)
+{
+ unregister_blkdev(_major, BLK_SNAP_IMAGE_NAME);
+ pr_info("Snapshot image block device [%d] was unregistered\n", _major);
+}
+
+int snapimage_major(void)
+{
+ return _major;
+}
diff --git a/drivers/block/blksnap/snapimage.h b/drivers/block/blksnap/snapimage.h
new file mode 100644
index 000000000000..91e7bc004ee2
--- /dev/null
+++ b/drivers/block/blksnap/snapimage.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BLK_SNAP_SNAPIMAGE_H
+#define __BLK_SNAP_SNAPIMAGE_H
+
+#include <linux/blk_types.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/kthread.h>
+
+struct diff_area;
+struct cbt_map;
+
+/**
+ * struct snapimage - Snapshot image block device.
+ *
+ * @image_dev_id:
+ * ID of the snapshot image block device.
+ * @capacity:
+ * The size of the snapshot image in sectors must be equal to the size
+ * of the original device at the time of taking the snapshot.
+ * @is_ready:
+ * The flag means that the snapshot image is ready for processing
+ * I/O units.
+ * @worker:
+ * A pointer to the &struct task of the worker thread that process I/O
+ * units.
+ * @queue_lock:
+ * Lock for &queue.
+ * @queue:
+ * A queue of I/O units waiting to be processed.
+ * @disk:
+ * A pointer to the &struct gendisk for the image block device.
+ * @diff_area:
+ * A pointer to the owned &struct diff_area.
+ * @cbt_map:
+ * A pointer to the owned &struct cbt_map.
+ *
+ * The snapshot image is presented in the system as a block device. But
+ * when reading or writing a snapshot image, the data is redirected to
+ * the original block device or to the block device of the difference storage.
+ *
+ * The module does not prohibit reading and writing data to the snapshot
+ * from different threads in parallel. To avoid the problem with simultaneous
+ * access, it is enough to open the snapshot image block device with the
+ * FMODE_EXCL parameter.
+ */
+struct snapimage {
+ dev_t image_dev_id;
+ sector_t capacity;
+ bool is_ready;
+
+ struct task_struct *worker;
+ spinlock_t queue_lock;
+ struct bio_list queue;
+
+ struct gendisk *disk;
+
+ struct diff_area *diff_area;
+ struct cbt_map *cbt_map;
+};
+
+int snapimage_init(void);
+void snapimage_done(void);
+int snapimage_major(void);
+
+void snapimage_free(struct snapimage *snapimage);
+struct snapimage *snapimage_create(struct diff_area *diff_area,
+ struct cbt_map *cbt_map);
+#endif /* __BLK_SNAP_SNAPIMAGE_H */
--
2.20.1
On Fri, Dec 09, 2022 at 03:23:27PM +0100, Sergei Shtepa wrote:
> +static int snapimage_kthread_worker_fn(void *param);
Any chance to order th code so that you can avoid forward declarations
wherever possible.
> +static inline int snapimage_start_worker(struct snapimage *snapimage)
> +{
> + struct task_struct *task;
> +
> + spin_lock_init(&snapimage->queue_lock);
> + bio_list_init(&snapimage->queue);
> +
> + task = kthread_create(snapimage_kthread_worker_fn,
> + snapimage,
> + BLK_SNAP_IMAGE_NAME "%d",
> + MINOR(snapimage->image_dev_id));
> + if (IS_ERR(task))
> + return -ENOMEM;
> +
> + snapimage->worker = get_task_struct(task);
There's not need to grab and put an extra reference to the task_struct,
the kthread code already holds one.
> + while (!kthread_should_stop()) {
> + bio = get_bio_from_queue(snapimage);
> + if (!bio) {
> + schedule_timeout_interruptible(HZ / 100);
Do you need a timeout sleep here? All new arrivals should properly
wake the thread.
> + continue;
> + }
> +
> + snapimage_process_bio(snapimage, bio);
> + }
> +
> + while ((bio = get_bio_from_queue(snapimage)))
> + snapimage_process_bio(snapimage, bio);
It seems like the loop could be simplified to:
for (;; {
while ((bio = get_bio_from_queue(snapimage)))
snapimage_process_bio(snapimage, bio);
if (kthread_should_stop())
break;
schedule();
}
return 0;
> +static void snapimage_submit_bio(struct bio *bio)
> +{
> + struct snapimage *snapimage = bio->bi_bdev->bd_disk->private_data;
> + gfp_t gfp = GFP_NOIO;
> +
> + if (bio->bi_opf & REQ_NOWAIT)
> + gfp |= GFP_NOWAIT;
gfp isn't actually used anywhere.
> + if (snapimage->is_ready) {
> + spin_lock(&snapimage->queue_lock);
> + bio_list_add(&snapimage->queue, bio);
> + spin_unlock(&snapimage->queue_lock);
> +
> + wake_up_process(snapimage->worker);
> + } else
> + bio_io_error(bio);
> +}
I think you can do away with the is_ready flag entirely by just
calling del_gendisk early enough. See below.
> +void snapimage_free(struct snapimage *snapimage)
> +{
> + pr_info("Snapshot image disk [%u:%u] delete\n",
> + MAJOR(snapimage->image_dev_id), MINOR(snapimage->image_dev_id));
> +
> + blk_mq_freeze_queue(snapimage->disk->queue);
> + snapimage->is_ready = false;
> + blk_mq_unfreeze_queue(snapimage->disk->queue);
> +
> + snapimage_stop_worker(snapimage);
> +
> + del_gendisk(snapimage->disk);
Just move the snapimage_stop_worker after del_gendisk here, and
you should be fine.
> + put_disk(snapimage->disk);
> +
> + diff_area_put(snapimage->diff_area);
> + cbt_map_put(snapimage->cbt_map);
> +
> + ida_free(&snapimage_devt_ida, MINOR(snapimage->image_dev_id));
> + kfree(snapimage);
.. and then implement the free_disk block_device operation, and
move all code after the put_disk into it, which means all your
data structures are alive until the very last gendisk reference goes
away.
> +#ifdef GENHD_FL_NO_PART_SCAN
> + disk->flags |= GENHD_FL_NO_PART_SCAN;
> +#else
> + disk->flags |= GENHD_FL_NO_PART;
> +#endif
GENHD_FL_NO_PART_SCAN is gone, so this ifdef goes away.
> + disk->major = _major;
> + disk->first_minor = minor;
> + disk->minors = 1; /* One disk has only one partition */
No new driver should manage the major/minor numbers manually, as
the block layer just assigns a dev_t by default if you don't se these.
> + wake_up_process(snapimage->worker);
I don't think this wake_up is needed - the first queue bio should
wake the helper thread up.
> +int snapimage_init(void)
> +{
> + int ret = 0;
> +
> + ret = register_blkdev(0, BLK_SNAP_IMAGE_NAME);
> + if (ret < 0) {
> + pr_err("Failed to register snapshot image block device\n");
> + return ret;
> + }
> +
> + _major = ret;
> + pr_info("Snapshot image block device major %d was registered\n",
> + _major);
> +
> + return 0;
> +}
> +
> +void snapimage_done(void)
> +{
> + unregister_blkdev(_major, BLK_SNAP_IMAGE_NAME);
> + pr_info("Snapshot image block device [%d] was unregistered\n", _major);
> +}
And with block layer managed dev_t all this boilerplate can go away
as well.
> +#include <linux/blk_types.h>
> +#include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
If you already include blk-mq.h, there is no need for the other two
headers as they are already implictly included.
On 1/1/23 08:18, Hillf Danton wrote:
> Subject:
> Re: [PATCH v2 17/21] block, blksnap: snapshot image block device
> From:
> Hillf Danton <[email protected]>
> Date:
> 1/1/23, 08:18
>
> To:
> Sergei Shtepa <[email protected]>
> CC:
> "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>, "[email protected]" <[email protected]>
>
>
> This is the first time you've received an email from this sender
> [email protected], please exercise caution when clicking on links or opening
> attachments.
>
>
> On 9 Dec 2022 15:23:27 +0100 Sergei Shtepa
> > Provides the operation of block devices of snapshot images. Read and
> > write operations are redirected to the regions of difference blocks for
> > block device (struct diff_area).
> >
> > Signed-off-by: Sergei Shtepa
> > ---
>
> Thanks for your patchset.
Thanks for the review.
>
> > +static int snapimage_kthread_worker_fn(void *param)
> > +{
> > + struct snapimage *snapimage = param;
> > + struct bio *bio;
> > + int ret = 0;
> > +
> > + while (!kthread_should_stop()) {
> > + bio = get_bio_from_queue(snapimage);
> > + if (!bio) {
> > + schedule_timeout_interruptible(HZ / 100);
> > + continue;
> > + }
>
> Given the wake_up_process() below, s$HZ / 100$HZ * 1000$ to avoid
> unnecessary wakeups [1].
>
> And because of no signal handling added, use schedule_timeout_idle() instead.
Yes, this function will be rewritten to eliminate unnecessary wake-ups.
The code that Christoph proposed in the letter at Dec 15 should work well.
>
> [1]
> https://nam10.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Flkml%2F20210419085027.761150-2-elver%40google.com%2F&data=05%7C01%7Csergei.shtepa%40veeam.com%7C765d591d47ad4d93857208daebc92508%7Cba07baab431b49edadd7cbc3542f5140%7C1%7C1%7C638081546440884849%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=rbINOq7m31DFhjHLnQKFHQuoWB64ZhmnWbAVLuNwv7w%3D&reserved=0
>
> > +
> > + snapimage_process_bio(snapimage, bio);
> > + }
> > +
> > + while ((bio = get_bio_from_queue(snapimage)))
> > + snapimage_process_bio(snapimage, bio);
> > +
> > + return ret;
> > +}
> > +
> > +static void snapimage_submit_bio(struct bio *bio)
> > +{
> > + struct snapimage *snapimage = bio->bi_bdev->bd_disk->private_data;
> > + gfp_t gfp = GFP_NOIO;
> > +
> > + if (bio->bi_opf & REQ_NOWAIT)
> > + gfp |= GFP_NOWAIT;
> > + if (snapimage->is_ready) {
> > + spin_lock(&snapimage->queue_lock);
> > + bio_list_add(&snapimage->queue, bio);
> > + spin_unlock(&snapimage->queue_lock);
> > +
> > + wake_up_process(snapimage->worker);
> > + } else
> > + bio_io_error(bio);
> > +}
>