Allows to attach block device filters to the block devices. Kernel
modules can use this functionality to extend the capabilities of the
block layer.
Signed-off-by: Sergei Shtepa <[email protected]>
---
block/bdev.c | 70 ++++++++++++++++++++++++++++++++++++++
block/blk-core.c | 19 +++++++++--
include/linux/blk_types.h | 2 ++
include/linux/blkdev.h | 71 +++++++++++++++++++++++++++++++++++++++
4 files changed, 160 insertions(+), 2 deletions(-)
diff --git a/block/bdev.c b/block/bdev.c
index d699ecdb3260..b820178824b2 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -427,6 +427,7 @@ static void init_once(void *data)
static void bdev_evict_inode(struct inode *inode)
{
+ bdev_filter_detach(I_BDEV(inode));
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
@@ -502,6 +503,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
return NULL;
}
bdev->bd_disk = disk;
+ bdev->bd_filter = NULL;
return bdev;
}
@@ -1092,3 +1094,71 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
blkdev_put_no_open(bdev);
}
+
+/**
+ * bdev_filter_attach - Attach the filter to the original block device.
+ * @bdev:
+ * Block device.
+ * @flt:
+ * Filter that needs to be attached to the block device.
+ *
+ * Before adding a filter, it is necessary to initialize &struct bdev_filter
+ * using a bdev_filter_init() function.
+ *
+ * The bdev_filter_detach() function allows to detach the filter from the block
+ * device.
+ *
+ * Return: 0 if succeeded, or -EALREADY if the filter already exists.
+ */
+int bdev_filter_attach(struct block_device *bdev,
+ struct bdev_filter *flt)
+{
+ int ret = 0;
+
+ blk_mq_freeze_queue(bdev->bd_queue);
+ blk_mq_quiesce_queue(bdev->bd_queue);
+
+ if (bdev->bd_filter)
+ ret = -EALREADY;
+ else
+ bdev->bd_filter = flt;
+
+ blk_mq_unquiesce_queue(bdev->bd_queue);
+ blk_mq_unfreeze_queue(bdev->bd_queue);
+
+ return ret;
+}
+EXPORT_SYMBOL(bdev_filter_attach);
+
+/**
+ * bdev_filter_detach - Detach the filter from the block device.
+ * @bdev:
+ * Block device.
+ *
+ * The filter should be added using the bdev_filter_attach() function.
+ *
+ * Return: 0 if succeeded, or -ENOENT if the filter was not found.
+ */
+int bdev_filter_detach(struct block_device *bdev)
+{
+ int ret = 0;
+ struct bdev_filter *flt = NULL;
+
+ blk_mq_freeze_queue(bdev->bd_queue);
+ blk_mq_quiesce_queue(bdev->bd_queue);
+
+ flt = bdev->bd_filter;
+ if (flt)
+ bdev->bd_filter = NULL;
+ else
+ ret = -ENOENT;
+
+ blk_mq_unquiesce_queue(bdev->bd_queue);
+ blk_mq_unfreeze_queue(bdev->bd_queue);
+
+ if (flt)
+ bdev_filter_put(flt);
+
+ return ret;
+}
+EXPORT_SYMBOL(bdev_filter_detach);
diff --git a/block/blk-core.c b/block/blk-core.c
index 5487912befe8..284b295a7b23 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -678,9 +678,24 @@ void submit_bio_noacct_nocheck(struct bio *bio)
* to collect a list of requests submited by a ->submit_bio method while
* it is active, and then process them after it returned.
*/
- if (current->bio_list)
+ if (current->bio_list) {
bio_list_add(¤t->bio_list[0], bio);
- else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
+ return;
+ }
+
+ if (bio->bi_bdev->bd_filter && !bio_flagged(bio, BIO_FILTERED)) {
+ bool pass;
+
+ pass = bio->bi_bdev->bd_filter->fops->submit_bio_cb(bio);
+ bio_set_flag(bio, BIO_FILTERED);
+ if (!pass) {
+ bio->bi_status = BLK_STS_OK;
+ bio_endio(bio);
+ return;
+ }
+ }
+
+ if (!bio->bi_bdev->bd_disk->fops->submit_bio)
__submit_bio_noacct_mq(bio);
else
__submit_bio_noacct(bio);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e0b098089ef2..3b58c69cbf9d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -68,6 +68,7 @@ struct block_device {
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool bd_make_it_fail;
#endif
+ struct bdev_filter *bd_filter;
} __randomize_layout;
#define bdev_whole(_bdev) \
@@ -333,6 +334,7 @@ enum {
BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_FILTERED, /* bio has already been filtered */
BIO_FLAG_LAST
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 891f8cbcd043..dc2da4c7ab39 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1549,4 +1549,75 @@ struct io_comp_batch {
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
+/**
+ * struct bdev_filter_operations - Callback functions for the filter.
+ *
+ * @submit_bio_cb:
+ * A callback function for I/O unit handling.
+ * @release_cb:
+ * A callback function to disable the filter when removing a block
+ * device from the system.
+ */
+struct bdev_filter_operations {
+ bool (*submit_bio_cb)(struct bio *bio);
+ void (*release_cb)(struct kref *kref);
+};
+
+/**
+ * struct bdev_filter - Block device filter.
+ *
+ * @kref:
+ * Kernel reference counter.
+ * @fops:
+ * The pointer to &struct bdev_filter_operations with callback
+ * functions for the filter.
+ */
+struct bdev_filter {
+ struct kref kref;
+ const struct bdev_filter_operations *fops;
+};
+
+/**
+ * bdev_filter_init - Initialization of the filter structure.
+ * @flt:
+ * Pointer to the &struct bdev_filter to be initialized.
+ * @fops:
+ * The callback functions for the filter.
+ */
+static inline void bdev_filter_init(struct bdev_filter *flt,
+ const struct bdev_filter_operations *fops)
+{
+ kref_init(&flt->kref);
+ flt->fops = fops;
+};
+
+/**
+ * bdev_filter_get - Increment reference counter.
+ * @flt:
+ * Pointer to the &struct bdev_filter.
+ *
+ * Allows to ensure that the filter will not be released as long as there are
+ * references to it.
+ */
+static inline void bdev_filter_get(struct bdev_filter *flt)
+{
+ kref_get(&flt->kref);
+}
+
+/**
+ * bdev_filter_put - Decrement reference counter.
+ * @flt:
+ * Pointer to the &struct bdev_filter.
+ *
+ * Decrement the reference counter, and if 0, release filter.
+ */
+static inline void bdev_filter_put(struct bdev_filter *flt)
+{
+ kref_put(&flt->kref, flt->fops->release_cb);
+};
+
+int bdev_filter_attach(struct block_device *bdev, struct bdev_filter *flt);
+int bdev_filter_detach(struct block_device *bdev);
+
+
#endif /* _LINUX_BLKDEV_H */
--
2.20.1
On Fri, Dec 09, 2022 at 03:23:12PM +0100, Sergei Shtepa wrote:
> + blk_mq_freeze_queue(bdev->bd_queue);
> + blk_mq_quiesce_queue(bdev->bd_queue);
I don't think we need the quiesce here (or in the detach path).
quiesce works as the low-level blk-mq dispatch level, and we
sit way above that.
> +EXPORT_SYMBOL(bdev_filter_attach);
Please use EXPORT_SYMBOL_GPL for new low-level block layer features.
> +int bdev_filter_detach(struct block_device *bdev)
> +{
> + int ret = 0;
> + struct bdev_filter *flt = NULL;
> +
> + blk_mq_freeze_queue(bdev->bd_queue);
> + blk_mq_quiesce_queue(bdev->bd_queue);
> +
> + flt = bdev->bd_filter;
> + if (flt)
> + bdev->bd_filter = NULL;
> + else
> + ret = -ENOENT;
Not having a filter is a grave error that the caller can't do
anything about. So pleas just WARN_ON_ONCE for that case, and
change the function to a void return.
> + if (bio->bi_bdev->bd_filter && !bio_flagged(bio, BIO_FILTERED)) {
> + bool pass;
> +
> + pass = bio->bi_bdev->bd_filter->fops->submit_bio_cb(bio);
> + bio_set_flag(bio, BIO_FILTERED);
> + if (!pass) {
> + bio->bi_status = BLK_STS_OK;
> + bio_endio(bio);
> + return;
> + }
Instead of ending the bio here for the !pass case it seems to me it
would make more sense to just pass ownership to the filter driver and
let the driver complete it. That would allow error returns for that
case, or handling it from a workqueue. I'd also change the polarity
so that true means "I've taken ownership". I.e.:
if (bio->bi_bdev->bd_filter && !bio_flagged(bio, BIO_FILTERED)) {
bio_set_flag(bio, BIO_FILTERED);
if (bio->bi_bdev->bd_filter->fops->submit_bio_cb(bio))
return;
}
> +struct bdev_filter_operations {
> + bool (*submit_bio_cb)(struct bio *bio);
> + void (*release_cb)(struct kref *kref);
> +};
Nit: I don't think these _cb postfixes are very useful. All methods
in a method table are sort of callbacks.
> +/**
> + * bdev_filter_get - Increment reference counter.
> + * @flt:
> + * Pointer to the &struct bdev_filter.
> + *
> + * Allows to ensure that the filter will not be released as long as there are
> + * references to it.
> + */
> +static inline void bdev_filter_get(struct bdev_filter *flt)
> +{
> + kref_get(&flt->kref);
> +}
Looking at the callers in blksnap I'm not sure this works. The
pattern seems to be the driver has a block device reference, and
then uses bdev_filter_get to grab a filter reference. But what
prevents the filter from going away just bdev_filter_get is called?
At a minimum we'd need a something:
static inline struct bdev_filter *bdev_filter_get(struct block_device *bdev)
{
struct bdev_filter *flt;
rcu_read_lock();
flt = rcu_dereference(bdev->bd_filter);
if (!refcount_inc_not_zero(&flt->refs))
flt = NULL;
rcu_read_unlock();
return flt;
}
with bd_filter switched to __rcu annotation, the kref replaced with
a refcount_t, updates switched to cmpxchg and a rcu_synchronize()
after clearing bd_filter.
I am very glad to see your comments, Christoph.
Thank you so much for the review.
I have read all the comments and agree with them.
Now I see new ways to make the code of the filter and the blksnap module better.
There seems to be a lot of work to be done. :)
On Thu, Dec 15, 2022 at 11:46:35AM +0100, Sergei Shtepa wrote:
> I am very glad to see your comments, Christoph.
> Thank you so much for the review.
>
> I have read all the comments and agree with them.
> Now I see new ways to make the code of the filter and the blksnap module better.
Sorry for being so late, but I was stuck onder a pile of work last
time you posted it. But compared to the versions before I think we've
made a lot of progress and we'll get there. Also feel free to ask me
for input on changes before the sending the whole series if you have any
questions.
On Fri, Dec 09 2022 at 9:23P -0500,
Sergei Shtepa <[email protected]> wrote:
> Allows to attach block device filters to the block devices. Kernel
> modules can use this functionality to extend the capabilities of the
> block layer.
>
> Signed-off-by: Sergei Shtepa <[email protected]>
> ---
> block/bdev.c | 70 ++++++++++++++++++++++++++++++++++++++
> block/blk-core.c | 19 +++++++++--
> include/linux/blk_types.h | 2 ++
> include/linux/blkdev.h | 71 +++++++++++++++++++++++++++++++++++++++
> 4 files changed, 160 insertions(+), 2 deletions(-)
>
> diff --git a/block/bdev.c b/block/bdev.c
> index d699ecdb3260..b820178824b2 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -427,6 +427,7 @@ static void init_once(void *data)
>
> static void bdev_evict_inode(struct inode *inode)
> {
> + bdev_filter_detach(I_BDEV(inode));
> truncate_inode_pages_final(&inode->i_data);
> invalidate_inode_buffers(inode); /* is it needed here? */
> clear_inode(inode);
> @@ -502,6 +503,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
> return NULL;
> }
> bdev->bd_disk = disk;
> + bdev->bd_filter = NULL;
> return bdev;
> }
>
> @@ -1092,3 +1094,71 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
>
> blkdev_put_no_open(bdev);
> }
> +
> +/**
> + * bdev_filter_attach - Attach the filter to the original block device.
> + * @bdev:
> + * Block device.
> + * @flt:
> + * Filter that needs to be attached to the block device.
> + *
> + * Before adding a filter, it is necessary to initialize &struct bdev_filter
> + * using a bdev_filter_init() function.
> + *
> + * The bdev_filter_detach() function allows to detach the filter from the block
> + * device.
> + *
> + * Return: 0 if succeeded, or -EALREADY if the filter already exists.
> + */
> +int bdev_filter_attach(struct block_device *bdev,
> + struct bdev_filter *flt)
> +{
> + int ret = 0;
> +
> + blk_mq_freeze_queue(bdev->bd_queue);
> + blk_mq_quiesce_queue(bdev->bd_queue);
> +
> + if (bdev->bd_filter)
> + ret = -EALREADY;
> + else
> + bdev->bd_filter = flt;
> +
> + blk_mq_unquiesce_queue(bdev->bd_queue);
> + blk_mq_unfreeze_queue(bdev->bd_queue);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(bdev_filter_attach);
> +
> +/**
> + * bdev_filter_detach - Detach the filter from the block device.
> + * @bdev:
> + * Block device.
> + *
> + * The filter should be added using the bdev_filter_attach() function.
> + *
> + * Return: 0 if succeeded, or -ENOENT if the filter was not found.
> + */
> +int bdev_filter_detach(struct block_device *bdev)
> +{
> + int ret = 0;
> + struct bdev_filter *flt = NULL;
> +
> + blk_mq_freeze_queue(bdev->bd_queue);
> + blk_mq_quiesce_queue(bdev->bd_queue);
> +
> + flt = bdev->bd_filter;
> + if (flt)
> + bdev->bd_filter = NULL;
> + else
> + ret = -ENOENT;
> +
> + blk_mq_unquiesce_queue(bdev->bd_queue);
> + blk_mq_unfreeze_queue(bdev->bd_queue);
> +
> + if (flt)
> + bdev_filter_put(flt);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(bdev_filter_detach);
What about bio-based devices? (DM, MD, etc)
DM uses freeze_bdev() and thaw_bdev(), seems like you're missing some
work here.
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 5487912befe8..284b295a7b23 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -678,9 +678,24 @@ void submit_bio_noacct_nocheck(struct bio *bio)
> * to collect a list of requests submited by a ->submit_bio method while
> * it is active, and then process them after it returned.
> */
> - if (current->bio_list)
> + if (current->bio_list) {
> bio_list_add(¤t->bio_list[0], bio);
> - else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
> + return;
> + }
> +
> + if (bio->bi_bdev->bd_filter && !bio_flagged(bio, BIO_FILTERED)) {
Shouldn't this be: if (unlikely(...))?
But that obviously assumes a fair amount about the only consumer
(temporary filter that lasts as long as it takes to do a backup).
> + bool pass;
> +
> + pass = bio->bi_bdev->bd_filter->fops->submit_bio_cb(bio);
> + bio_set_flag(bio, BIO_FILTERED);
> + if (!pass) {
> + bio->bi_status = BLK_STS_OK;
> + bio_endio(bio);
> + return;
> + }
> + }
> +
> + if (!bio->bi_bdev->bd_disk->fops->submit_bio)
> __submit_bio_noacct_mq(bio);
> else
> __submit_bio_noacct(bio);
And you currently don't allow for blkfilter to be involved if a bio
recurses (which is how bio splitting works now). Not sure it
matters, just mentioning it...
But taking a step back, in the hopes of stepping out of your way:
Myself and others on the DM team (past and present) have always hoped
all block devices could have the flexibility of DM. It was that hope
that caused my frustration when I first saw your blkfilter approach.
But I was too idealistic that a byproduct of your efforts
(blk-interposer before and blkfilter now) would usher in _all_ block
devices being able to comprehensively change their identity (and IO
processing) like DM enjoys.
DM showcases all the extra code needed to achieve its extreme IO
remapping and stacking flexibilty -- I don't yet see a way to distill
the essence of what DM achieves without imposing too much on all block
core.
So I do think blkfilter is a pragmatic way to achieve your goals.
Mike
Il 01/02/2023 00:58, Mike Snitzer ha scritto:
> On Fri, Dec 09 2022 at 9:23P -0500,
> Sergei Shtepa <[email protected]> wrote:
>
>> Allows to attach block device filters to the block devices. Kernel
>> modules can use this functionality to extend the capabilities of the
>> block layer.
>>
>> Signed-off-by: Sergei Shtepa <[email protected]>
>> ---
>> block/bdev.c | 70 ++++++++++++++++++++++++++++++++++++++
>> block/blk-core.c | 19 +++++++++--
>> include/linux/blk_types.h | 2 ++
>> include/linux/blkdev.h | 71 +++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 160 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/bdev.c b/block/bdev.c
>> index d699ecdb3260..b820178824b2 100644
>> --- a/block/bdev.c
>> +++ b/block/bdev.c
>> @@ -427,6 +427,7 @@ static void init_once(void *data)
>>
>> static void bdev_evict_inode(struct inode *inode)
>> {
>> + bdev_filter_detach(I_BDEV(inode));
>> truncate_inode_pages_final(&inode->i_data);
>> invalidate_inode_buffers(inode); /* is it needed here? */
>> clear_inode(inode);
>> @@ -502,6 +503,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
>> return NULL;
>> }
>> bdev->bd_disk = disk;
>> + bdev->bd_filter = NULL;
>> return bdev;
>> }
>>
>> @@ -1092,3 +1094,71 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
>>
>> blkdev_put_no_open(bdev);
>> }
>> +
>> +/**
>> + * bdev_filter_attach - Attach the filter to the original block device.
>> + * @bdev:
>> + * Block device.
>> + * @flt:
>> + * Filter that needs to be attached to the block device.
>> + *
>> + * Before adding a filter, it is necessary to initialize &struct bdev_filter
>> + * using a bdev_filter_init() function.
>> + *
>> + * The bdev_filter_detach() function allows to detach the filter from the block
>> + * device.
>> + *
>> + * Return: 0 if succeeded, or -EALREADY if the filter already exists.
>> + */
>> +int bdev_filter_attach(struct block_device *bdev,
>> + struct bdev_filter *flt)
>> +{
>> + int ret = 0;
>> +
>> + blk_mq_freeze_queue(bdev->bd_queue);
>> + blk_mq_quiesce_queue(bdev->bd_queue);
>> +
>> + if (bdev->bd_filter)
>> + ret = -EALREADY;
>> + else
>> + bdev->bd_filter = flt;
>> +
>> + blk_mq_unquiesce_queue(bdev->bd_queue);
>> + blk_mq_unfreeze_queue(bdev->bd_queue);
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(bdev_filter_attach);
>> +
>> +/**
>> + * bdev_filter_detach - Detach the filter from the block device.
>> + * @bdev:
>> + * Block device.
>> + *
>> + * The filter should be added using the bdev_filter_attach() function.
>> + *
>> + * Return: 0 if succeeded, or -ENOENT if the filter was not found.
>> + */
>> +int bdev_filter_detach(struct block_device *bdev)
>> +{
>> + int ret = 0;
>> + struct bdev_filter *flt = NULL;
>> +
>> + blk_mq_freeze_queue(bdev->bd_queue);
>> + blk_mq_quiesce_queue(bdev->bd_queue);
>> +
>> + flt = bdev->bd_filter;
>> + if (flt)
>> + bdev->bd_filter = NULL;
>> + else
>> + ret = -ENOENT;
>> +
>> + blk_mq_unquiesce_queue(bdev->bd_queue);
>> + blk_mq_unfreeze_queue(bdev->bd_queue);
>> +
>> + if (flt)
>> + bdev_filter_put(flt);
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(bdev_filter_detach);
> What about bio-based devices? (DM, MD, etc)
>
> DM uses freeze_bdev() and thaw_bdev(), seems like you're missing some
> work here.
>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 5487912befe8..284b295a7b23 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -678,9 +678,24 @@ void submit_bio_noacct_nocheck(struct bio *bio)
>> * to collect a list of requests submited by a ->submit_bio method while
>> * it is active, and then process them after it returned.
>> */
>> - if (current->bio_list)
>> + if (current->bio_list) {
>> bio_list_add(¤t->bio_list[0], bio);
>> - else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
>> + return;
>> + }
>> +
>> + if (bio->bi_bdev->bd_filter && !bio_flagged(bio, BIO_FILTERED)) {
> Shouldn't this be: if (unlikely(...))?
>
> But that obviously assumes a fair amount about the only consumer
> (temporary filter that lasts as long as it takes to do a backup).
>
>> + bool pass;
>> +
>> + pass = bio->bi_bdev->bd_filter->fops->submit_bio_cb(bio);
>> + bio_set_flag(bio, BIO_FILTERED);
>> + if (!pass) {
>> + bio->bi_status = BLK_STS_OK;
>> + bio_endio(bio);
>> + return;
>> + }
>> + }
>> +
>> + if (!bio->bi_bdev->bd_disk->fops->submit_bio)
>> __submit_bio_noacct_mq(bio);
>> else
>> __submit_bio_noacct(bio);
> And you currently don't allow for blkfilter to be involved if a bio
> recurses (which is how bio splitting works now). Not sure it
> matters, just mentioning it...
Hi, thanks for review this project that I think is very useful and I
hope it will be able to reach a very good quality, that will be
integrated into the kernel and that more backup solutions will use it
improving backups on linux in several cases.
In the last month Sergei made big changes, applying Christoph Hellwig
changes and working following the received replies, the new version of
the patch is not ready yet but the many changes done for now are
available here: https://github.com/SergeiShtepa/linux/commits/blksnap-master
I suppose that full review now of v2 when many things was already
changed for next version can risk to waste time. I hope I've helped with
this, if I haven't, sorry to have bother you.
>
> But taking a step back, in the hopes of stepping out of your way:
>
> Myself and others on the DM team (past and present) have always hoped
> all block devices could have the flexibility of DM. It was that hope
> that caused my frustration when I first saw your blkfilter approach.
>
> But I was too idealistic that a byproduct of your efforts
> (blk-interposer before and blkfilter now) would usher in _all_ block
> devices being able to comprehensively change their identity (and IO
> processing) like DM enjoys.
>
> DM showcases all the extra code needed to achieve its extreme IO
> remapping and stacking flexibilty -- I don't yet see a way to distill
> the essence of what DM achieves without imposing too much on all block
> core.
>
> So I do think blkfilter is a pragmatic way to achieve your goals.
>
> Mike
>
>
On 2/1/23 00:58, Mike Snitzer wrote:
> Subject:
> Re: [PATCH v2 02/21] block, blkfilter: Block Device Filtering Mechanism
> From:
> Mike Snitzer <[email protected]>
> Date:
> 2/1/23, 00:58
>
> To:
> Sergei Shtepa <[email protected]>
> CC:
> [email protected], [email protected], [email protected], [email protected], [email protected], [email protected]
>
>
> On Fri, Dec 09 2022 at 9:23P -0500,
> Sergei Shtepa <[email protected]> wrote:
>
>> Allows to attach block device filters to the block devices. Kernel
>> modules can use this functionality to extend the capabilities of the
>> block layer.
>>
>> Signed-off-by: Sergei Shtepa <[email protected]>
>> ---
>> block/bdev.c | 70 ++++++++++++++++++++++++++++++++++++++
>> block/blk-core.c | 19 +++++++++--
>> include/linux/blk_types.h | 2 ++
>> include/linux/blkdev.h | 71 +++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 160 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/bdev.c b/block/bdev.c
>> index d699ecdb3260..b820178824b2 100644
>> --- a/block/bdev.c
>> +++ b/block/bdev.c
>> @@ -427,6 +427,7 @@ static void init_once(void *data)
>>
>> static void bdev_evict_inode(struct inode *inode)
>> {
>> + bdev_filter_detach(I_BDEV(inode));
>> truncate_inode_pages_final(&inode->i_data);
>> invalidate_inode_buffers(inode); /* is it needed here? */
>> clear_inode(inode);
>> @@ -502,6 +503,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
>> return NULL;
>> }
>> bdev->bd_disk = disk;
>> + bdev->bd_filter = NULL;
>> return bdev;
>> }
>>
>> @@ -1092,3 +1094,71 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
>>
>> blkdev_put_no_open(bdev);
>> }
>> +
>> +/**
>> + * bdev_filter_attach - Attach the filter to the original block device.
>> + * @bdev:
>> + * Block device.
>> + * @flt:
>> + * Filter that needs to be attached to the block device.
>> + *
>> + * Before adding a filter, it is necessary to initialize &struct bdev_filter
>> + * using a bdev_filter_init() function.
>> + *
>> + * The bdev_filter_detach() function allows to detach the filter from the block
>> + * device.
>> + *
>> + * Return: 0 if succeeded, or -EALREADY if the filter already exists.
>> + */
>> +int bdev_filter_attach(struct block_device *bdev,
>> + struct bdev_filter *flt)
>> +{
>> + int ret = 0;
>> +
>> + blk_mq_freeze_queue(bdev->bd_queue);
>> + blk_mq_quiesce_queue(bdev->bd_queue);
>> +
>> + if (bdev->bd_filter)
>> + ret = -EALREADY;
>> + else
>> + bdev->bd_filter = flt;
>> +
>> + blk_mq_unquiesce_queue(bdev->bd_queue);
>> + blk_mq_unfreeze_queue(bdev->bd_queue);
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(bdev_filter_attach);
>> +
>> +/**
>> + * bdev_filter_detach - Detach the filter from the block device.
>> + * @bdev:
>> + * Block device.
>> + *
>> + * The filter should be added using the bdev_filter_attach() function.
>> + *
>> + * Return: 0 if succeeded, or -ENOENT if the filter was not found.
>> + */
>> +int bdev_filter_detach(struct block_device *bdev)
>> +{
>> + int ret = 0;
>> + struct bdev_filter *flt = NULL;
>> +
>> + blk_mq_freeze_queue(bdev->bd_queue);
>> + blk_mq_quiesce_queue(bdev->bd_queue);
>> +
>> + flt = bdev->bd_filter;
>> + if (flt)
>> + bdev->bd_filter = NULL;
>> + else
>> + ret = -ENOENT;
>> +
>> + blk_mq_unquiesce_queue(bdev->bd_queue);
>> + blk_mq_unfreeze_queue(bdev->bd_queue);
>> +
>> + if (flt)
>> + bdev_filter_put(flt);
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(bdev_filter_detach);
> What about bio-based devices? (DM, MD, etc)
>
> DM uses freeze_bdev() and thaw_bdev(), seems like you're missing some
> work here.
Thanks, Mike.
We are planning to add a freeze_bdev() function call in bdev_filter_attach().
But for the bdev_filter_detach() function, it doesn't seem to make sense.
I think enough to call blk_mq_freeze_queue().
As Fabio already wrote, I use a public repository on github to work with
the patch: https://github.com/SergeiShtepa/linux/commits/blksnap-master
The current state can be viewed there. Feedback is welcome as usual.
>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 5487912befe8..284b295a7b23 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -678,9 +678,24 @@ void submit_bio_noacct_nocheck(struct bio *bio)
>> * to collect a list of requests submited by a ->submit_bio method while
>> * it is active, and then process them after it returned.
>> */
>> - if (current->bio_list)
>> + if (current->bio_list) {
>> bio_list_add(¤t->bio_list[0], bio);
>> - else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
>> + return;
>> + }
>> +
>> + if (bio->bi_bdev->bd_filter && !bio_flagged(bio, BIO_FILTERED)) {
> Shouldn't this be: if (unlikely(...))?
>
> But that obviously assumes a fair amount about the only consumer
> (temporary filter that lasts as long as it takes to do a backup).
Yes, at the moment the code is being created so that only one filter
is possible. In the summer, I offered a more complex solution, in which
there were altitudes. See:
https://lore.kernel.org/linux-block/[email protected]/
But this is redundant code for this task at the moment, since only
one filter is offered now. I think it will be possible to implement
something similar later.
>
>> + bool pass;
>> +
>> + pass = bio->bi_bdev->bd_filter->fops->submit_bio_cb(bio);
>> + bio_set_flag(bio, BIO_FILTERED);
>> + if (!pass) {
>> + bio->bi_status = BLK_STS_OK;
>> + bio_endio(bio);
>> + return;
>> + }
>> + }
>> +
>> + if (!bio->bi_bdev->bd_disk->fops->submit_bio)
>> __submit_bio_noacct_mq(bio);
>> else
>> __submit_bio_noacct(bio);
> And you currently don't allow for blkfilter to be involved if a bio
> recurses (which is how bio splitting works now). Not sure it
> matters, just mentioning it...
>
> But taking a step back, in the hopes of stepping out of your way:
>
> Myself and others on the DM team (past and present) have always hoped
> all block devices could have the flexibility of DM. It was that hope
> that caused my frustration when I first saw your blkfilter approach.
>
> But I was too idealistic that a byproduct of your efforts
> (blk-interposer before and blkfilter now) would usher in _all_ block
> devices being able to comprehensively change their identity (and IO
> processing) like DM enjoys.
>
> DM showcases all the extra code needed to achieve its extreme IO
> remapping and stacking flexibilty -- I don't yet see a way to distill
> the essence of what DM achieves without imposing too much on all block
> core.
>
> So I do think blkfilter is a pragmatic way to achieve your goals.
>
> Mike
>