__send_empty_flush() sends empty flush bios to every target in the
dm_table. However, if the num_targets exceeds the number of block
devices in the dm_table's device list, it could lead to multiple
invocations of __send_duplicate_bios() for the same block device.
Typically, a single thread sending numerous empty flush bios to one
block device is redundant, as these bios are likely to be merged by the
flush state machine. In scenarios where num_targets significantly
outweighs the number of block devices, such behavior may result in a
noteworthy decrease in performance.
This is a real-world scenario that we have encountered:
1) Call fallocate(file_fd, 0, 0, SZ_8G)
2) Call ioctl(file_fd, FS_IOC_FIEMAP, fiemap). In situations of severe
file system fragmentation, fiemap->fm_mapped_extents may exceed 1000.
3) Create a dm-linear device based on fiemap->fm_extents
4) Create a snapshot-cow device based on the dm-linear device
Perf diff of fio test:
fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
--ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
--blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
Scenario one:
for i in {0..1023}; do
echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
done | sudo dmsetup create example
Before: bw=857KiB/
After: bw=30.8MiB/s +3580%
Scenario two:
for i in {0..1023}; do
if [[ $i -gt 511 ]]; then
echo $((8000*$i)) 8000 linear /dev/nvme0n1p6 $((16384*$i))
else
echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
fi
done | sudo dmsetup create example
Before: bw=1470KiB/
After: bw=33.9MiB/s +2261%
Any comments are welcome!
V2:
-- Split into smaller pieces that are easier to review
-- Add flush_pass_around, suggested by Mikulas Patocka
-- Handling different target types separately
Yang Yang (5):
dm: introduce flush_pass_around flag
dm: add __send_empty_flush_bios() helper
dm: support retrieving struct dm_target from struct dm_dev
dm: Avoid sending redundant empty flush bios to the same block device
dm linear: enable flush optimization function
drivers/md/dm-core.h | 3 +++
drivers/md/dm-ioctl.c | 4 ++++
drivers/md/dm-linear.c | 1 +
drivers/md/dm-table.c | 39 +++++++++++++++++++++++++++++++++++
drivers/md/dm.c | 37 +++++++++++++++++++++++++--------
include/linux/device-mapper.h | 8 +++++++
6 files changed, 83 insertions(+), 9 deletions(-)
--
2.34.1
introduce a per-target bit "flush_pass_around" and means that the
target supports flush optimization.
set a per-table "flush_pass_around" bit if all the targets in the
table have "flush_pass_around" set.
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm-core.h | 3 +++
drivers/md/dm-ioctl.c | 4 ++++
drivers/md/dm-table.c | 3 +++
include/linux/device-mapper.h | 5 +++++
4 files changed, 15 insertions(+)
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dcca..b273f25b634d 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -208,6 +208,9 @@ struct dm_table {
bool singleton:1;
unsigned integrity_added:1;
+ /* set if all the targets in the table have "flush_pass_around" set */
+ bool flush_pass_around:1;
+
/*
* Indicates the rw permissions for the new logical device. This
* should be a combination of BLK_OPEN_READ and BLK_OPEN_WRITE.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c2c07bfa6471..bb178df2a340 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1445,6 +1445,8 @@ static int populate_table(struct dm_table *table,
return -EINVAL;
}
+ table->flush_pass_around = 1;
+
for (i = 0; i < param->target_count; i++) {
const char *nul_terminator;
@@ -2279,6 +2281,8 @@ int __init dm_early_create(struct dm_ioctl *dmi,
if (r)
goto err_hash_remove;
+ t->flush_pass_around = 1;
+
/* add targets */
for (i = 0; i < dmi->target_count; i++) {
r = dm_table_add_target(t, spec_array[i]->target_type,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..bd68af10afed 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -738,6 +738,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key))
static_branch_enable(&swap_bios_enabled);
+ if (ti->flush_pass_around == 0)
+ t->flush_pass_around = 0;
+
return 0;
bad:
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 82b2195efaca..0893ff8c01b6 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -397,6 +397,11 @@ struct dm_target {
* bio_set_dev(). NOTE: ideally a target should _not_ need this.
*/
bool needs_bio_set_dev:1;
+
+ /*
+ * Set if the target supports flush optimization
+ */
+ bool flush_pass_around:1;
};
void *dm_per_bio_data(struct bio *bio, size_t data_size);
--
2.34.1
There are no functional changes, the helper will be used in later
patches.
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm.c | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d71..25215b93c3cf 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1543,6 +1543,20 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
return ret;
}
+static void __send_empty_flush_bios(struct dm_table *t, struct dm_target *ti,
+ struct clone_info *ci)
+{
+ unsigned int bios;
+
+ if (unlikely(ti->num_flush_bios == 0))
+ return;
+
+ atomic_add(ti->num_flush_bios, &ci->io->io_count);
+ bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
+ NULL, GFP_NOWAIT);
+ atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+}
+
static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
@@ -1561,16 +1575,9 @@ static void __send_empty_flush(struct clone_info *ci)
ci->io->tio.clone.bi_iter.bi_size = 0;
for (unsigned int i = 0; i < t->num_targets; i++) {
- unsigned int bios;
struct dm_target *ti = dm_table_get_target(t, i);
- if (unlikely(ti->num_flush_bios == 0))
- continue;
-
- atomic_add(ti->num_flush_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
- NULL, GFP_NOWAIT);
- atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ __send_empty_flush_bios(t, ti, ci);
}
/*
--
2.34.1
Add a list to the struct dm_dev structure to store the associated
targets, while also allowing differentiation between different target
types.
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm-table.c | 36 +++++++++++++++++++++++++++++++++++
include/linux/device-mapper.h | 3 +++
2 files changed, 39 insertions(+)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bd68af10afed..f6554590b7af 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -741,6 +741,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
if (ti->flush_pass_around == 0)
t->flush_pass_around = 0;
+ INIT_LIST_HEAD(&ti->list);
+
return 0;
bad:
@@ -2134,6 +2136,25 @@ void dm_table_postsuspend_targets(struct dm_table *t)
suspend_targets(t, POSTSUSPEND);
}
+static int dm_link_dev_to_target(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct list_head *targets = &dev->targets;
+ struct dm_target *pti;
+
+ if (!list_empty(targets)) {
+ list_for_each_entry(pti, targets, list) {
+ if (pti->type == ti->type)
+ return 0;
+ }
+ }
+
+ if (list_empty(&ti->list))
+ list_add_tail(&ti->list, targets);
+
+ return 0;
+}
+
int dm_table_resume_targets(struct dm_table *t)
{
unsigned int i;
@@ -2162,6 +2183,21 @@ int dm_table_resume_targets(struct dm_table *t)
ti->type->resume(ti);
}
+ if (t->flush_pass_around) {
+ struct list_head *devices = &t->devices;
+ struct dm_dev_internal *dd;
+
+ list_for_each_entry(dd, devices, list)
+ INIT_LIST_HEAD(&dd->dm_dev->targets);
+
+ for (i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (ti->type->iterate_devices)
+ ti->type->iterate_devices(ti, dm_link_dev_to_target, NULL);
+ }
+ }
+
return 0;
}
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 0893ff8c01b6..19e03f9b2589 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -169,6 +169,7 @@ struct dm_dev {
struct dax_device *dax_dev;
blk_mode_t mode;
char name[16];
+ struct list_head targets;
};
/*
@@ -298,6 +299,8 @@ struct dm_target {
struct dm_table *table;
struct target_type *type;
+ struct list_head list;
+
/* target limits */
sector_t begin;
sector_t len;
--
2.34.1
If the num_targets is greater than the number of dm_devs in the
dm_table's devices list, __send_empty_flush() might invoke
__send_duplicate_bios() multiple times for the same block device.
This could lead to a substantial decrease in performance when
num_targets significantly exceeds the number of dm_devs.
This patch ensure that __send_duplicate_bios() is only called once
for each dm_dev with different target type.
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 25215b93c3cf..9dbddc214084 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1561,6 +1561,7 @@ static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
struct bio flush_bio;
+ struct dm_target *ti;
/*
* Use an on-stack bio for this, it's safe since we don't
@@ -1574,10 +1575,21 @@ static void __send_empty_flush(struct clone_info *ci)
ci->sector_count = 0;
ci->io->tio.clone.bi_iter.bi_size = 0;
- for (unsigned int i = 0; i < t->num_targets; i++) {
- struct dm_target *ti = dm_table_get_target(t, i);
+ if (!t->flush_pass_around) {
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ ti = dm_table_get_target(t, i);
+ __send_empty_flush_bios(t, ti, ci);
+ }
+ } else {
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd;
- __send_empty_flush_bios(t, ti, ci);
+ list_for_each_entry(dd, devices, list) {
+ struct list_head *targets = &dd->dm_dev->targets;
+
+ list_for_each_entry(ti, targets, list)
+ __send_empty_flush_bios(t, ti, ci);
+ }
}
/*
--
2.34.1
__send_empty_flush() sends empty flush bios to every target in the
dm_table. However, if the num_targets exceeds the number of block
devices in the dm_table's device list, it could lead to multiple
invocations of __send_duplicate_bios() for the same block device.
Typically, a single thread sending numerous empty flush bios to one
block device is redundant, as these bios are likely to be merged by the
flush state machine. In scenarios where num_targets significantly
outweighs the number of block devices, such behavior may result in a
noteworthy decrease in performance.
This issue can be reproduced using this command line:
for i in {0..1023}; do
echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
done | dmsetup create example
With this fix, a random write with fsync workload executed with the
following fio command:
fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
--ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
--blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
results in an increase from 857 KB/s to 30.8 MB/s of the write
throughput (3580% increase).
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm-linear.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 2d3e186ca87e..3e1a33b4d289 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_discard_bios = 1;
ti->num_secure_erase_bios = 1;
ti->num_write_zeroes_bios = 1;
+ ti->flush_pass_around = 1;
ti->private = lc;
return 0;
--
2.34.1
On Tue, May 14, 2024 at 05:04:42PM +0800, Yang Yang wrote:
> Add a list to the struct dm_dev structure to store the associated
> targets, while also allowing differentiation between different target
> types.
I still think this is more complex than it needs to be. If devices that
support flush_pass_around can guarantee that:
1. They will send a flush bio to all of their table devices
2. They are fine with another target sending the flush bio to their
table devices
Then I don't see why we need the table devices to keep track of all the
different target types that are using them. Am I missing something here?
If we don't need to worry about sending a flush bio to a target of each
type that is using a table device, then all we need to do is call
__send_empty_flush_bios() for enough targets to cover all the table
devices. This seems a lot easier to track. We just need another flag in
dm_target, something like sends_pass_around_flush.
When a target calls dm_get_device(), if it adds a new table device to
t->devices, then it's the first target in this table to use that device.
If flush_pass_around is set for this target, then it also sets
sends_pass_around_flush. In __send_empty_flush() if the table has
flush_pass_around set, when you iterate through the devices, you only
call __send_empty_flush_bios() for the ones with sends_pass_around_flush
set.
Or am I overlooking something?
-Ben
>
> Signed-off-by: Yang Yang <[email protected]>
> ---
> drivers/md/dm-table.c | 36 +++++++++++++++++++++++++++++++++++
> include/linux/device-mapper.h | 3 +++
> 2 files changed, 39 insertions(+)
>
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index bd68af10afed..f6554590b7af 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -741,6 +741,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
> if (ti->flush_pass_around == 0)
> t->flush_pass_around = 0;
>
> + INIT_LIST_HEAD(&ti->list);
> +
> return 0;
>
> bad:
> @@ -2134,6 +2136,25 @@ void dm_table_postsuspend_targets(struct dm_table *t)
> suspend_targets(t, POSTSUSPEND);
> }
>
> +static int dm_link_dev_to_target(struct dm_target *ti, struct dm_dev *dev,
> + sector_t start, sector_t len, void *data)
> +{
> + struct list_head *targets = &dev->targets;
> + struct dm_target *pti;
> +
> + if (!list_empty(targets)) {
> + list_for_each_entry(pti, targets, list) {
> + if (pti->type == ti->type)
> + return 0;
> + }
> + }
> +
> + if (list_empty(&ti->list))
> + list_add_tail(&ti->list, targets);
> +
> + return 0;
> +}
> +
> int dm_table_resume_targets(struct dm_table *t)
> {
> unsigned int i;
> @@ -2162,6 +2183,21 @@ int dm_table_resume_targets(struct dm_table *t)
> ti->type->resume(ti);
> }
>
> + if (t->flush_pass_around) {
> + struct list_head *devices = &t->devices;
> + struct dm_dev_internal *dd;
> +
> + list_for_each_entry(dd, devices, list)
> + INIT_LIST_HEAD(&dd->dm_dev->targets);
> +
> + for (i = 0; i < t->num_targets; i++) {
> + struct dm_target *ti = dm_table_get_target(t, i);
> +
> + if (ti->type->iterate_devices)
> + ti->type->iterate_devices(ti, dm_link_dev_to_target, NULL);
> + }
> + }
> +
> return 0;
> }
>
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 0893ff8c01b6..19e03f9b2589 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -169,6 +169,7 @@ struct dm_dev {
> struct dax_device *dax_dev;
> blk_mode_t mode;
> char name[16];
> + struct list_head targets;
> };
>
> /*
> @@ -298,6 +299,8 @@ struct dm_target {
> struct dm_table *table;
> struct target_type *type;
>
> + struct list_head list;
> +
> /* target limits */
> sector_t begin;
> sector_t len;
> --
> 2.34.1
>
On Wed, 15 May 2024, Benjamin Marzinski wrote:
> On Tue, May 14, 2024 at 05:04:42PM +0800, Yang Yang wrote:
> > Add a list to the struct dm_dev structure to store the associated
> > targets, while also allowing differentiation between different target
> > types.
>
> I still think this is more complex than it needs to be. If devices that
> support flush_pass_around can guarantee that:
>
> 1. They will send a flush bio to all of their table devices
> 2. They are fine with another target sending the flush bio to their
> table devices
>
> Then I don't see why we need the table devices to keep track of all the
> different target types that are using them. Am I missing something here?
>
> If we don't need to worry about sending a flush bio to a target of each
> type that is using a table device, then all we need to do is call
> __send_empty_flush_bios() for enough targets to cover all the table
> devices. This seems a lot easier to track. We just need another flag in
> dm_target, something like sends_pass_around_flush.
>
> When a target calls dm_get_device(), if it adds a new table device to
> t->devices, then it's the first target in this table to use that device.
> If flush_pass_around is set for this target, then it also sets
> sends_pass_around_flush. In __send_empty_flush() if the table has
> flush_pass_around set, when you iterate through the devices, you only
> call __send_empty_flush_bios() for the ones with sends_pass_around_flush
> set.
>
> Or am I overlooking something?
>
> -Ben
Yes, I agree that it is complex.
I reworked the patch, I'm testing it now and I'll send it when it passes
the tests.
Mikulas
On Wed, May 15, 2024 at 11:42:04AM -0400, Benjamin Marzinski wrote:
> When a target calls dm_get_device(), if it adds a new table device to
> t->devices, then it's the first target in this table to use that device.
> If flush_pass_around is set for this target, then it also sets
> sends_pass_around_flush. In __send_empty_flush() if the table has
> flush_pass_around set, when you iterate through the devices, you only
Err, "When you iterate through the *targets*, you only ..." In this
method you don't iterate through the list of devices (which is supposed
to be protected by t->devices_lock).
> call __send_empty_flush_bios() for the ones with sends_pass_around_flush
> set.
>
> Or am I overlooking something?
>
> -Ben
On 2024/5/15 23:42, Benjamin Marzinski wrote:
> On Tue, May 14, 2024 at 05:04:42PM +0800, Yang Yang wrote:
>> Add a list to the struct dm_dev structure to store the associated
>> targets, while also allowing differentiation between different target
>> types.
>
> I still think this is more complex than it needs to be. If devices that
> support flush_pass_around can guarantee that:
>
> 1. They will send a flush bio to all of their table devices
> 2. They are fine with another target sending the flush bio to their
> table devices
>
> Then I don't see why we need the table devices to keep track of all the
> different target types that are using them. Am I missing something here?
I attempted to enhance this solution to support additional target types,
such as those with num_flush_bios greater than 1.
> If we don't need to worry about sending a flush bio to a target of each
> type that is using a table device, then all we need to do is call
> __send_empty_flush_bios() for enough targets to cover all the table
> devices. This seems a lot easier to track. We just need another flag in
> dm_target, something like sends_pass_around_flush.
>
> When a target calls dm_get_device(), if it adds a new table device to
> t->devices, then it's the first target in this table to use that device.
> If flush_pass_around is set for this target, then it also sets
> sends_pass_around_flush. In __send_empty_flush() if the table has
> flush_pass_around set, when you iterate through the devices, you only
> call __send_empty_flush_bios() for the ones with sends_pass_around_flush
> set.
>
> Or am I overlooking something?
If I understand correctly, you are suggesting to iterate through all the
targets, handling those with sends_pass_around_flush set, and skipping
those where sends_pass_around_flush is not set. I believe this approach
may result in some CPU wastage.
for i in {0..1023}; do
echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
done | sudo dmsetup create example
In this specific scenario, a single iteration of the loop is all that
is needed.
>
> -Ben
>
>>
>> Signed-off-by: Yang Yang <[email protected]>
>> ---
>> drivers/md/dm-table.c | 36 +++++++++++++++++++++++++++++++++++
>> include/linux/device-mapper.h | 3 +++
>> 2 files changed, 39 insertions(+)
>>
>> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
>> index bd68af10afed..f6554590b7af 100644
>> --- a/drivers/md/dm-table.c
>> +++ b/drivers/md/dm-table.c
>> @@ -741,6 +741,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
>> if (ti->flush_pass_around == 0)
>> t->flush_pass_around = 0;
>>
>> + INIT_LIST_HEAD(&ti->list);
>> +
>> return 0;
>>
>> bad:
>> @@ -2134,6 +2136,25 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>> suspend_targets(t, POSTSUSPEND);
>> }
>>
>> +static int dm_link_dev_to_target(struct dm_target *ti, struct dm_dev *dev,
>> + sector_t start, sector_t len, void *data)
>> +{
>> + struct list_head *targets = &dev->targets;
>> + struct dm_target *pti;
>> +
>> + if (!list_empty(targets)) {
>> + list_for_each_entry(pti, targets, list) {
>> + if (pti->type == ti->type)
>> + return 0;
>> + }
>> + }
>> +
>> + if (list_empty(&ti->list))
>> + list_add_tail(&ti->list, targets);
>> +
>> + return 0;
>> +}
>> +
>> int dm_table_resume_targets(struct dm_table *t)
>> {
>> unsigned int i;
>> @@ -2162,6 +2183,21 @@ int dm_table_resume_targets(struct dm_table *t)
>> ti->type->resume(ti);
>> }
>>
>> + if (t->flush_pass_around) {
>> + struct list_head *devices = &t->devices;
>> + struct dm_dev_internal *dd;
>> +
>> + list_for_each_entry(dd, devices, list)
>> + INIT_LIST_HEAD(&dd->dm_dev->targets);
>> +
>> + for (i = 0; i < t->num_targets; i++) {
>> + struct dm_target *ti = dm_table_get_target(t, i);
>> +
>> + if (ti->type->iterate_devices)
>> + ti->type->iterate_devices(ti, dm_link_dev_to_target, NULL);
>> + }
>> + }
>> +
>> return 0;
>> }
>>
>> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
>> index 0893ff8c01b6..19e03f9b2589 100644
>> --- a/include/linux/device-mapper.h
>> +++ b/include/linux/device-mapper.h
>> @@ -169,6 +169,7 @@ struct dm_dev {
>> struct dax_device *dax_dev;
>> blk_mode_t mode;
>> char name[16];
>> + struct list_head targets;
>> };
>>
>> /*
>> @@ -298,6 +299,8 @@ struct dm_target {
>> struct dm_table *table;
>> struct target_type *type;
>>
>> + struct list_head list;
>> +
>> /* target limits */
>> sector_t begin;
>> sector_t len;
>> --
>> 2.34.1
>>
>
On 2024/5/16 0:00, Benjamin Marzinski wrote:
> On Wed, May 15, 2024 at 11:42:04AM -0400, Benjamin Marzinski wrote:
>> When a target calls dm_get_device(), if it adds a new table device to
>> t->devices, then it's the first target in this table to use that device.
>> If flush_pass_around is set for this target, then it also sets
>> sends_pass_around_flush. In __send_empty_flush() if the table has
>> flush_pass_around set, when you iterate through the devices, you only
>
> Err, "When you iterate through the *targets*, you only ..." In this
> method you don't iterate through the list of devices (which is supposed
> to be protected by t->devices_lock).
I'm not very familiar with this area, I thought that the device list
of an active table cannot be modified, so it doesn't need to be
protected by t->devices_lock.
>
>> call __send_empty_flush_bios() for the ones with sends_pass_around_flush
>> set.
>>
>> Or am I overlooking something?
>>
>> -Ben
>
On Thu, May 16, 2024 at 09:55:53AM +0800, YangYang wrote:
> On 2024/5/15 23:42, Benjamin Marzinski wrote:
> > On Tue, May 14, 2024 at 05:04:42PM +0800, Yang Yang wrote:
> > > Add a list to the struct dm_dev structure to store the associated
> > > targets, while also allowing differentiation between different target
> > > types.
> >
> > I still think this is more complex than it needs to be. If devices that
> > support flush_pass_around can guarantee that:
> >
> > 1. They will send a flush bio to all of their table devices
> > 2. They are fine with another target sending the flush bio to their
> > table devices
> >
> > Then I don't see why we need the table devices to keep track of all the
> > different target types that are using them. Am I missing something here?
>
> I attempted to enhance this solution to support additional target types,
> such as those with num_flush_bios greater than 1.
I'm still missing why sending a flush to each target type is necessary
to handle num_flush_bios > 1. As long as the targets meet the
requirements I listed before, AFAICS it should still work with only one
of the targets mapped to each device.
Say the table devices are sda, sdb, sdc, and sdd. If you have 4 linear
targets, each mapped to a different table device, and one stripe target
mapped to all of them. It doesn't really matter if you don't call
__send_empty_flush_bios() for the stripe target, does it? all if its
stripe devs will still get flushes. Similarly, it's fine if one of the
linear targets doesn't get called (in fact it's fine if all the linear
targets don't get called, since the stripe target will send flushes to
all the devices). If there were only 3 linear targets, then the stripe
target would get linked to a table device, so it would get a flush sent
to it. Can you explain a situation where you need the to send a flush to
multiple targets per table device for this to work, if you assume the 2
guarantees I mentioned above (the target sends flushes to all their
devices, and they don't do something special so they need to be the one
to send the flushes).
>
> > If we don't need to worry about sending a flush bio to a target of each
> > type that is using a table device, then all we need to do is call
> > __send_empty_flush_bios() for enough targets to cover all the table
> > devices. This seems a lot easier to track. We just need another flag in
> > dm_target, something like sends_pass_around_flush.
> >
> > When a target calls dm_get_device(), if it adds a new table device to
> > t->devices, then it's the first target in this table to use that device.
> > If flush_pass_around is set for this target, then it also sets
> > sends_pass_around_flush. In __send_empty_flush() if the table has
> > flush_pass_around set, when you iterate through the devices, you only
> > call __send_empty_flush_bios() for the ones with sends_pass_around_flush
> > set.
> >
> > Or am I overlooking something?
>
> If I understand correctly, you are suggesting to iterate through all the
> targets, handling those with sends_pass_around_flush set, and skipping
> those where sends_pass_around_flush is not set. I believe this approach
> may result in some CPU wastage.
>
> for i in {0..1023}; do
> echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
> done | sudo dmsetup create example
>
> In this specific scenario, a single iteration of the loop is all that
> is needed.
It's just one iteration of the loop either way. You either loop through
the targets or the devices. It's true that if you have lots of targets
all mapped to the same device, you would waste time looping through all
the targets instead of looping through the devices. But if you only had
one striped target mapped to lots of devices, you would waste time
looping through all of the devices instead of looping through the
targets.
-Ben
> >
> > -Ben
> >
> > >
> > > Signed-off-by: Yang Yang <[email protected]>
> > > ---
> > > drivers/md/dm-table.c | 36 +++++++++++++++++++++++++++++++++++
> > > include/linux/device-mapper.h | 3 +++
> > > 2 files changed, 39 insertions(+)
> > >
> > > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > > index bd68af10afed..f6554590b7af 100644
> > > --- a/drivers/md/dm-table.c
> > > +++ b/drivers/md/dm-table.c
> > > @@ -741,6 +741,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
> > > if (ti->flush_pass_around == 0)
> > > t->flush_pass_around = 0;
> > > + INIT_LIST_HEAD(&ti->list);
> > > +
> > > return 0;
> > > bad:
> > > @@ -2134,6 +2136,25 @@ void dm_table_postsuspend_targets(struct dm_table *t)
> > > suspend_targets(t, POSTSUSPEND);
> > > }
> > > +static int dm_link_dev_to_target(struct dm_target *ti, struct dm_dev *dev,
> > > + sector_t start, sector_t len, void *data)
> > > +{
> > > + struct list_head *targets = &dev->targets;
> > > + struct dm_target *pti;
> > > +
> > > + if (!list_empty(targets)) {
> > > + list_for_each_entry(pti, targets, list) {
> > > + if (pti->type == ti->type)
> > > + return 0;
> > > + }
> > > + }
> > > +
> > > + if (list_empty(&ti->list))
> > > + list_add_tail(&ti->list, targets);
> > > +
> > > + return 0;
> > > +}
> > > +
> > > int dm_table_resume_targets(struct dm_table *t)
> > > {
> > > unsigned int i;
> > > @@ -2162,6 +2183,21 @@ int dm_table_resume_targets(struct dm_table *t)
> > > ti->type->resume(ti);
> > > }
> > > + if (t->flush_pass_around) {
> > > + struct list_head *devices = &t->devices;
> > > + struct dm_dev_internal *dd;
> > > +
> > > + list_for_each_entry(dd, devices, list)
> > > + INIT_LIST_HEAD(&dd->dm_dev->targets);
> > > +
> > > + for (i = 0; i < t->num_targets; i++) {
> > > + struct dm_target *ti = dm_table_get_target(t, i);
> > > +
> > > + if (ti->type->iterate_devices)
> > > + ti->type->iterate_devices(ti, dm_link_dev_to_target, NULL);
> > > + }
> > > + }
> > > +
> > > return 0;
> > > }
> > > diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> > > index 0893ff8c01b6..19e03f9b2589 100644
> > > --- a/include/linux/device-mapper.h
> > > +++ b/include/linux/device-mapper.h
> > > @@ -169,6 +169,7 @@ struct dm_dev {
> > > struct dax_device *dax_dev;
> > > blk_mode_t mode;
> > > char name[16];
> > > + struct list_head targets;
> > > };
> > > /*
> > > @@ -298,6 +299,8 @@ struct dm_target {
> > > struct dm_table *table;
> > > struct target_type *type;
> > > + struct list_head list;
> > > +
> > > /* target limits */
> > > sector_t begin;
> > > sector_t len;
> > > --
> > > 2.34.1
> > >
> >
On Thu, May 16, 2024 at 10:12:25AM +0800, YangYang wrote:
> On 2024/5/16 0:00, Benjamin Marzinski wrote:
> > On Wed, May 15, 2024 at 11:42:04AM -0400, Benjamin Marzinski wrote:
> > > When a target calls dm_get_device(), if it adds a new table device to
> > > t->devices, then it's the first target in this table to use that device.
> > > If flush_pass_around is set for this target, then it also sets
> > > sends_pass_around_flush. In __send_empty_flush() if the table has
> > > flush_pass_around set, when you iterate through the devices, you only
> >
> > Err, "When you iterate through the *targets*, you only ..." In this
> > method you don't iterate through the list of devices (which is supposed
> > to be protected by t->devices_lock).
>
> I'm not very familiar with this area, I thought that the device list
> of an active table cannot be modified, so it doesn't need to be
> protected by t->devices_lock.
Actually, looking at this some more you're basically correct. The only
place where dm can modify the devices list of active table is in
multipath_message(), and that's bug. So you should be safe not locking
here.
I'll post a patch to fix the multipath target.
-Ben
> >
> > > call __send_empty_flush_bios() for the ones with sends_pass_around_flush
> > > set.
> > >
> > > Or am I overlooking something?
> > >
> > > -Ben
> >
Device mapper sends flush bios to all the targets and the targets send it
to the underlying device. That may be inefficient, for example if a table
contains 10 linear targets pointing to the same physical device, then
device mapper would send 10 flush bios to that device - despite the fact
that only one bio would be sufficient.
This commit optimizes the flush behavior. It introduces a per-target
variable flush_pass_around - it is set when the target supports flush
optimization - currently, the dm-linear and dm-stripe targets support it.
When all the targets in a table have flush_pass_around, flush_pass_around
on the table is set. __send_empty_flush tests if the table has
flush_pass_around - and if it has, no flush bios are sent to the targets
and the list dm_table->devices is iterated and the flush bios are sent to
each member of the list.
Signed-off-by: Mikulas Patocka <[email protected]>
Reported-by: Yang Yang <[email protected]>
---
drivers/md/dm-core.h | 4 ++-
drivers/md/dm-linear.c | 1
drivers/md/dm-stripe.c | 1
drivers/md/dm-table.c | 4 +++
drivers/md/dm.c | 47 +++++++++++++++++++++++++++++-------------
include/linux/device-mapper.h | 5 ++++
6 files changed, 47 insertions(+), 15 deletions(-)
Index: linux-2.6/drivers/md/dm-core.h
===================================================================
--- linux-2.6.orig/drivers/md/dm-core.h 2024-05-15 16:56:49.000000000 +0200
+++ linux-2.6/drivers/md/dm-core.h 2024-05-15 16:56:49.000000000 +0200
@@ -206,7 +206,9 @@ struct dm_table {
bool integrity_supported:1;
bool singleton:1;
- unsigned integrity_added:1;
+ bool integrity_added:1;
+ /* set if all the targets in the table have "flush_pass_around" set */
+ bool flush_pass_around:1;
/*
* Indicates the rw permissions for the new logical device. This
Index: linux-2.6/drivers/md/dm-linear.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-linear.c 2024-05-15 16:56:49.000000000 +0200
+++ linux-2.6/drivers/md/dm-linear.c 2024-05-15 16:56:49.000000000 +0200
@@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *
ti->num_discard_bios = 1;
ti->num_secure_erase_bios = 1;
ti->num_write_zeroes_bios = 1;
+ ti->flush_pass_around = true;
ti->private = lc;
return 0;
Index: linux-2.6/drivers/md/dm-stripe.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-stripe.c 2024-05-15 16:56:49.000000000 +0200
+++ linux-2.6/drivers/md/dm-stripe.c 2024-05-15 16:56:49.000000000 +0200
@@ -157,6 +157,7 @@ static int stripe_ctr(struct dm_target *
ti->num_discard_bios = stripes;
ti->num_secure_erase_bios = stripes;
ti->num_write_zeroes_bios = stripes;
+ ti->flush_pass_around = true;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
Index: linux-2.6/drivers/md/dm-table.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-table.c 2024-05-15 16:56:49.000000000 +0200
+++ linux-2.6/drivers/md/dm-table.c 2024-05-15 16:56:49.000000000 +0200
@@ -160,6 +160,7 @@ int dm_table_create(struct dm_table **re
t->type = DM_TYPE_NONE;
t->mode = mode;
t->md = md;
+ t->flush_pass_around = 1;
*result = t;
return 0;
}
@@ -738,6 +739,9 @@ int dm_table_add_target(struct dm_table
if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key))
static_branch_enable(&swap_bios_enabled);
+ if (!ti->flush_pass_around)
+ t->flush_pass_around = false;
+
return 0;
bad:
Index: linux-2.6/include/linux/device-mapper.h
===================================================================
--- linux-2.6.orig/include/linux/device-mapper.h 2024-05-15 16:56:49.000000000 +0200
+++ linux-2.6/include/linux/device-mapper.h 2024-05-15 16:56:49.000000000 +0200
@@ -397,6 +397,11 @@ struct dm_target {
* bio_set_dev(). NOTE: ideally a target should _not_ need this.
*/
bool needs_bio_set_dev:1;
+
+ /*
+ * Set if the target supports flush optimization
+ */
+ bool flush_pass_around:1;
};
void *dm_per_bio_data(struct bio *bio, size_t data_size);
Index: linux-2.6/drivers/md/dm.c
===================================================================
--- linux-2.6.orig/drivers/md/dm.c 2024-05-15 16:56:49.000000000 +0200
+++ linux-2.6/drivers/md/dm.c 2024-05-16 20:06:32.000000000 +0200
@@ -645,7 +645,7 @@ static struct bio *alloc_tio(struct clon
/* Set default bdev, but target must bio_set_dev() before issuing IO */
clone->bi_bdev = md->disk->part0;
- if (unlikely(ti->needs_bio_set_dev))
+ if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
bio_set_dev(clone, md->disk->part0);
if (len) {
@@ -1107,7 +1107,7 @@ static void clone_endio(struct bio *bio)
blk_status_t error = bio->bi_status;
struct dm_target_io *tio = clone_to_tio(bio);
struct dm_target *ti = tio->ti;
- dm_endio_fn endio = ti->type->end_io;
+ dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL;
struct dm_io *io = tio->io;
struct mapped_device *md = io->md;
@@ -1154,7 +1154,7 @@ static void clone_endio(struct bio *bio)
}
if (static_branch_unlikely(&swap_bios_enabled) &&
- unlikely(swap_bios_limit(ti, bio)))
+ likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
up(&md->swap_bios_semaphore);
free_tio(bio);
@@ -1566,17 +1566,36 @@ static void __send_empty_flush(struct cl
ci->sector_count = 0;
ci->io->tio.clone.bi_iter.bi_size = 0;
- for (unsigned int i = 0; i < t->num_targets; i++) {
- unsigned int bios;
- struct dm_target *ti = dm_table_get_target(t, i);
-
- if (unlikely(ti->num_flush_bios == 0))
- continue;
-
- atomic_add(ti->num_flush_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
- NULL, GFP_NOWAIT);
- atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ if (!t->flush_pass_around) {
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ unsigned int bios;
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (unlikely(ti->num_flush_bios == 0))
+ continue;
+
+ atomic_add(ti->num_flush_bios, &ci->io->io_count);
+ bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
+ NULL, GFP_NOWAIT);
+ atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ }
+ } else {
+ /*
+ * Note that there's no need to grab t->devices_lock here
+ * because the targets that support flush pass-around don't
+ * modify the list of devices.
+ */
+ struct list_head *devices = dm_table_get_devices(t);
+ unsigned int len = 0;
+ struct dm_dev_internal *dd;
+ list_for_each_entry(dd, devices, list) {
+ struct bio *clone;
+ clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
+ atomic_add(1, &ci->io->io_count);
+ bio_set_dev(clone, dd->dm_dev->bdev);
+ clone->bi_end_io = clone_endio;
+ dm_submit_bio_remap(clone, NULL);
+ }
}
/*
On 2024/5/16 23:29, Benjamin Marzinski wrote:
> On Thu, May 16, 2024 at 09:55:53AM +0800, YangYang wrote:
>> On 2024/5/15 23:42, Benjamin Marzinski wrote:
>>> On Tue, May 14, 2024 at 05:04:42PM +0800, Yang Yang wrote:
>>>> Add a list to the struct dm_dev structure to store the associated
>>>> targets, while also allowing differentiation between different target
>>>> types.
>>>
>>> I still think this is more complex than it needs to be. If devices that
>>> support flush_pass_around can guarantee that:
>>>
>>> 1. They will send a flush bio to all of their table devices
>>> 2. They are fine with another target sending the flush bio to their
>>> table devices
>>>
>>> Then I don't see why we need the table devices to keep track of all the
>>> different target types that are using them. Am I missing something here?
>>
>> I attempted to enhance this solution to support additional target types,
>> such as those with num_flush_bios greater than 1.
>
> I'm still missing why sending a flush to each target type is necessary
> to handle num_flush_bios > 1. As long as the targets meet the
> requirements I listed before, AFAICS it should still work with only one
> of the targets mapped to each device.
>
> Say the table devices are sda, sdb, sdc, and sdd. If you have 4 linear
> targets, each mapped to a different table device, and one stripe target
> mapped to all of them. It doesn't really matter if you don't call
> __send_empty_flush_bios() for the stripe target, does it? all if its
> stripe devs will still get flushes. Similarly, it's fine if one of the
> linear targets doesn't get called (in fact it's fine if all the linear
> targets don't get called, since the stripe target will send flushes to
> all the devices). If there were only 3 linear targets, then the stripe
> target would get linked to a table device, so it would get a flush sent
> to it. Can you explain a situation where you need the to send a flush to
> multiple targets per table device for this to work, if you assume the 2
> guarantees I mentioned above (the target sends flushes to all their
> devices, and they don't do something special so they need to be the one
> to send the flushes).
Yes, if the targets meet the requirements you listed previously, there
is no need to send a flush to each target type.
I think I may be overthinking this. I tried to handle some targets with
num_flush_bios > 1 that don't meet the requirements.
>>
>>> If we don't need to worry about sending a flush bio to a target of each
>>> type that is using a table device, then all we need to do is call
>>> __send_empty_flush_bios() for enough targets to cover all the table
>>> devices. This seems a lot easier to track. We just need another flag in
>>> dm_target, something like sends_pass_around_flush.
>>>
>>> When a target calls dm_get_device(), if it adds a new table device to
>>> t->devices, then it's the first target in this table to use that device.
>>> If flush_pass_around is set for this target, then it also sets
>>> sends_pass_around_flush. In __send_empty_flush() if the table has
>>> flush_pass_around set, when you iterate through the devices, you only
>>> call __send_empty_flush_bios() for the ones with sends_pass_around_flush
>>> set.
>>>
>>> Or am I overlooking something?
>>
>> If I understand correctly, you are suggesting to iterate through all the
>> targets, handling those with sends_pass_around_flush set, and skipping
>> those where sends_pass_around_flush is not set. I believe this approach
>> may result in some CPU wastage.
>>
>> for i in {0..1023}; do
>> echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>> done | sudo dmsetup create example
>>
>> In this specific scenario, a single iteration of the loop is all that
>> is needed.
>
> It's just one iteration of the loop either way. You either loop through
> the targets or the devices. It's true that if you have lots of targets
> all mapped to the same device, you would waste time looping through all
> the targets instead of looping through the devices. But if you only had
> one striped target mapped to lots of devices, you would waste time
> looping through all of the devices instead of looping through the
> targets.
Yes, I get your point. This patchset may make things even worse for
the striped target.
I am just curious, in what scenario is the "dm-strip" target mapped to
a large number of underlying devices from the same block device.
> -Ben
>
>>>
>>> -Ben
>>>
>>>>
>>>> Signed-off-by: Yang Yang <[email protected]>
>>>> ---
>>>> drivers/md/dm-table.c | 36 +++++++++++++++++++++++++++++++++++
>>>> include/linux/device-mapper.h | 3 +++
>>>> 2 files changed, 39 insertions(+)
>>>>
>>>> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
>>>> index bd68af10afed..f6554590b7af 100644
>>>> --- a/drivers/md/dm-table.c
>>>> +++ b/drivers/md/dm-table.c
>>>> @@ -741,6 +741,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
>>>> if (ti->flush_pass_around == 0)
>>>> t->flush_pass_around = 0;
>>>> + INIT_LIST_HEAD(&ti->list);
>>>> +
>>>> return 0;
>>>> bad:
>>>> @@ -2134,6 +2136,25 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>>>> suspend_targets(t, POSTSUSPEND);
>>>> }
>>>> +static int dm_link_dev_to_target(struct dm_target *ti, struct dm_dev *dev,
>>>> + sector_t start, sector_t len, void *data)
>>>> +{
>>>> + struct list_head *targets = &dev->targets;
>>>> + struct dm_target *pti;
>>>> +
>>>> + if (!list_empty(targets)) {
>>>> + list_for_each_entry(pti, targets, list) {
>>>> + if (pti->type == ti->type)
>>>> + return 0;
>>>> + }
>>>> + }
>>>> +
>>>> + if (list_empty(&ti->list))
>>>> + list_add_tail(&ti->list, targets);
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> int dm_table_resume_targets(struct dm_table *t)
>>>> {
>>>> unsigned int i;
>>>> @@ -2162,6 +2183,21 @@ int dm_table_resume_targets(struct dm_table *t)
>>>> ti->type->resume(ti);
>>>> }
>>>> + if (t->flush_pass_around) {
>>>> + struct list_head *devices = &t->devices;
>>>> + struct dm_dev_internal *dd;
>>>> +
>>>> + list_for_each_entry(dd, devices, list)
>>>> + INIT_LIST_HEAD(&dd->dm_dev->targets);
>>>> +
>>>> + for (i = 0; i < t->num_targets; i++) {
>>>> + struct dm_target *ti = dm_table_get_target(t, i);
>>>> +
>>>> + if (ti->type->iterate_devices)
>>>> + ti->type->iterate_devices(ti, dm_link_dev_to_target, NULL);
>>>> + }
>>>> + }
>>>> +
>>>> return 0;
>>>> }
>>>> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
>>>> index 0893ff8c01b6..19e03f9b2589 100644
>>>> --- a/include/linux/device-mapper.h
>>>> +++ b/include/linux/device-mapper.h
>>>> @@ -169,6 +169,7 @@ struct dm_dev {
>>>> struct dax_device *dax_dev;
>>>> blk_mode_t mode;
>>>> char name[16];
>>>> + struct list_head targets;
>>>> };
>>>> /*
>>>> @@ -298,6 +299,8 @@ struct dm_target {
>>>> struct dm_table *table;
>>>> struct target_type *type;
>>>> + struct list_head list;
>>>> +
>>>> /* target limits */
>>>> sector_t begin;
>>>> sector_t len;
>>>> --
>>>> 2.34.1
>>>>
>>>
>
On 2024/5/17 4:49, Mikulas Patocka wrote:
> Device mapper sends flush bios to all the targets and the targets send it
> to the underlying device. That may be inefficient, for example if a table
> contains 10 linear targets pointing to the same physical device, then
> device mapper would send 10 flush bios to that device - despite the fact
> that only one bio would be sufficient.
>
> This commit optimizes the flush behavior. It introduces a per-target
> variable flush_pass_around - it is set when the target supports flush
> optimization - currently, the dm-linear and dm-stripe targets support it.
> When all the targets in a table have flush_pass_around, flush_pass_around
> on the table is set. __send_empty_flush tests if the table has
> flush_pass_around - and if it has, no flush bios are sent to the targets
> and the list dm_table->devices is iterated and the flush bios are sent to
> each member of the list.
>
> Signed-off-by: Mikulas Patocka <[email protected]>
> Reported-by: Yang Yang <[email protected]>
>
> ---
> drivers/md/dm-core.h | 4 ++-
> drivers/md/dm-linear.c | 1
> drivers/md/dm-stripe.c | 1
> drivers/md/dm-table.c | 4 +++
> drivers/md/dm.c | 47 +++++++++++++++++++++++++++++-------------
> include/linux/device-mapper.h | 5 ++++
> 6 files changed, 47 insertions(+), 15 deletions(-)
>
> Index: linux-2.6/drivers/md/dm-core.h
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-core.h 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm-core.h 2024-05-15 16:56:49.000000000 +0200
> @@ -206,7 +206,9 @@ struct dm_table {
>
> bool integrity_supported:1;
> bool singleton:1;
> - unsigned integrity_added:1;
> + bool integrity_added:1;
> + /* set if all the targets in the table have "flush_pass_around" set */
> + bool flush_pass_around:1;
>
> /*
> * Indicates the rw permissions for the new logical device. This
> Index: linux-2.6/drivers/md/dm-linear.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-linear.c 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm-linear.c 2024-05-15 16:56:49.000000000 +0200
> @@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *
> ti->num_discard_bios = 1;
> ti->num_secure_erase_bios = 1;
> ti->num_write_zeroes_bios = 1;
> + ti->flush_pass_around = true;
> ti->private = lc;
> return 0;
>
> Index: linux-2.6/drivers/md/dm-stripe.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-stripe.c 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm-stripe.c 2024-05-15 16:56:49.000000000 +0200
> @@ -157,6 +157,7 @@ static int stripe_ctr(struct dm_target *
> ti->num_discard_bios = stripes;
> ti->num_secure_erase_bios = stripes;
> ti->num_write_zeroes_bios = stripes;
> + ti->flush_pass_around = true;
>
> sc->chunk_size = chunk_size;
> if (chunk_size & (chunk_size - 1))
> Index: linux-2.6/drivers/md/dm-table.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-table.c 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm-table.c 2024-05-15 16:56:49.000000000 +0200
> @@ -160,6 +160,7 @@ int dm_table_create(struct dm_table **re
> t->type = DM_TYPE_NONE;
> t->mode = mode;
> t->md = md;
> + t->flush_pass_around = 1;
> *result = t;
> return 0;
> }
> @@ -738,6 +739,9 @@ int dm_table_add_target(struct dm_table
> if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key))
> static_branch_enable(&swap_bios_enabled);
>
> + if (!ti->flush_pass_around)
> + t->flush_pass_around = false;
> +
> return 0;
>
> bad:
> Index: linux-2.6/include/linux/device-mapper.h
> ===================================================================
> --- linux-2.6.orig/include/linux/device-mapper.h 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/include/linux/device-mapper.h 2024-05-15 16:56:49.000000000 +0200
> @@ -397,6 +397,11 @@ struct dm_target {
> * bio_set_dev(). NOTE: ideally a target should _not_ need this.
> */
> bool needs_bio_set_dev:1;
> +
> + /*
> + * Set if the target supports flush optimization
> + */
> + bool flush_pass_around:1;
> };
>
> void *dm_per_bio_data(struct bio *bio, size_t data_size);
> Index: linux-2.6/drivers/md/dm.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm.c 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm.c 2024-05-16 20:06:32.000000000 +0200
> @@ -645,7 +645,7 @@ static struct bio *alloc_tio(struct clon
>
> /* Set default bdev, but target must bio_set_dev() before issuing IO */
> clone->bi_bdev = md->disk->part0;
> - if (unlikely(ti->needs_bio_set_dev))
> + if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
> bio_set_dev(clone, md->disk->part0);
>
> if (len) {
> @@ -1107,7 +1107,7 @@ static void clone_endio(struct bio *bio)
> blk_status_t error = bio->bi_status;
> struct dm_target_io *tio = clone_to_tio(bio);
> struct dm_target *ti = tio->ti;
> - dm_endio_fn endio = ti->type->end_io;
> + dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL;
> struct dm_io *io = tio->io;
> struct mapped_device *md = io->md;
>
> @@ -1154,7 +1154,7 @@ static void clone_endio(struct bio *bio)
> }
>
> if (static_branch_unlikely(&swap_bios_enabled) &&
> - unlikely(swap_bios_limit(ti, bio)))
> + likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
> up(&md->swap_bios_semaphore);
>
> free_tio(bio);
> @@ -1566,17 +1566,36 @@ static void __send_empty_flush(struct cl
> ci->sector_count = 0;
> ci->io->tio.clone.bi_iter.bi_size = 0;
>
> - for (unsigned int i = 0; i < t->num_targets; i++) {
> - unsigned int bios;
> - struct dm_target *ti = dm_table_get_target(t, i);
> -
> - if (unlikely(ti->num_flush_bios == 0))
> - continue;
> -
> - atomic_add(ti->num_flush_bios, &ci->io->io_count);
> - bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> - NULL, GFP_NOWAIT);
> - atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> + if (!t->flush_pass_around) {
> + for (unsigned int i = 0; i < t->num_targets; i++) {
> + unsigned int bios;
> + struct dm_target *ti = dm_table_get_target(t, i);
> +
> + if (unlikely(ti->num_flush_bios == 0))
> + continue;
> +
> + atomic_add(ti->num_flush_bios, &ci->io->io_count);
> + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> + NULL, GFP_NOWAIT);
> + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> + }
> + } else {
> + /*
> + * Note that there's no need to grab t->devices_lock here
> + * because the targets that support flush pass-around don't
> + * modify the list of devices.
> + */
> + struct list_head *devices = dm_table_get_devices(t);
> + unsigned int len = 0;
> + struct dm_dev_internal *dd;
> + list_for_each_entry(dd, devices, list) {
> + struct bio *clone;
> + clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
> + atomic_add(1, &ci->io->io_count);
> + bio_set_dev(clone, dd->dm_dev->bdev);
> + clone->bi_end_io = clone_endio;
> + dm_submit_bio_remap(clone, NULL);
> + }
> }
>
> /*
>
Thanks, I tested this patch, and it fixed the issue for me!
On Fri, May 17, 2024 at 03:48:49PM +0800, YangYang wrote:
> On 2024/5/16 23:29, Benjamin Marzinski wrote:
> > On Thu, May 16, 2024 at 09:55:53AM +0800, YangYang wrote:
> > > On 2024/5/15 23:42, Benjamin Marzinski wrote:
> > > > On Tue, May 14, 2024 at 05:04:42PM +0800, Yang Yang wrote:
> > >
> > > If I understand correctly, you are suggesting to iterate through all the
> > > targets, handling those with sends_pass_around_flush set, and skipping
> > > those where sends_pass_around_flush is not set. I believe this approach
> > > may result in some CPU wastage.
> > >
> > > for i in {0..1023}; do
> > > echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
> > > done | sudo dmsetup create example
> > >
> > > In this specific scenario, a single iteration of the loop is all that
> > > is needed.
> >
> > It's just one iteration of the loop either way. You either loop through
> > the targets or the devices. It's true that if you have lots of targets
> > all mapped to the same device, you would waste time looping through all
> > the targets instead of looping through the devices. But if you only had
> > one striped target mapped to lots of devices, you would waste time
> > looping through all of the devices instead of looping through the
> > targets.
>
> Yes, I get your point. This patchset may make things even worse for
> the striped target.
> I am just curious, in what scenario is the "dm-strip" target mapped to
> a large number of underlying devices from the same block device.
>
I don't think anyone in the real world does create dm-stripe devices with a
huge number of stripe table devices. My point was that it didn't seem
obvious me that looping through the targets was a significant problem
compared to looping through the devices.
At any rate, Mikulas's patch already does this optimally, even for
targets like dm-stripe, so it doesn't really matter now.
-Ben
On 2024/5/17 22:33, Benjamin Marzinski wrote:
> On Fri, May 17, 2024 at 03:48:49PM +0800, YangYang wrote:
>> On 2024/5/16 23:29, Benjamin Marzinski wrote:
>>> On Thu, May 16, 2024 at 09:55:53AM +0800, YangYang wrote:
>>>> On 2024/5/15 23:42, Benjamin Marzinski wrote:
>>>>> On Tue, May 14, 2024 at 05:04:42PM +0800, Yang Yang wrote:
>
>>>>
>>>> If I understand correctly, you are suggesting to iterate through all the
>>>> targets, handling those with sends_pass_around_flush set, and skipping
>>>> those where sends_pass_around_flush is not set. I believe this approach
>>>> may result in some CPU wastage.
>>>>
>>>> for i in {0..1023}; do
>>>> echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>>>> done | sudo dmsetup create example
>>>>
>>>> In this specific scenario, a single iteration of the loop is all that
>>>> is needed.
>>>
>>> It's just one iteration of the loop either way. You either loop through
>>> the targets or the devices. It's true that if you have lots of targets
>>> all mapped to the same device, you would waste time looping through all
>>> the targets instead of looping through the devices. But if you only had
>>> one striped target mapped to lots of devices, you would waste time
>>> looping through all of the devices instead of looping through the
>>> targets.
>>
>> Yes, I get your point. This patchset may make things even worse for
>> the striped target.
>> I am just curious, in what scenario is the "dm-strip" target mapped to
>> a large number of underlying devices from the same block device.
>>
>
> I don't think anyone in the real world does create dm-stripe devices with a
> huge number of stripe table devices. My point was that it didn't seem
> obvious me that looping through the targets was a significant problem
> compared to looping through the devices.
>
> At any rate, Mikulas's patch already does this optimally, even for
> targets like dm-stripe, so it doesn't really matter now.
>
> -Ben
>
Yeah. Thanks for the explanation and your patience.
On Thu, May 16, 2024 at 10:49:55PM +0200, Mikulas Patocka wrote:
> Device mapper sends flush bios to all the targets and the targets send it
> to the underlying device. That may be inefficient, for example if a table
> contains 10 linear targets pointing to the same physical device, then
> device mapper would send 10 flush bios to that device - despite the fact
> that only one bio would be sufficient.
>
> This commit optimizes the flush behavior. It introduces a per-target
> variable flush_pass_around - it is set when the target supports flush
> optimization - currently, the dm-linear and dm-stripe targets support it.
> When all the targets in a table have flush_pass_around, flush_pass_around
> on the table is set. __send_empty_flush tests if the table has
> flush_pass_around - and if it has, no flush bios are sent to the targets
> and the list dm_table->devices is iterated and the flush bios are sent to
> each member of the list.
What does "pass around" mean? Seems like an awkward name for this.
(Naming can be hard, I don't have better suggestions at the moment.)
> Signed-off-by: Mikulas Patocka <[email protected]>
> Reported-by: Yang Yang <[email protected]>
>
> ---
> drivers/md/dm-core.h | 4 ++-
> drivers/md/dm-linear.c | 1
> drivers/md/dm-stripe.c | 1
> drivers/md/dm-table.c | 4 +++
> drivers/md/dm.c | 47 +++++++++++++++++++++++++++++-------------
> include/linux/device-mapper.h | 5 ++++
> 6 files changed, 47 insertions(+), 15 deletions(-)
>
> Index: linux-2.6/drivers/md/dm-core.h
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-core.h 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm-core.h 2024-05-15 16:56:49.000000000 +0200
> @@ -206,7 +206,9 @@ struct dm_table {
>
> bool integrity_supported:1;
> bool singleton:1;
> - unsigned integrity_added:1;
> + bool integrity_added:1;
> + /* set if all the targets in the table have "flush_pass_around" set */
> + bool flush_pass_around:1;
>
> /*
> * Indicates the rw permissions for the new logical device. This
> Index: linux-2.6/drivers/md/dm-linear.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-linear.c 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm-linear.c 2024-05-15 16:56:49.000000000 +0200
> @@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *
> ti->num_discard_bios = 1;
> ti->num_secure_erase_bios = 1;
> ti->num_write_zeroes_bios = 1;
> + ti->flush_pass_around = true;
> ti->private = lc;
> return 0;
>
> Index: linux-2.6/drivers/md/dm-stripe.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-stripe.c 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm-stripe.c 2024-05-15 16:56:49.000000000 +0200
> @@ -157,6 +157,7 @@ static int stripe_ctr(struct dm_target *
> ti->num_discard_bios = stripes;
> ti->num_secure_erase_bios = stripes;
> ti->num_write_zeroes_bios = stripes;
> + ti->flush_pass_around = true;
>
> sc->chunk_size = chunk_size;
> if (chunk_size & (chunk_size - 1))
> Index: linux-2.6/drivers/md/dm-table.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-table.c 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm-table.c 2024-05-15 16:56:49.000000000 +0200
> @@ -160,6 +160,7 @@ int dm_table_create(struct dm_table **re
> t->type = DM_TYPE_NONE;
> t->mode = mode;
> t->md = md;
> + t->flush_pass_around = 1;
> *result = t;
> return 0;
> }
Should be: t->flush_pass_around = true;
> @@ -738,6 +739,9 @@ int dm_table_add_target(struct dm_table
> if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key))
> static_branch_enable(&swap_bios_enabled);
>
> + if (!ti->flush_pass_around)
> + t->flush_pass_around = false;
> +
> return 0;
>
> bad:
> Index: linux-2.6/include/linux/device-mapper.h
> ===================================================================
> --- linux-2.6.orig/include/linux/device-mapper.h 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/include/linux/device-mapper.h 2024-05-15 16:56:49.000000000 +0200
> @@ -397,6 +397,11 @@ struct dm_target {
> * bio_set_dev(). NOTE: ideally a target should _not_ need this.
> */
> bool needs_bio_set_dev:1;
> +
> + /*
> + * Set if the target supports flush optimization
> + */
> + bool flush_pass_around:1;
> };
How does a developer _know_ if a target can set this flag? Please
elaborate on the requirements in this code comment.
>
> void *dm_per_bio_data(struct bio *bio, size_t data_size);
> Index: linux-2.6/drivers/md/dm.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm.c 2024-05-15 16:56:49.000000000 +0200
> +++ linux-2.6/drivers/md/dm.c 2024-05-16 20:06:32.000000000 +0200
> @@ -645,7 +645,7 @@ static struct bio *alloc_tio(struct clon
>
> /* Set default bdev, but target must bio_set_dev() before issuing IO */
> clone->bi_bdev = md->disk->part0;
> - if (unlikely(ti->needs_bio_set_dev))
> + if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
> bio_set_dev(clone, md->disk->part0);
>
> if (len) {
> @@ -1107,7 +1107,7 @@ static void clone_endio(struct bio *bio)
> blk_status_t error = bio->bi_status;
> struct dm_target_io *tio = clone_to_tio(bio);
> struct dm_target *ti = tio->ti;
> - dm_endio_fn endio = ti->type->end_io;
> + dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL;
> struct dm_io *io = tio->io;
> struct mapped_device *md = io->md;
>
> @@ -1154,7 +1154,7 @@ static void clone_endio(struct bio *bio)
> }
>
> if (static_branch_unlikely(&swap_bios_enabled) &&
> - unlikely(swap_bios_limit(ti, bio)))
> + likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
> up(&md->swap_bios_semaphore);
>
> free_tio(bio);
What is it about this commit that makes it important to verify ti
isn't NULL in the above 3 hunks?
Should these NULL checks be factored out as a separate fix?
Or can these hunks be dropped?
> @@ -1566,17 +1566,36 @@ static void __send_empty_flush(struct cl
> ci->sector_count = 0;
> ci->io->tio.clone.bi_iter.bi_size = 0;
>
> - for (unsigned int i = 0; i < t->num_targets; i++) {
> - unsigned int bios;
> - struct dm_target *ti = dm_table_get_target(t, i);
> -
> - if (unlikely(ti->num_flush_bios == 0))
> - continue;
> -
> - atomic_add(ti->num_flush_bios, &ci->io->io_count);
> - bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> - NULL, GFP_NOWAIT);
> - atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> + if (!t->flush_pass_around) {
> + for (unsigned int i = 0; i < t->num_targets; i++) {
> + unsigned int bios;
> + struct dm_target *ti = dm_table_get_target(t, i);
> +
> + if (unlikely(ti->num_flush_bios == 0))
> + continue;
> +
> + atomic_add(ti->num_flush_bios, &ci->io->io_count);
> + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> + NULL, GFP_NOWAIT);
> + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> + }
> + } else {
> + /*
> + * Note that there's no need to grab t->devices_lock here
> + * because the targets that support flush pass-around don't
> + * modify the list of devices.
> + */
> + struct list_head *devices = dm_table_get_devices(t);
> + unsigned int len = 0;
> + struct dm_dev_internal *dd;
> + list_for_each_entry(dd, devices, list) {
> + struct bio *clone;
> + clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
> + atomic_add(1, &ci->io->io_count);
> + bio_set_dev(clone, dd->dm_dev->bdev);
> + clone->bi_end_io = clone_endio;
> + dm_submit_bio_remap(clone, NULL);
> + }
> }
>
> /*
>
>
Still missing what "pass-around" is meant to convey given that you
aren't passing around the same flush... you're cloning a new flush and
issuing one per device. Probably worth explaining that's what you
mean by "flush_pass_around" (both in commit header and elaborate in
code)?
Also, you're issuing a flush to _all_ devices in a table. Not just
the data devices. .iterate_devices returns only the data devices.
If/when there is a need to extend this feature to targets that have
metadata devices (e.g. dm-thin, cache, etc): would it make sense to
filter out non-data devices (by stepping through each target in the
table and using iterate_devices)?
Mike
On Wed, 22 May 2024, Mike Snitzer wrote:
> On Thu, May 16, 2024 at 10:49:55PM +0200, Mikulas Patocka wrote:
> > Device mapper sends flush bios to all the targets and the targets send it
> > to the underlying device. That may be inefficient, for example if a table
> > contains 10 linear targets pointing to the same physical device, then
> > device mapper would send 10 flush bios to that device - despite the fact
> > that only one bio would be sufficient.
> >
> > This commit optimizes the flush behavior. It introduces a per-target
> > variable flush_pass_around - it is set when the target supports flush
> > optimization - currently, the dm-linear and dm-stripe targets support it.
> > When all the targets in a table have flush_pass_around, flush_pass_around
> > on the table is set. __send_empty_flush tests if the table has
> > flush_pass_around - and if it has, no flush bios are sent to the targets
> > and the list dm_table->devices is iterated and the flush bios are sent to
> > each member of the list.
>
> What does "pass around" mean? Seems like an awkward name for this.
> (Naming can be hard, I don't have better suggestions at the moment.)
What about "flush_bypass" or "flush_bypasses_map"?
> > Index: linux-2.6/drivers/md/dm-table.c
> > ===================================================================
> > --- linux-2.6.orig/drivers/md/dm-table.c 2024-05-15 16:56:49.000000000 +0200
> > +++ linux-2.6/drivers/md/dm-table.c 2024-05-15 16:56:49.000000000 +0200
> > @@ -160,6 +160,7 @@ int dm_table_create(struct dm_table **re
> > t->type = DM_TYPE_NONE;
> > t->mode = mode;
> > t->md = md;
> > + t->flush_pass_around = 1;
> > *result = t;
> > return 0;
> > }
>
> Should be: t->flush_pass_around = true;
Yes.
> > +
> > + /*
> > + * Set if the target supports flush optimization
> > + */
> > + bool flush_pass_around:1;
> > };
>
> How does a developer _know_ if a target can set this flag? Please
> elaborate on the requirements in this code comment.
What about:
"The target supports flush optimization. When all the targets in the table
support flush optimization, flushes will not use the "map" method and they
will be sent directly to all the devices in the table. This optimization
reduces the number of flushes that are being sent if multiple targets use
the same underlying device."
> >
> > void *dm_per_bio_data(struct bio *bio, size_t data_size);
> > Index: linux-2.6/drivers/md/dm.c
> > ===================================================================
> > --- linux-2.6.orig/drivers/md/dm.c 2024-05-15 16:56:49.000000000 +0200
> > +++ linux-2.6/drivers/md/dm.c 2024-05-16 20:06:32.000000000 +0200
> > @@ -645,7 +645,7 @@ static struct bio *alloc_tio(struct clon
> >
> > /* Set default bdev, but target must bio_set_dev() before issuing IO */
> > clone->bi_bdev = md->disk->part0;
> > - if (unlikely(ti->needs_bio_set_dev))
> > + if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
> > bio_set_dev(clone, md->disk->part0);
> >
> > if (len) {
> > @@ -1107,7 +1107,7 @@ static void clone_endio(struct bio *bio)
> > blk_status_t error = bio->bi_status;
> > struct dm_target_io *tio = clone_to_tio(bio);
> > struct dm_target *ti = tio->ti;
> > - dm_endio_fn endio = ti->type->end_io;
> > + dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL;
> > struct dm_io *io = tio->io;
> > struct mapped_device *md = io->md;
> >
> > @@ -1154,7 +1154,7 @@ static void clone_endio(struct bio *bio)
> > }
> >
> > if (static_branch_unlikely(&swap_bios_enabled) &&
> > - unlikely(swap_bios_limit(ti, bio)))
> > + likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
> > up(&md->swap_bios_semaphore);
> >
> > free_tio(bio);
>
> What is it about this commit that makes it important to verify ti
> isn't NULL in the above 3 hunks?
>
> Should these NULL checks be factored out as a separate fix?
>
> Or can these hunks be dropped?
They can't be dropped.
When performing the flush bypass optimization, the dm core creates a
dm_target_io structure that isn't associated with any specific target. So,
the pointer "tio->ti" is NULL.
I could set "tio->ti" to any target, but I think it's better to set it to
NULL, just to mark that there is no target association.
> > @@ -1566,17 +1566,36 @@ static void __send_empty_flush(struct cl
> > ci->sector_count = 0;
> > ci->io->tio.clone.bi_iter.bi_size = 0;
> >
> > - for (unsigned int i = 0; i < t->num_targets; i++) {
> > - unsigned int bios;
> > - struct dm_target *ti = dm_table_get_target(t, i);
> > -
> > - if (unlikely(ti->num_flush_bios == 0))
> > - continue;
> > -
> > - atomic_add(ti->num_flush_bios, &ci->io->io_count);
> > - bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> > - NULL, GFP_NOWAIT);
> > - atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> > + if (!t->flush_pass_around) {
> > + for (unsigned int i = 0; i < t->num_targets; i++) {
> > + unsigned int bios;
> > + struct dm_target *ti = dm_table_get_target(t, i);
> > +
> > + if (unlikely(ti->num_flush_bios == 0))
> > + continue;
> > +
> > + atomic_add(ti->num_flush_bios, &ci->io->io_count);
> > + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> > + NULL, GFP_NOWAIT);
> > + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> > + }
> > + } else {
> > + /*
> > + * Note that there's no need to grab t->devices_lock here
> > + * because the targets that support flush pass-around don't
> > + * modify the list of devices.
> > + */
> > + struct list_head *devices = dm_table_get_devices(t);
> > + unsigned int len = 0;
> > + struct dm_dev_internal *dd;
> > + list_for_each_entry(dd, devices, list) {
> > + struct bio *clone;
> > + clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
^^^^
Here we set tio->ti to NULL.
> > + atomic_add(1, &ci->io->io_count);
> > + bio_set_dev(clone, dd->dm_dev->bdev);
> > + clone->bi_end_io = clone_endio;
> > + dm_submit_bio_remap(clone, NULL);
> > + }
> > }
> >
> > /*
> >
> >
>
> Still missing what "pass-around" is meant to convey given that you
> aren't passing around the same flush... you're cloning a new flush and
> issuing one per device. Probably worth explaining that's what you
> mean by "flush_pass_around" (both in commit header and elaborate in
> code)?
I mean that flushes bypass the map method.
> Also, you're issuing a flush to _all_ devices in a table. Not just
> the data devices. .iterate_devices returns only the data devices.
> If/when there is a need to extend this feature to targets that have
> metadata devices (e.g. dm-thin, cache, etc): would it make sense to
> filter out non-data devices (by stepping through each target in the
> table and using iterate_devices)?
This optimization only makes sense if there are multiple targets in the
table. dm-thin, dm-cache, dm-raid is usually the only target in the table,
so the optimization doesn't make sense for them. Trying to support the
"flush bypass" optimization for them would bloat the code without reducing
the number of flush requests at all.
> Mike
Mikulas
Hi
Here I'm resending the patch, with more comments and explanations added.
Mikulas
From: Mikulas Patocka <[email protected]>
Device mapper sends flush bios to all the targets and the targets send it
to the underlying device. That may be inefficient, for example if a table
contains 10 linear targets pointing to the same physical device, then
device mapper would send 10 flush bios to that device - despite the fact
that only one bio would be sufficient.
This commit optimizes the flush behavior. It introduces a per-target
variable flush_bypasses_map - it is set when the target supports flush
optimization - currently, the dm-linear and dm-stripe targets support it.
When all the targets in a table have flush_bypasses_map,
flush_bypasses_map on the table is set. __send_empty_flush tests if the
table has flush_bypasses_map - and if it has, no flush bios are sent to
the targets via the "map" method and the list dm_table->devices is
iterated and the flush bios are sent to each member of the list.
Signed-off-by: Mikulas Patocka <[email protected]>
Suggested-by: Yang Yang <[email protected]>
---
drivers/md/dm-core.h | 4 ++-
drivers/md/dm-linear.c | 1
drivers/md/dm-stripe.c | 1
drivers/md/dm-table.c | 4 +++
drivers/md/dm.c | 54 +++++++++++++++++++++++++++++++-----------
include/linux/device-mapper.h | 15 +++++++++++
6 files changed, 64 insertions(+), 15 deletions(-)
Index: linux-2.6/drivers/md/dm-core.h
===================================================================
--- linux-2.6.orig/drivers/md/dm-core.h 2024-05-23 19:00:00.000000000 +0200
+++ linux-2.6/drivers/md/dm-core.h 2024-05-23 19:00:00.000000000 +0200
@@ -206,7 +206,9 @@ struct dm_table {
bool integrity_supported:1;
bool singleton:1;
- unsigned integrity_added:1;
+ bool integrity_added:1;
+ /* set if all the targets in the table have "flush_bypasses_map" set */
+ bool flush_bypasses_map:1;
/*
* Indicates the rw permissions for the new logical device. This
Index: linux-2.6/drivers/md/dm-linear.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-linear.c 2024-05-23 19:00:00.000000000 +0200
+++ linux-2.6/drivers/md/dm-linear.c 2024-05-23 19:00:00.000000000 +0200
@@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *
ti->num_discard_bios = 1;
ti->num_secure_erase_bios = 1;
ti->num_write_zeroes_bios = 1;
+ ti->flush_bypasses_map = true;
ti->private = lc;
return 0;
Index: linux-2.6/drivers/md/dm-stripe.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-stripe.c 2024-05-23 19:00:00.000000000 +0200
+++ linux-2.6/drivers/md/dm-stripe.c 2024-05-23 19:00:00.000000000 +0200
@@ -157,6 +157,7 @@ static int stripe_ctr(struct dm_target *
ti->num_discard_bios = stripes;
ti->num_secure_erase_bios = stripes;
ti->num_write_zeroes_bios = stripes;
+ ti->flush_bypasses_map = true;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
Index: linux-2.6/drivers/md/dm-table.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-table.c 2024-05-23 19:00:00.000000000 +0200
+++ linux-2.6/drivers/md/dm-table.c 2024-05-23 19:00:00.000000000 +0200
@@ -160,6 +160,7 @@ int dm_table_create(struct dm_table **re
t->type = DM_TYPE_NONE;
t->mode = mode;
t->md = md;
+ t->flush_bypasses_map = true;
*result = t;
return 0;
}
@@ -738,6 +739,9 @@ int dm_table_add_target(struct dm_table
if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key))
static_branch_enable(&swap_bios_enabled);
+ if (!ti->flush_bypasses_map)
+ t->flush_bypasses_map = false;
+
return 0;
bad:
Index: linux-2.6/include/linux/device-mapper.h
===================================================================
--- linux-2.6.orig/include/linux/device-mapper.h 2024-05-23 19:00:00.000000000 +0200
+++ linux-2.6/include/linux/device-mapper.h 2024-05-23 19:18:01.000000000 +0200
@@ -397,6 +397,21 @@ struct dm_target {
* bio_set_dev(). NOTE: ideally a target should _not_ need this.
*/
bool needs_bio_set_dev:1;
+
+ /*
+ * Set if the target supports flush optimization. If all the targets in
+ * a table have flush_bypasses_map set, the dm core will not send
+ * flushes to the targets via a ->map method. It will iterate over
+ * dm_table->devices and send flushes to the devices directly. This
+ * optimization reduces the number of flushes being sent when multiple
+ * targets in a table use the same underlying device.
+ *
+ * This optimization may be enabled on targets that just pass the
+ * flushes to the underlying devices without performing any other
+ * actions on the flush request. Currently, dm-linear and dm-stripe
+ * support it.
+ */
+ bool flush_bypasses_map:1;
};
void *dm_per_bio_data(struct bio *bio, size_t data_size);
Index: linux-2.6/drivers/md/dm.c
===================================================================
--- linux-2.6.orig/drivers/md/dm.c 2024-05-23 19:00:00.000000000 +0200
+++ linux-2.6/drivers/md/dm.c 2024-05-23 19:31:14.000000000 +0200
@@ -645,7 +645,7 @@ static struct bio *alloc_tio(struct clon
/* Set default bdev, but target must bio_set_dev() before issuing IO */
clone->bi_bdev = md->disk->part0;
- if (unlikely(ti->needs_bio_set_dev))
+ if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
bio_set_dev(clone, md->disk->part0);
if (len) {
@@ -1107,7 +1107,7 @@ static void clone_endio(struct bio *bio)
blk_status_t error = bio->bi_status;
struct dm_target_io *tio = clone_to_tio(bio);
struct dm_target *ti = tio->ti;
- dm_endio_fn endio = ti->type->end_io;
+ dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL;
struct dm_io *io = tio->io;
struct mapped_device *md = io->md;
@@ -1154,7 +1154,7 @@ static void clone_endio(struct bio *bio)
}
if (static_branch_unlikely(&swap_bios_enabled) &&
- unlikely(swap_bios_limit(ti, bio)))
+ likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
up(&md->swap_bios_semaphore);
free_tio(bio);
@@ -1566,17 +1566,43 @@ static void __send_empty_flush(struct cl
ci->sector_count = 0;
ci->io->tio.clone.bi_iter.bi_size = 0;
- for (unsigned int i = 0; i < t->num_targets; i++) {
- unsigned int bios;
- struct dm_target *ti = dm_table_get_target(t, i);
-
- if (unlikely(ti->num_flush_bios == 0))
- continue;
-
- atomic_add(ti->num_flush_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
- NULL, GFP_NOWAIT);
- atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ if (!t->flush_bypasses_map) {
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ unsigned int bios;
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (unlikely(ti->num_flush_bios == 0))
+ continue;
+
+ atomic_add(ti->num_flush_bios, &ci->io->io_count);
+ bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
+ NULL, GFP_NOWAIT);
+ atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ }
+ } else {
+ /*
+ * Note that there's no need to grab t->devices_lock here
+ * because the targets that support flush optimization don't
+ * modify the list of devices.
+ */
+ struct list_head *devices = dm_table_get_devices(t);
+ unsigned int len = 0;
+ struct dm_dev_internal *dd;
+ list_for_each_entry(dd, devices, list) {
+ struct bio *clone;
+ /*
+ * Note that the structure dm_target_io is not
+ * associated with any target (because the device may be
+ * used by multiple targets), so we set tio->ti = NULL.
+ * We must check for NULL in the I/O processing path, to
+ * avoid NULL pointer dereference.
+ */
+ clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
+ atomic_add(1, &ci->io->io_count);
+ bio_set_dev(clone, dd->dm_dev->bdev);
+ clone->bi_end_io = clone_endio;
+ dm_submit_bio_remap(clone, NULL);
+ }
}
/*
On Thu, May 23, 2024 at 07:46:25PM +0200, Mikulas Patocka wrote:
> Hi
>
> Here I'm resending the patch, with more comments and explanations added.
>
> Mikulas
>
>
> From: Mikulas Patocka <[email protected]>
>
> Device mapper sends flush bios to all the targets and the targets send it
> to the underlying device. That may be inefficient, for example if a table
> contains 10 linear targets pointing to the same physical device, then
> device mapper would send 10 flush bios to that device - despite the fact
> that only one bio would be sufficient.
>
> This commit optimizes the flush behavior. It introduces a per-target
> variable flush_bypasses_map - it is set when the target supports flush
> optimization - currently, the dm-linear and dm-stripe targets support it.
> When all the targets in a table have flush_bypasses_map,
> flush_bypasses_map on the table is set. __send_empty_flush tests if the
> table has flush_bypasses_map - and if it has, no flush bios are sent to
> the targets via the "map" method and the list dm_table->devices is
> iterated and the flush bios are sent to each member of the list.
>
> Signed-off-by: Mikulas Patocka <[email protected]>
> Suggested-by: Yang Yang <[email protected]>
Nicely done, please feel free to stage for 6.11 (create a new
'dm-6.11' branch starting from 'dm-6.10' -- we'll need to rebase
dm-6.11 to 6.10-rc2 or so but at least we'll get this commit in the
pipeline, push to 'for-next').
Reviewed-by: Mike Snitzer <[email protected]>
On Wed, 22 May 2024, Mike Snitzer wrote:
> On Thu, May 16, 2024 at 10:49:55PM +0200, Mikulas Patocka wrote:
> > Device mapper sends flush bios to all the targets and the targets send it
> > to the underlying device. That may be inefficient, for example if a table
> > contains 10 linear targets pointing to the same physical device, then
> > device mapper would send 10 flush bios to that device - despite the fact
> > that only one bio would be sufficient.
> >
> > This commit optimizes the flush behavior. It introduces a per-target
> > variable flush_pass_around - it is set when the target supports flush
> > optimization - currently, the dm-linear and dm-stripe targets support it.
> > When all the targets in a table have flush_pass_around, flush_pass_around
> > on the table is set. __send_empty_flush tests if the table has
> > flush_pass_around - and if it has, no flush bios are sent to the targets
> > and the list dm_table->devices is iterated and the flush bios are sent to
> > each member of the list.
>
> What does "pass around" mean? Seems like an awkward name for this.
> (Naming can be hard, I don't have better suggestions at the moment.)
just playing with naming ideas from other disciplines in case you likes
one of these concepts better than "pass around". I'm not attached any of
these, this is just for making conversation so the flag can be easily
understood:
- flush_can_scatter (as in scatter/gather)
- flush_can_distribute
- flush_can_spread
- flush_deduplicate
..
> > Index: linux-2.6/include/linux/device-mapper.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/device-mapper.h 2024-05-15 16:56:49.000000000 +0200
> > +++ linux-2.6/include/linux/device-mapper.h 2024-05-15 16:56:49.000000000 +0200
> > @@ -397,6 +397,11 @@ struct dm_target {
> > * bio_set_dev(). NOTE: ideally a target should _not_ need this.
> > */
> > bool needs_bio_set_dev:1;
> > +
> > + /*
> > + * Set if the target supports flush optimization
> > + */
> > + bool flush_pass_around:1;
> > };
>
> How does a developer _know_ if a target can set this flag? Please
> elaborate on the requirements in this code comment.
Relatedly,
To what extent can this be set automatically? For example, if you have a
bunch of non-DM (eg, SCSI) disks under a device mapper target, then it
seems reasonable that they would "support" this feature in the identity
sense: they can take flush and it will (should) not be spread to other
devices in the DM stack, so the device mapper targets being instantiated
in such a case would enable this flag. Thus, a new target that only has
non-DM devices can (probably?) default enabled; maybe there are counter
examples here.
Another consideration is for targets (eg, dm-thinpool) which have multiple
lower-level block devices on the same table definition line, often a data
and metadata volume. By contrast, linear tables may have multiple backing
devices in separate target table lines. This may be further complicated
by the fact that a device mapper target can be composed of multiple
disparate targets as separate table lines, each of which may have a
different number of backing devices for their own definition.
Perhaps your design already covers these edge cases, so I am only
mentioning this in case it may prompt ideas for other edge cases
to review.
--
Eric Wheeler
>
> >
> > void *dm_per_bio_data(struct bio *bio, size_t data_size);
> > Index: linux-2.6/drivers/md/dm.c
> > ===================================================================
> > --- linux-2.6.orig/drivers/md/dm.c 2024-05-15 16:56:49.000000000 +0200
> > +++ linux-2.6/drivers/md/dm.c 2024-05-16 20:06:32.000000000 +0200
> > @@ -645,7 +645,7 @@ static struct bio *alloc_tio(struct clon
> >
> > /* Set default bdev, but target must bio_set_dev() before issuing IO */
> > clone->bi_bdev = md->disk->part0;
> > - if (unlikely(ti->needs_bio_set_dev))
> > + if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
> > bio_set_dev(clone, md->disk->part0);
> >
> > if (len) {
> > @@ -1107,7 +1107,7 @@ static void clone_endio(struct bio *bio)
> > blk_status_t error = bio->bi_status;
> > struct dm_target_io *tio = clone_to_tio(bio);
> > struct dm_target *ti = tio->ti;
> > - dm_endio_fn endio = ti->type->end_io;
> > + dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL;
> > struct dm_io *io = tio->io;
> > struct mapped_device *md = io->md;
> >
> > @@ -1154,7 +1154,7 @@ static void clone_endio(struct bio *bio)
> > }
> >
> > if (static_branch_unlikely(&swap_bios_enabled) &&
> > - unlikely(swap_bios_limit(ti, bio)))
> > + likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
> > up(&md->swap_bios_semaphore);
> >
> > free_tio(bio);
>
> What is it about this commit that makes it important to verify ti
> isn't NULL in the above 3 hunks?
>
> Should these NULL checks be factored out as a separate fix?
>
> Or can these hunks be dropped?
>
> > @@ -1566,17 +1566,36 @@ static void __send_empty_flush(struct cl
> > ci->sector_count = 0;
> > ci->io->tio.clone.bi_iter.bi_size = 0;
> >
> > - for (unsigned int i = 0; i < t->num_targets; i++) {
> > - unsigned int bios;
> > - struct dm_target *ti = dm_table_get_target(t, i);
> > -
> > - if (unlikely(ti->num_flush_bios == 0))
> > - continue;
> > -
> > - atomic_add(ti->num_flush_bios, &ci->io->io_count);
> > - bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> > - NULL, GFP_NOWAIT);
> > - atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> > + if (!t->flush_pass_around) {
> > + for (unsigned int i = 0; i < t->num_targets; i++) {
> > + unsigned int bios;
> > + struct dm_target *ti = dm_table_get_target(t, i);
> > +
> > + if (unlikely(ti->num_flush_bios == 0))
> > + continue;
> > +
> > + atomic_add(ti->num_flush_bios, &ci->io->io_count);
> > + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> > + NULL, GFP_NOWAIT);
> > + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> > + }
> > + } else {
> > + /*
> > + * Note that there's no need to grab t->devices_lock here
> > + * because the targets that support flush pass-around don't
> > + * modify the list of devices.
> > + */
> > + struct list_head *devices = dm_table_get_devices(t);
> > + unsigned int len = 0;
> > + struct dm_dev_internal *dd;
> > + list_for_each_entry(dd, devices, list) {
> > + struct bio *clone;
> > + clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
> > + atomic_add(1, &ci->io->io_count);
> > + bio_set_dev(clone, dd->dm_dev->bdev);
> > + clone->bi_end_io = clone_endio;
> > + dm_submit_bio_remap(clone, NULL);
> > + }
> > }
> >
> > /*
> >
> >
>
> Still missing what "pass-around" is meant to convey given that you
> aren't passing around the same flush... you're cloning a new flush and
> issuing one per device. Probably worth explaining that's what you
> mean by "flush_pass_around" (both in commit header and elaborate in
> code)?
>
> Also, you're issuing a flush to _all_ devices in a table. Not just
> the data devices. .iterate_devices returns only the data devices.
> If/when there is a need to extend this feature to targets that have
> metadata devices (e.g. dm-thin, cache, etc): would it make sense to
> filter out non-data devices (by stepping through each target in the
> table and using iterate_devices)?
>
> Mike
>
>
On Thu, 23 May 2024, Mike Snitzer wrote:
> On Thu, May 23, 2024 at 07:46:25PM +0200, Mikulas Patocka wrote:
> > Hi
> >
> > Here I'm resending the patch, with more comments and explanations added.
> >
> > Mikulas
> >
> >
> > From: Mikulas Patocka <[email protected]>
> >
> > Device mapper sends flush bios to all the targets and the targets send it
> > to the underlying device. That may be inefficient, for example if a table
> > contains 10 linear targets pointing to the same physical device, then
> > device mapper would send 10 flush bios to that device - despite the fact
> > that only one bio would be sufficient.
> >
> > This commit optimizes the flush behavior. It introduces a per-target
> > variable flush_bypasses_map - it is set when the target supports flush
> > optimization - currently, the dm-linear and dm-stripe targets support it.
> > When all the targets in a table have flush_bypasses_map,
> > flush_bypasses_map on the table is set. __send_empty_flush tests if the
> > table has flush_bypasses_map - and if it has, no flush bios are sent to
> > the targets via the "map" method and the list dm_table->devices is
> > iterated and the flush bios are sent to each member of the list.
> >
> > Signed-off-by: Mikulas Patocka <[email protected]>
> > Suggested-by: Yang Yang <[email protected]>
>
> Nicely done, please feel free to stage for 6.11 (create a new
> 'dm-6.11' branch starting from 'dm-6.10' -- we'll need to rebase
> dm-6.11 to 6.10-rc2 or so but at least we'll get this commit in the
> pipeline, push to 'for-next').
>
> Reviewed-by: Mike Snitzer <[email protected]>
OK, done.
Mikulas