__send_empty_flush() sends empty flush bios to every target in the
dm_table. However, if the num_targets exceeds the number of block
devices in the dm_table's device list, it could lead to multiple
invocations of __send_duplicate_bios() for the same block device.
Typically, a single thread sending numerous empty flush bios to one
block device is redundant, as these bios are likely to be merged by the
flush state machine. In scenarios where num_targets significantly
outweighs the number of block devices, such behavior may result in a
noteworthy decrease in performance.
This is a real-world scenario that we have encountered:
1) Call fallocate(file_fd, 0, 0, SZ_8G)
2) Call ioctl(file_fd, FS_IOC_FIEMAP, fiemap). In situations of severe
file system fragmentation, fiemap->fm_mapped_extents may exceed 1000.
3) Create a dm-linear device based on fiemap->fm_extents
4) Create a snapshot-cow device based on the dm-linear device
Perf diff of fio test:
fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
--ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
--blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
Scenario one:
for i in {0..1023}; do
echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
done | sudo dmsetup create example
Before: bw=857KiB/
After: bw=30.8MiB/s +3580%
Scenario two:
for i in {0..1023}; do
if [[ $i -gt 511 ]]; then
echo $((8000*$i)) 8000 linear /dev/nvme0n1p6 $((16384*$i))
else
echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
fi
done | sudo dmsetup create example
Before: bw=1470KiB/
After: bw=33.9MiB/s +2261%
Any comments are welcome!
V4:
-- Fix the build warning "no semicolon at end of struct or union."
V3:
-- Focus on targets with num_flush_bios equal to 1 to simplify the code
-- Use t->devices_lock to protect the dm_table's device list
V2:
-- Split into smaller pieces that are easier to review
-- Add flush_pass_around, suggested by Mikulas Patocka
-- Handling different target types separately
Yang Yang (5):
dm: introduce flush_pass_around flag
dm: add __send_empty_flush_bios() helper
dm: support retrieving struct dm_target from struct dm_dev
dm: Avoid sending redundant empty flush bios to the same block device
dm linear: enable flush optimization function
drivers/md/dm-core.h | 3 +++
drivers/md/dm-ioctl.c | 4 ++++
drivers/md/dm-linear.c | 1 +
drivers/md/dm-table.c | 19 +++++++++++++++++++
drivers/md/dm.c | 34 +++++++++++++++++++++++++---------
include/linux/device-mapper.h | 6 ++++++
6 files changed, 58 insertions(+), 9 deletions(-)
--
2.34.1
introduce a per-target bit "flush_pass_around" and means that the
target supports flush optimization.
set a per-table "flush_pass_around" bit if all the targets in the
table have "flush_pass_around" set.
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm-core.h | 3 +++
drivers/md/dm-ioctl.c | 4 ++++
drivers/md/dm-table.c | 3 +++
include/linux/device-mapper.h | 5 +++++
4 files changed, 15 insertions(+)
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dcca..b273f25b634d 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -208,6 +208,9 @@ struct dm_table {
bool singleton:1;
unsigned integrity_added:1;
+ /* set if all the targets in the table have "flush_pass_around" set */
+ bool flush_pass_around:1;
+
/*
* Indicates the rw permissions for the new logical device. This
* should be a combination of BLK_OPEN_READ and BLK_OPEN_WRITE.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c2c07bfa6471..bb178df2a340 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1445,6 +1445,8 @@ static int populate_table(struct dm_table *table,
return -EINVAL;
}
+ table->flush_pass_around = 1;
+
for (i = 0; i < param->target_count; i++) {
const char *nul_terminator;
@@ -2279,6 +2281,8 @@ int __init dm_early_create(struct dm_ioctl *dmi,
if (r)
goto err_hash_remove;
+ t->flush_pass_around = 1;
+
/* add targets */
for (i = 0; i < dmi->target_count; i++) {
r = dm_table_add_target(t, spec_array[i]->target_type,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..bd68af10afed 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -738,6 +738,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key))
static_branch_enable(&swap_bios_enabled);
+ if (ti->flush_pass_around == 0)
+ t->flush_pass_around = 0;
+
return 0;
bad:
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 82b2195efaca..0893ff8c01b6 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -397,6 +397,11 @@ struct dm_target {
* bio_set_dev(). NOTE: ideally a target should _not_ need this.
*/
bool needs_bio_set_dev:1;
+
+ /*
+ * Set if the target supports flush optimization
+ */
+ bool flush_pass_around:1;
};
void *dm_per_bio_data(struct bio *bio, size_t data_size);
--
2.34.1
There are no functional changes, the helper will be used in later
patches.
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm.c | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d71..25215b93c3cf 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1543,6 +1543,20 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
return ret;
}
+static void __send_empty_flush_bios(struct dm_table *t, struct dm_target *ti,
+ struct clone_info *ci)
+{
+ unsigned int bios;
+
+ if (unlikely(ti->num_flush_bios == 0))
+ return;
+
+ atomic_add(ti->num_flush_bios, &ci->io->io_count);
+ bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
+ NULL, GFP_NOWAIT);
+ atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+}
+
static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
@@ -1561,16 +1575,9 @@ static void __send_empty_flush(struct clone_info *ci)
ci->io->tio.clone.bi_iter.bi_size = 0;
for (unsigned int i = 0; i < t->num_targets; i++) {
- unsigned int bios;
struct dm_target *ti = dm_table_get_target(t, i);
- if (unlikely(ti->num_flush_bios == 0))
- continue;
-
- atomic_add(ti->num_flush_bios, &ci->io->io_count);
- bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
- NULL, GFP_NOWAIT);
- atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ __send_empty_flush_bios(t, ti, ci);
}
/*
--
2.34.1
Add a list to the struct dm_dev structure to store the associated
targets.
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm-table.c | 16 ++++++++++++++++
include/linux/device-mapper.h | 1 +
2 files changed, 17 insertions(+)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bd68af10afed..4817ddb660c4 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -2134,6 +2134,13 @@ void dm_table_postsuspend_targets(struct dm_table *t)
suspend_targets(t, POSTSUSPEND);
}
+static int dm_link_dev_to_target(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ dev->ti = ti;
+ return 0;
+}
+
int dm_table_resume_targets(struct dm_table *t)
{
unsigned int i;
@@ -2162,6 +2169,15 @@ int dm_table_resume_targets(struct dm_table *t)
ti->type->resume(ti);
}
+ if (t->flush_pass_around) {
+ for (i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (ti->type->iterate_devices)
+ ti->type->iterate_devices(ti, dm_link_dev_to_target, NULL);
+ }
+ }
+
return 0;
}
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 0893ff8c01b6..c6696f88bf66 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -169,6 +169,7 @@ struct dm_dev {
struct dax_device *dax_dev;
blk_mode_t mode;
char name[16];
+ struct dm_target *ti;
};
/*
--
2.34.1
If the num_targets is greater than the number of dm_devs in the
dm_table's devices list, __send_empty_flush() might invoke
__send_duplicate_bios() multiple times for the same block device.
This could lead to a substantial decrease in performance when
num_targets significantly exceeds the number of dm_devs.
This patch ensure that __send_duplicate_bios() is only called once
for each dm_dev.
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 25215b93c3cf..9dd0f5c97028 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1561,6 +1561,7 @@ static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
struct bio flush_bio;
+ struct dm_dev_internal *dd;
/*
* Use an on-stack bio for this, it's safe since we don't
@@ -1574,10 +1575,18 @@ static void __send_empty_flush(struct clone_info *ci)
ci->sector_count = 0;
ci->io->tio.clone.bi_iter.bi_size = 0;
- for (unsigned int i = 0; i < t->num_targets; i++) {
- struct dm_target *ti = dm_table_get_target(t, i);
+ if (!t->flush_pass_around) {
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+ __send_empty_flush_bios(t, ti, ci);
+ }
+ } else {
+ down_read(&t->devices_lock);
+
+ list_for_each_entry(dd, dm_table_get_devices(t), list)
+ __send_empty_flush_bios(t, dd->dm_dev->ti, ci);
- __send_empty_flush_bios(t, ti, ci);
+ up_read(&t->devices_lock);
}
/*
--
2.34.1
__send_empty_flush() sends empty flush bios to every target in the
dm_table. However, if the num_targets exceeds the number of block
devices in the dm_table's device list, it could lead to multiple
invocations of __send_duplicate_bios() for the same block device.
Typically, a single thread sending numerous empty flush bios to one
block device is redundant, as these bios are likely to be merged by the
flush state machine. In scenarios where num_targets significantly
outweighs the number of block devices, such behavior may result in a
noteworthy decrease in performance.
This issue can be reproduced using this command line:
for i in {0..1023}; do
echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
done | dmsetup create example
With this fix, a random write with fsync workload executed with the
following fio command:
fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
--ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
--blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
results in an increase from 857 KB/s to 30.8 MB/s of the write
throughput (3580% increase).
Signed-off-by: Yang Yang <[email protected]>
---
drivers/md/dm-linear.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 2d3e186ca87e..3e1a33b4d289 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_discard_bios = 1;
ti->num_secure_erase_bios = 1;
ti->num_write_zeroes_bios = 1;
+ ti->flush_pass_around = 1;
ti->private = lc;
return 0;
--
2.34.1