2022-10-06 22:28:27

by Jonathan Derrick

Subject: [PATCH 0/2] Bitmap percentage flushing

This introduces a percentage-flushing mechanism that works in tandem with
the delay timer. The threshold argument is expressed as a number of dirty
chunks rather than a raw percentage of capacity, because large drives
would otherwise require smaller and smaller percentages (e.g., 1% of a
32TB drive is 320GB).

The first patch fixes a performance gap observed in RAID1
configurations. With a synchronous qd1 workload, bitmap writes can
easily account for almost half of the I/O. This could be argued to be
expected, but it is undesirable. Moving the unplug operation to the
periodic delay work seemed to help the situation.

The second patch adds a new superblock field and a new bitmap superblock
version, allowing a new argument to be passed through mdadm that
specifies the number of chunks allowed to be dirty before flushing.

Accompanying this set is an RFC mdadm patch. It lacks documentation,
which will be added in v2 if this changeset is deemed appropriate.

Jonathan Derrick (2):
md/bitmap: Move unplug to daemon thread
md/bitmap: Add chunk-count-based bitmap flushing

drivers/md/md-bitmap.c | 38 +++++++++++++++++++++++++++++++++++---
drivers/md/md-bitmap.h | 5 ++++-
drivers/md/md.h | 1 +
drivers/md/raid1.c | 2 --
drivers/md/raid10.c | 4 ----
5 files changed, 40 insertions(+), 10 deletions(-)

--
2.31.1


2022-10-06 23:04:12

by Jonathan Derrick

Subject: [PATCH 2/2] md/bitmap: Add chunk-count-based bitmap flushing

In addition to the timer, allow bitmap flushing to be controlled by a
counter that tracks the number of dirty chunks and triggers a flush when
it exceeds a user-defined chunk-count threshold.

This introduces a new field to the bitmap superblock and a new superblock
version (6).

Signed-off-by: Jonathan Derrick <[email protected]>
---
drivers/md/md-bitmap.c | 37 ++++++++++++++++++++++++++++++++++---
drivers/md/md-bitmap.h | 5 ++++-
drivers/md/md.h | 1 +
3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 451259b38d25..fa6b3c71c314 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -499,6 +499,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
pr_debug(" state: %08x\n", le32_to_cpu(sb->state));
pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize));
pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+ pr_debug(" flush chunks: %d\n", le32_to_cpu(sb->daemon_flush_chunks));
pr_debug(" sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
@@ -581,6 +582,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
bitmap_super_t *sb;
unsigned long chunksize, daemon_sleep, write_behind;
unsigned long long events;
+ unsigned int daemon_flush_chunks = 0;
int nodes = 0;
unsigned long sectors_reserved = 0;
int err = -EINVAL;
@@ -644,7 +646,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
reason = "bad magic";
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
- le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
+ le32_to_cpu(sb->version) > BITMAP_MAJOR_CHUNKFLUSH)
reason = "unrecognized superblock version";
else if (chunksize < 512)
reason = "bitmap chunksize too small";
@@ -660,6 +662,9 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
goto out;
}

+ if (sb->version == cpu_to_le32(BITMAP_MAJOR_CHUNKFLUSH))
+ daemon_flush_chunks = le32_to_cpu(sb->daemon_flush_chunks);
+
/*
* Setup nodes/clustername only if bitmap version is
* cluster-compatible
@@ -720,6 +725,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
bitmap->events_cleared = bitmap->mddev->events;
bitmap->mddev->bitmap_info.chunksize = chunksize;
bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
+ bitmap->mddev->bitmap_info.daemon_flush_chunks = daemon_flush_chunks;
bitmap->mddev->bitmap_info.max_write_behind = write_behind;
bitmap->mddev->bitmap_info.nodes = nodes;
if (bitmap->mddev->bitmap_info.space == 0 ||
@@ -1218,6 +1224,31 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
sector_t offset, sector_t *blocks,
int create);

+static bool md_daemon_should_sleep(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ struct bitmap_page *bp;
+ unsigned long k, pages;
+ unsigned int count = 0;
+
+ if (time_after(jiffies, bitmap->daemon_lastrun
+ + mddev->bitmap_info.daemon_sleep))
+ return false;
+
+ if (mddev->bitmap_info.daemon_flush_chunks) {
+ bp = bitmap->counts.bp;
+ pages = bitmap->counts.pages;
+ for (k = 0; k < pages; k++)
+ if (bp[k].map && !bp[k].hijacked)
+ count += bp[k].count;
+
+ if (count >= mddev->bitmap_info.daemon_flush_chunks)
+ return false;
+ }
+
+ return true;
+}
+
/*
* bitmap daemon -- periodically wakes up to clean bits and flush pages
* out to disk
@@ -1240,8 +1271,8 @@ void md_bitmap_daemon_work(struct mddev *mddev)
mutex_unlock(&mddev->bitmap_info.mutex);
return;
}
- if (time_before(jiffies, bitmap->daemon_lastrun
- + mddev->bitmap_info.daemon_sleep))
+
+ if (md_daemon_should_sleep(mddev))
goto done;

md_bitmap_unplug(bitmap);
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index cfd7395de8fd..e0aeedbdde17 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -11,10 +11,12 @@
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
* Version 5 is currently set only for clustered devices
+ * Version 6 supports the flush-chunks threshold
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
+#define BITMAP_MAJOR_CHUNKFLUSH 6

/*
* in-memory bitmap:
@@ -135,7 +137,8 @@ typedef struct bitmap_super_s {
* reserved for the bitmap. */
__le32 nodes; /* 68 the maximum number of nodes in cluster. */
__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
- __u8 pad[256 - 136]; /* set to zero */
+ __le32 daemon_flush_chunks; /* 136 dirty chunks between flushes */
+ __u8 pad[256 - 140]; /* set to zero */
} bitmap_super_t;

/* notes:
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b4e2d8b87b61..d25574e46283 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -497,6 +497,7 @@ struct mddev {
struct mutex mutex;
unsigned long chunksize;
unsigned long daemon_sleep; /* how many jiffies between updates? */
+ unsigned int daemon_flush_chunks; /* how many dirty chunks between updates */
unsigned long max_write_behind; /* write-behind mode */
int external;
int nodes; /* Maximum number of nodes in the cluster */
--
2.31.1

2022-10-07 18:23:13

by Song Liu

Subject: Re: [PATCH 2/2] md/bitmap: Add chunk-count-based bitmap flushing

On Thu, Oct 6, 2022 at 3:09 PM Jonathan Derrick
<[email protected]> wrote:

[...]

> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
> index cfd7395de8fd..e0aeedbdde17 100644
> --- a/drivers/md/md-bitmap.h
> +++ b/drivers/md/md-bitmap.h
> @@ -11,10 +11,12 @@
> /* version 4 insists the bitmap is in little-endian order
> * with version 3, it is host-endian which is non-portable
> * Version 5 is currently set only for clustered devices
> + * Version 6 supports the flush-chunks threshold
> */
> #define BITMAP_MAJOR_HI 4
> #define BITMAP_MAJOR_CLUSTERED 5
> #define BITMAP_MAJOR_HOSTENDIAN 3
> +#define BITMAP_MAJOR_CHUNKFLUSH 6
>
> /*
> * in-memory bitmap:
> @@ -135,7 +137,8 @@ typedef struct bitmap_super_s {
> * reserved for the bitmap. */
> __le32 nodes; /* 68 the maximum number of nodes in cluster. */
> __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
> - __u8 pad[256 - 136]; /* set to zero */
> + __le32 daemon_flush_chunks; /* 136 dirty chunks between flushes */
> + __u8 pad[256 - 140]; /* set to zero */
> } bitmap_super_t;

Do we really need this to be persistent? How about we configure it at run
time via a sysfs file?

Also, please share more data on the performance benefit of the set.

Thanks,
Song

>
> /* notes:
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index b4e2d8b87b61..d25574e46283 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -497,6 +497,7 @@ struct mddev {
> struct mutex mutex;
> unsigned long chunksize;
> unsigned long daemon_sleep; /* how many jiffies between updates? */
> + unsigned int daemon_flush_chunks; /* how many dirty chunks between updates */
> unsigned long max_write_behind; /* write-behind mode */
> int external;
> int nodes; /* Maximum number of nodes in the cluster */
> --
> 2.31.1
>

2022-10-07 19:03:07

by Jonathan Derrick

Subject: Re: [PATCH 2/2] md/bitmap: Add chunk-count-based bitmap flushing



On 10/7/2022 11:50 AM, Song Liu wrote:
> On Thu, Oct 6, 2022 at 3:09 PM Jonathan Derrick
> <[email protected]> wrote:
>
> [...]
>
>> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
>> index cfd7395de8fd..e0aeedbdde17 100644
>> --- a/drivers/md/md-bitmap.h
>> +++ b/drivers/md/md-bitmap.h
>> @@ -11,10 +11,12 @@
>> /* version 4 insists the bitmap is in little-endian order
>> * with version 3, it is host-endian which is non-portable
>> * Version 5 is currently set only for clustered devices
>> + * Version 6 supports the flush-chunks threshold
>> */
>> #define BITMAP_MAJOR_HI 4
>> #define BITMAP_MAJOR_CLUSTERED 5
>> #define BITMAP_MAJOR_HOSTENDIAN 3
>> +#define BITMAP_MAJOR_CHUNKFLUSH 6
>>
>> /*
>> * in-memory bitmap:
>> @@ -135,7 +137,8 @@ typedef struct bitmap_super_s {
>> * reserved for the bitmap. */
>> __le32 nodes; /* 68 the maximum number of nodes in cluster. */
>> __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
>> - __u8 pad[256 - 136]; /* set to zero */
>> + __le32 daemon_flush_chunks; /* 136 dirty chunks between flushes */
>> + __u8 pad[256 - 140]; /* set to zero */
>> } bitmap_super_t;
>
> Do we really need this to be persistent? How about we configure it at run
> time via a sysfs file?
>
> Also, please share more data on the performance benefit of the set.
>
> Thanks,
> Song
>
Hi Song,

Patch 1/2 changes default behavior, which patch 2/2 tries to address.
I can change it to be configurable via sysfs instead.
Should there be a default?
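
For illustration only (this is not the actual v2 change; the attribute
name and placement are assumptions), a run-time knob could follow the
pattern of the existing bitmap sysfs entries in md-bitmap.c, using the
md_sysfs_entry/__ATTR machinery already in md.h:

/* Sketch: expose bitmap_info.daemon_flush_chunks at run time, e.g. as
 * /sys/block/mdX/md/bitmap/flush_chunks. The name is illustrative only. */
static ssize_t
flush_chunks_show(struct mddev *mddev, char *page)
{
        return sprintf(page, "%u\n", mddev->bitmap_info.daemon_flush_chunks);
}

static ssize_t
flush_chunks_store(struct mddev *mddev, const char *buf, size_t len)
{
        unsigned int chunks;
        int rv = kstrtouint(buf, 10, &chunks);

        if (rv)
                return rv;
        /* 0 keeps the existing timer-only behavior */
        mddev->bitmap_info.daemon_flush_chunks = chunks;
        return len;
}

static struct md_sysfs_entry bitmap_flush_chunks =
__ATTR(flush_chunks, S_IRUGO|S_IWUSR, flush_chunks_show, flush_chunks_store);

The entry would also need to be added to the bitmap attribute array so it
appears under md/bitmap/. Defaulting the value to 0 (timer-only) would
preserve today's behavior, which ties into the default question above.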


Here are my observations via biosnoop and RAID1, 4M chunksize, 238436 chunks, bitmap=internal
fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --runtime=60
--percentile_list=1.0:25.0:50.0:75.0:90.0:95.0:99.0:99.9:99.99:99.999999:100.0


Default, bitmap updates happened concurrently with I/O:
bw ( KiB/s): min=18690, max=30618, per=99.94%, avg=23822.07, stdev=2522.73, samples=119
iops : min= 4672, max= 7654, avg=5955.20, stdev=630.71, samples=119

TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
38.090366 md0_raid1 4800 nvme6n1 W 40 4096 0.01
38.090423 md0_raid1 4800 nvme3n1 W 40 4096 0.07
38.090442 md0_raid1 4800 nvme3n1 W 1016633184 4096 0.01
38.090439 md0_raid1 4800 nvme6n1 W 1016633184 4096 0.01
38.090479 md0_raid1 4800 nvme6n1 W 56 4096 0.01
38.090493 md0_raid1 4800 nvme6n1 W 1449894256 4096 0.01
38.090477 md0_raid1 4800 nvme3n1 W 56 4096 0.01
38.090496 md0_raid1 4800 nvme3n1 W 1449894256 4096 0.01
38.090530 md0_raid1 4800 nvme3n1 W 16 4096 0.01
38.090555 md0_raid1 4800 nvme3n1 W 110493568 4096 0.01
38.090538 md0_raid1 4800 nvme6n1 W 16 4096 0.01
38.090551 md0_raid1 4800 nvme6n1 W 110493568 4096 0.01
38.090596 md0_raid1 4800 nvme6n1 W 56 4096 0.01
38.090647 md0_raid1 4800 nvme3n1 W 56 4096 0.06
38.090666 md0_raid1 4800 nvme3n1 W 1455846976 4096 0.01
38.090663 md0_raid1 4800 nvme6n1 W 1455846976 4096 0.01
38.090707 md0_raid1 4800 nvme6n1 W 64 4096 0.01
38.090699 md0_raid1 4800 nvme3n1 W 64 4096 0.01
38.090723 md0_raid1 4800 nvme3n1 W 1665013728 4096 0.01
38.090720 md0_raid1 4800 nvme6n1 W 1665013728 4096 0.01
38.090764 md0_raid1 4800 nvme6n1 W 64 4096 0.01
38.090812 md0_raid1 4800 nvme3n1 W 64 4096 0.06
38.090832 md0_raid1 4800 nvme3n1 W 1637994296 4096 0.01
38.090828 md0_raid1 4800 nvme6n1 W 1637994296 4096 0.01




With patch 1/2, bitmaps only update on the 'delay' parameter (default 5s):
bw ( KiB/s): min=135712, max=230938, per=100.00%, avg=209308.56, stdev=29254.31, samples=119
iops : min=33928, max=57734, avg=52326.78, stdev=7313.57, samples=119

TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
16.292235 md0_raid1 4841 nvme6n1 W 297367432 4096 0.01
16.292258 md0_raid1 4841 nvme6n1 W 16 4096 0.01
16.292266 md0_raid1 4841 nvme6n1 W 24 4096 0.01
16.292277 md0_raid1 4841 nvme6n1 W 32 4096 0.01
16.292259 md0_raid1 4841 nvme3n1 W 16 4096 0.01
16.292280 md0_raid1 4841 nvme3n1 W 32 4096 0.01
16.292305 md0_raid1 4841 nvme3n1 W 56 4096 0.01
16.292286 md0_raid1 4841 nvme6n1 W 40 4096 0.01
16.292295 md0_raid1 4841 nvme6n1 W 48 4096 0.01
16.292326 md0_raid1 4841 nvme3n1 W 72 1536 0.01
16.292323 md0_raid1 4841 nvme6n1 W 64 4096 0.02
16.292326 md0_raid1 4841 nvme6n1 W 56 4096 0.03
16.292334 md0_raid1 4841 nvme6n1 W 72 1536 0.02
16.300697 md0_raid1 4841 nvme3n1 W 1297533744 4096 0.01
16.300702 md0_raid1 4841 nvme6n1 W 1297533744 4096 0.01
16.300803 md0_raid1 4841 nvme6n1 W 1649080856 4096 0.01
16.300798 md0_raid1 4841 nvme3n1 W 1649080856 4096 0.01
16.300823 md0_raid1 4841 nvme3n1 W 1539317792 4096 0.01
16.300845 md0_raid1 4841 nvme3n1 W 1634570232 4096 0.01
16.300867 md0_raid1 4841 nvme3n1 W 579232208 4096 0.01
16.300889 md0_raid1 4841 nvme3n1 W 1818140424 4096 0.01
16.300922 md0_raid1 4841 nvme3n1 W 412971920 4096 0.02
...
21.293225 md0_raid1 4841 nvme3n1 W 1279122360 4096 0.01
21.293242 md0_raid1 4841 nvme3n1 W 40326272 4096 0.01
21.293223 md0_raid1 4841 nvme6n1 W 1279122360 4096 0.01
21.293243 md0_raid1 4841 nvme6n1 W 40326272 4096 0.01
21.293261 md0_raid1 4841 nvme6n1 W 16 4096 0.01
21.293266 md0_raid1 4841 nvme6n1 W 24 4096 0.01
21.293271 md0_raid1 4841 nvme6n1 W 32 4096 0.01
21.293275 md0_raid1 4841 nvme3n1 W 32 4096 0.01
21.293292 md0_raid1 4841 nvme3n1 W 48 4096 0.01
21.293296 md0_raid1 4841 nvme3n1 W 56 4096 0.01
21.293309 md0_raid1 4841 nvme3n1 W 72 1536 0.01
21.293266 md0_raid1 4841 nvme3n1 W 24 4096 0.01
21.293326 md0_raid1 4841 nvme6n1 W 48 4096 0.05
21.293328 md0_raid1 4841 nvme6n1 W 40 4096 0.06
21.293331 md0_raid1 4841 nvme6n1 W 72 1536 0.03
21.293333 md0_raid1 4841 nvme6n1 W 64 4096 0.04
21.293334 md0_raid1 4841 nvme6n1 W 56 4096 0.05
21.298526 md0_raid1 4841 nvme3n1 W 681973000 4096 0.01




Good, but with the granularity of N seconds, it might be too infrequent.
Here is chunk-flush=512 (2GB threshold in 4MB chunk size):
bw ( KiB/s): min=92692, max=134904, per=100.00%, avg=125127.43, stdev=6758.51, samples=119
iops : min=23173, max=33726, avg=31281.55, stdev=1689.63, samples=119

TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
13.193339 md0_raid1 5972 nvme6n1 W 16 4096 0.01
13.193344 md0_raid1 5972 nvme6n1 W 32 4096 0.01
13.193346 md0_raid1 5972 nvme6n1 W 24 4096 0.01
13.193350 md0_raid1 5972 nvme6n1 W 40 4096 0.01
13.193356 md0_raid1 5972 nvme6n1 W 48 4096 0.01
13.193361 md0_raid1 5972 nvme6n1 W 64 4096 0.01
13.193363 md0_raid1 5972 nvme6n1 W 56 4096 0.01
13.193555 md0_raid1 5972 nvme6n1 W 72 1536 0.20
13.193289 md0_raid1 5972 nvme3n1 W 1912285848 4096 0.01
13.193306 md0_raid1 5972 nvme3n1 W 836455896 4096 0.01
13.193323 md0_raid1 5972 nvme3n1 W 233728136 4096 0.01
13.193339 md0_raid1 5972 nvme3n1 W 16 4096 0.01
13.193344 md0_raid1 5972 nvme3n1 W 24 4096 0.01
13.193362 md0_raid1 5972 nvme3n1 W 48 4096 0.01
13.193365 md0_raid1 5972 nvme3n1 W 64 4096 0.01
13.193366 md0_raid1 5972 nvme3n1 W 56 4096 0.01
13.193574 md0_raid1 5972 nvme3n1 W 72 1536 0.21
13.196759 md0_raid1 5972 nvme3n1 W 89571592 4096 0.01
13.196810 md0_raid1 5972 nvme6n1 W 89571592 4096 0.06
13.196913 md0_raid1 5972 nvme6n1 W 16 4096 0.01
13.196910 md0_raid1 5972 nvme3n1 W 16 4096 0.01
13.199444 md0_raid1 5972 nvme3n1 W 64 4096 0.01
13.199447 md0_raid1 5972 nvme3n1 W 137126232 4096 0.01
13.199515 md0_raid1 5972 nvme6n1 W 137126232 4096 0.08
13.199519 md0_raid1 5972 nvme6n1 W 64 4096 0.08
13.199617 md0_raid1 5972 nvme6n1 W 1216062808 4096 0.01
... (508 ios later)
13.208764 md0_raid1 5972 nvme6n1 W 16 4096 0.01
13.208768 md0_raid1 5972 nvme6n1 W 32 4096 0.01
13.208770 md0_raid1 5972 nvme6n1 W 24 4096 0.01
13.208775 md0_raid1 5972 nvme6n1 W 40 4096 0.01
13.208781 md0_raid1 5972 nvme6n1 W 48 4096 0.01
13.208786 md0_raid1 5972 nvme6n1 W 56 4096 0.01
13.208790 md0_raid1 5972 nvme6n1 W 64 4096 0.01
13.208729 md0_raid1 5972 nvme3n1 W 1607847808 4096 0.01
13.208747 md0_raid1 5972 nvme3n1 W 371214368 4096 0.01
13.208770 md0_raid1 5972 nvme3n1 W 32 4096 0.01
13.208789 md0_raid1 5972 nvme3n1 W 64 4096 0.01
13.208952 md0_raid1 5972 nvme6n1 W 72 1536 0.17
13.209079 md0_raid1 5972 nvme3n1 W 72 1536 0.29
13.212216 md0_raid1 5972 nvme3n1 W 1146106480 4096 0.01
13.212269 md0_raid1 5972 nvme6n1 W 1146106480 4096 0.06
13.212368 md0_raid1 5972 nvme6n1 W 16 4096 0.01
13.212365 md0_raid1 5972 nvme3n1 W 16 4096 0.01


Without 1/2: 6k iops
With 1/2: 52k iops
With 2/2 params as above: 31k iops

The count calculation could use some improvement to close the iops gap to delay-based flushing
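
One possible direction, shown here only as a userspace toy model (the
names and numbers are illustrative and not part of this set), is to keep
a running dirty-chunk count as chunks transition between clean and
in-use, so the daemon check does not have to walk every counter page:

/* Toy model: incremental dirty-chunk accounting vs. a full page scan. */
#include <stdbool.h>
#include <stdio.h>

#define NCHUNKS 238436U                 /* matches the array above */

static unsigned short counter[NCHUNKS]; /* per-chunk write counters */
static unsigned int dirty_chunks;       /* maintained incrementally */

static void start_write(unsigned int chunk)
{
        if (counter[chunk]++ == 0)      /* clean -> dirty transition */
                dirty_chunks++;
}

static void daemon_clean(unsigned int chunk)
{
        if (counter[chunk] && --counter[chunk] == 0)
                dirty_chunks--;         /* dirty -> clean transition */
}

static bool should_flush(unsigned int threshold)
{
        return threshold && dirty_chunks >= threshold;  /* O(1) check */
}

int main(void)
{
        for (unsigned int i = 0; i < 600; i++)
                start_write(i % 520);   /* 520 distinct chunks touched */
        printf("dirty=%u flush=%d\n", dirty_chunks, should_flush(512));
        return 0;
}

A kernel version would of course need to handle the hijacked-page case
that the current scan skips.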

>>
>> /* notes:
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index b4e2d8b87b61..d25574e46283 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -497,6 +497,7 @@ struct mddev {
>> struct mutex mutex;
>> unsigned long chunksize;
>> unsigned long daemon_sleep; /* how many jiffies between updates? */
>> + unsigned int daemon_flush_chunks; /* how many dirty chunks between updates */
>> unsigned long max_write_behind; /* write-behind mode */
>> int external;
>> int nodes; /* Maximum number of nodes in the cluster */
>> --
>> 2.31.1
>>

2022-10-10 18:32:42

by Song Liu

Subject: Re: [PATCH 2/2] md/bitmap: Add chunk-count-based bitmap flushing

On Fri, Oct 7, 2022 at 11:58 AM Jonathan Derrick
<[email protected]> wrote:
>
>
>
> On 10/7/2022 11:50 AM, Song Liu wrote:
> > On Thu, Oct 6, 2022 at 3:09 PM Jonathan Derrick
> > <[email protected]> wrote:
> >
> > [...]
> >
> >> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
> >> index cfd7395de8fd..e0aeedbdde17 100644
> >> --- a/drivers/md/md-bitmap.h
> >> +++ b/drivers/md/md-bitmap.h
> >> @@ -11,10 +11,12 @@
> >> /* version 4 insists the bitmap is in little-endian order
> >> * with version 3, it is host-endian which is non-portable
> >> * Version 5 is currently set only for clustered devices
> >> + * Version 6 supports the flush-chunks threshold
> >> */
> >> #define BITMAP_MAJOR_HI 4
> >> #define BITMAP_MAJOR_CLUSTERED 5
> >> #define BITMAP_MAJOR_HOSTENDIAN 3
> >> +#define BITMAP_MAJOR_CHUNKFLUSH 6
> >>
> >> /*
> >> * in-memory bitmap:
> >> @@ -135,7 +137,8 @@ typedef struct bitmap_super_s {
> >> * reserved for the bitmap. */
> >> __le32 nodes; /* 68 the maximum number of nodes in cluster. */
> >> __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
> >> - __u8 pad[256 - 136]; /* set to zero */
> >> + __le32 daemon_flush_chunks; /* 136 dirty chunks between flushes */
> >> + __u8 pad[256 - 140]; /* set to zero */
> >> } bitmap_super_t;
> >
> > Do we really need this to be persistent? How about we configure it at run
> > time via a sysfs file?
> >
> > Also, please share more data on the performance benefit of the set.
> >
> > Thanks,
> > Song
> >
> Hi Song,
>
> Patch 1/2 changes default behavior, which patch 2/2 tries to address.

Have you tried to evaluate the impact on the accuracy of the bitmap?
Specifically, if we power off the system during writes, do we see data
or parity mismatch that is not covered by the bitmap?

> I can change it to be configurable via sysfs instead.
> Should there be a default?

If there is any impact on bitmap accuracy, I think the default should
work identically to how it worked before the set. IOW, we should not
delay the bitmap update.

Thanks,
Song

>
>
> Here are my observations via biosnoop and RAID1, 4M chunksize, 238436 chunks, bitmap=internal
> fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --runtime=60
> --percentile_list=1.0:25.0:50.0:75.0:90.0:95.0:99.0:99.9:99.99:99.999999:100.0
>
>
> Default, bitmap updates happened concurrently with I/O:
> bw ( KiB/s): min=18690, max=30618, per=99.94%, avg=23822.07, stdev=2522.73, samples=119
> iops : min= 4672, max= 7654, avg=5955.20, stdev=630.71, samples=119
>
> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
> 38.090366 md0_raid1 4800 nvme6n1 W 40 4096 0.01
> 38.090423 md0_raid1 4800 nvme3n1 W 40 4096 0.07
> 38.090442 md0_raid1 4800 nvme3n1 W 1016633184 4096 0.01
> 38.090439 md0_raid1 4800 nvme6n1 W 1016633184 4096 0.01
> 38.090479 md0_raid1 4800 nvme6n1 W 56 4096 0.01
> 38.090493 md0_raid1 4800 nvme6n1 W 1449894256 4096 0.01
> 38.090477 md0_raid1 4800 nvme3n1 W 56 4096 0.01
> 38.090496 md0_raid1 4800 nvme3n1 W 1449894256 4096 0.01
> 38.090530 md0_raid1 4800 nvme3n1 W 16 4096 0.01
> 38.090555 md0_raid1 4800 nvme3n1 W 110493568 4096 0.01
> 38.090538 md0_raid1 4800 nvme6n1 W 16 4096 0.01
> 38.090551 md0_raid1 4800 nvme6n1 W 110493568 4096 0.01
> 38.090596 md0_raid1 4800 nvme6n1 W 56 4096 0.01
> 38.090647 md0_raid1 4800 nvme3n1 W 56 4096 0.06
> 38.090666 md0_raid1 4800 nvme3n1 W 1455846976 4096 0.01
> 38.090663 md0_raid1 4800 nvme6n1 W 1455846976 4096 0.01
> 38.090707 md0_raid1 4800 nvme6n1 W 64 4096 0.01
> 38.090699 md0_raid1 4800 nvme3n1 W 64 4096 0.01
> 38.090723 md0_raid1 4800 nvme3n1 W 1665013728 4096 0.01
> 38.090720 md0_raid1 4800 nvme6n1 W 1665013728 4096 0.01
> 38.090764 md0_raid1 4800 nvme6n1 W 64 4096 0.01
> 38.090812 md0_raid1 4800 nvme3n1 W 64 4096 0.06
> 38.090832 md0_raid1 4800 nvme3n1 W 1637994296 4096 0.01
> 38.090828 md0_raid1 4800 nvme6n1 W 1637994296 4096 0.01
>
>
>
>
> With patch 1/2, bitmaps only update on the 'delay' parameter (default 5s):
> bw ( KiB/s): min=135712, max=230938, per=100.00%, avg=209308.56, stdev=29254.31, samples=119
> iops : min=33928, max=57734, avg=52326.78, stdev=7313.57, samples=119
>
> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
> 16.292235 md0_raid1 4841 nvme6n1 W 297367432 4096 0.01
> 16.292258 md0_raid1 4841 nvme6n1 W 16 4096 0.01
> 16.292266 md0_raid1 4841 nvme6n1 W 24 4096 0.01
> 16.292277 md0_raid1 4841 nvme6n1 W 32 4096 0.01
> 16.292259 md0_raid1 4841 nvme3n1 W 16 4096 0.01
> 16.292280 md0_raid1 4841 nvme3n1 W 32 4096 0.01
> 16.292305 md0_raid1 4841 nvme3n1 W 56 4096 0.01
> 16.292286 md0_raid1 4841 nvme6n1 W 40 4096 0.01
> 16.292295 md0_raid1 4841 nvme6n1 W 48 4096 0.01
> 16.292326 md0_raid1 4841 nvme3n1 W 72 1536 0.01
> 16.292323 md0_raid1 4841 nvme6n1 W 64 4096 0.02
> 16.292326 md0_raid1 4841 nvme6n1 W 56 4096 0.03
> 16.292334 md0_raid1 4841 nvme6n1 W 72 1536 0.02
> 16.300697 md0_raid1 4841 nvme3n1 W 1297533744 4096 0.01
> 16.300702 md0_raid1 4841 nvme6n1 W 1297533744 4096 0.01
> 16.300803 md0_raid1 4841 nvme6n1 W 1649080856 4096 0.01
> 16.300798 md0_raid1 4841 nvme3n1 W 1649080856 4096 0.01
> 16.300823 md0_raid1 4841 nvme3n1 W 1539317792 4096 0.01
> 16.300845 md0_raid1 4841 nvme3n1 W 1634570232 4096 0.01
> 16.300867 md0_raid1 4841 nvme3n1 W 579232208 4096 0.01
> 16.300889 md0_raid1 4841 nvme3n1 W 1818140424 4096 0.01
> 16.300922 md0_raid1 4841 nvme3n1 W 412971920 4096 0.02
> ...
> 21.293225 md0_raid1 4841 nvme3n1 W 1279122360 4096 0.01
> 21.293242 md0_raid1 4841 nvme3n1 W 40326272 4096 0.01
> 21.293223 md0_raid1 4841 nvme6n1 W 1279122360 4096 0.01
> 21.293243 md0_raid1 4841 nvme6n1 W 40326272 4096 0.01
> 21.293261 md0_raid1 4841 nvme6n1 W 16 4096 0.01
> 21.293266 md0_raid1 4841 nvme6n1 W 24 4096 0.01
> 21.293271 md0_raid1 4841 nvme6n1 W 32 4096 0.01
> 21.293275 md0_raid1 4841 nvme3n1 W 32 4096 0.01
> 21.293292 md0_raid1 4841 nvme3n1 W 48 4096 0.01
> 21.293296 md0_raid1 4841 nvme3n1 W 56 4096 0.01
> 21.293309 md0_raid1 4841 nvme3n1 W 72 1536 0.01
> 21.293266 md0_raid1 4841 nvme3n1 W 24 4096 0.01
> 21.293326 md0_raid1 4841 nvme6n1 W 48 4096 0.05
> 21.293328 md0_raid1 4841 nvme6n1 W 40 4096 0.06
> 21.293331 md0_raid1 4841 nvme6n1 W 72 1536 0.03
> 21.293333 md0_raid1 4841 nvme6n1 W 64 4096 0.04
> 21.293334 md0_raid1 4841 nvme6n1 W 56 4096 0.05
> 21.298526 md0_raid1 4841 nvme3n1 W 681973000 4096 0.01
>
>
>
>
> Good, but with the granularity of N seconds, it might be too infrequent.
> Here is chunk-flush=512 (2GB threshold in 4MB chunk size):
> bw ( KiB/s): min=92692, max=134904, per=100.00%, avg=125127.43, stdev=6758.51, samples=119
> iops : min=23173, max=33726, avg=31281.55, stdev=1689.63, samples=119
>
> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
> 13.193339 md0_raid1 5972 nvme6n1 W 16 4096 0.01
> 13.193344 md0_raid1 5972 nvme6n1 W 32 4096 0.01
> 13.193346 md0_raid1 5972 nvme6n1 W 24 4096 0.01
> 13.193350 md0_raid1 5972 nvme6n1 W 40 4096 0.01
> 13.193356 md0_raid1 5972 nvme6n1 W 48 4096 0.01
> 13.193361 md0_raid1 5972 nvme6n1 W 64 4096 0.01
> 13.193363 md0_raid1 5972 nvme6n1 W 56 4096 0.01
> 13.193555 md0_raid1 5972 nvme6n1 W 72 1536 0.20
> 13.193289 md0_raid1 5972 nvme3n1 W 1912285848 4096 0.01
> 13.193306 md0_raid1 5972 nvme3n1 W 836455896 4096 0.01
> 13.193323 md0_raid1 5972 nvme3n1 W 233728136 4096 0.01
> 13.193339 md0_raid1 5972 nvme3n1 W 16 4096 0.01
> 13.193344 md0_raid1 5972 nvme3n1 W 24 4096 0.01
> 13.193362 md0_raid1 5972 nvme3n1 W 48 4096 0.01
> 13.193365 md0_raid1 5972 nvme3n1 W 64 4096 0.01
> 13.193366 md0_raid1 5972 nvme3n1 W 56 4096 0.01
> 13.193574 md0_raid1 5972 nvme3n1 W 72 1536 0.21
> 13.196759 md0_raid1 5972 nvme3n1 W 89571592 4096 0.01
> 13.196810 md0_raid1 5972 nvme6n1 W 89571592 4096 0.06
> 13.196913 md0_raid1 5972 nvme6n1 W 16 4096 0.01
> 13.196910 md0_raid1 5972 nvme3n1 W 16 4096 0.01
> 13.199444 md0_raid1 5972 nvme3n1 W 64 4096 0.01
> 13.199447 md0_raid1 5972 nvme3n1 W 137126232 4096 0.01
> 13.199515 md0_raid1 5972 nvme6n1 W 137126232 4096 0.08
> 13.199519 md0_raid1 5972 nvme6n1 W 64 4096 0.08
> 13.199617 md0_raid1 5972 nvme6n1 W 1216062808 4096 0.01
> ... (508 ios later)
> 13.208764 md0_raid1 5972 nvme6n1 W 16 4096 0.01
> 13.208768 md0_raid1 5972 nvme6n1 W 32 4096 0.01
> 13.208770 md0_raid1 5972 nvme6n1 W 24 4096 0.01
> 13.208775 md0_raid1 5972 nvme6n1 W 40 4096 0.01
> 13.208781 md0_raid1 5972 nvme6n1 W 48 4096 0.01
> 13.208786 md0_raid1 5972 nvme6n1 W 56 4096 0.01
> 13.208790 md0_raid1 5972 nvme6n1 W 64 4096 0.01
> 13.208729 md0_raid1 5972 nvme3n1 W 1607847808 4096 0.01
> 13.208747 md0_raid1 5972 nvme3n1 W 371214368 4096 0.01
> 13.208770 md0_raid1 5972 nvme3n1 W 32 4096 0.01
> 13.208789 md0_raid1 5972 nvme3n1 W 64 4096 0.01
> 13.208952 md0_raid1 5972 nvme6n1 W 72 1536 0.17
> 13.209079 md0_raid1 5972 nvme3n1 W 72 1536 0.29
> 13.212216 md0_raid1 5972 nvme3n1 W 1146106480 4096 0.01
> 13.212269 md0_raid1 5972 nvme6n1 W 1146106480 4096 0.06
> 13.212368 md0_raid1 5972 nvme6n1 W 16 4096 0.01
> 13.212365 md0_raid1 5972 nvme3n1 W 16 4096 0.01
>
>
> Without 1/2: 6k iops
> With 1/2: 52k iops
> With 2/2 params as above: 31k iops
>
> The count calculation could use some improvement to close the iops gap to delay-based flushing
>
> >>
> >> /* notes:
> >> diff --git a/drivers/md/md.h b/drivers/md/md.h
> >> index b4e2d8b87b61..d25574e46283 100644
> >> --- a/drivers/md/md.h
> >> +++ b/drivers/md/md.h
> >> @@ -497,6 +497,7 @@ struct mddev {
> >> struct mutex mutex;
> >> unsigned long chunksize;
> >> unsigned long daemon_sleep; /* how many jiffies between updates? */
> >> + unsigned int daemon_flush_chunks; /* how many dirty chunks between updates */
> >> unsigned long max_write_behind; /* write-behind mode */
> >> int external;
> >> int nodes; /* Maximum number of nodes in the cluster */
> >> --
> >> 2.31.1
> >>

2022-10-13 22:27:03

by Jonathan Derrick

Subject: Re: [PATCH 2/2] md/bitmap: Add chunk-count-based bitmap flushing



On 10/10/2022 12:18 PM, Song Liu wrote:
> On Fri, Oct 7, 2022 at 11:58 AM Jonathan Derrick
> <[email protected]> wrote:
>>
>>
>>
>> On 10/7/2022 11:50 AM, Song Liu wrote:
>>> On Thu, Oct 6, 2022 at 3:09 PM Jonathan Derrick
>>> <[email protected]> wrote:
>>>
>>> [...]
>>>
>>>> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
>>>> index cfd7395de8fd..e0aeedbdde17 100644
>>>> --- a/drivers/md/md-bitmap.h
>>>> +++ b/drivers/md/md-bitmap.h
>>>> @@ -11,10 +11,12 @@
>>>> /* version 4 insists the bitmap is in little-endian order
>>>> * with version 3, it is host-endian which is non-portable
>>>> * Version 5 is currently set only for clustered devices
>>>> + * Version 6 supports the flush-chunks threshold
>>>> */
>>>> #define BITMAP_MAJOR_HI 4
>>>> #define BITMAP_MAJOR_CLUSTERED 5
>>>> #define BITMAP_MAJOR_HOSTENDIAN 3
>>>> +#define BITMAP_MAJOR_CHUNKFLUSH 6
>>>>
>>>> /*
>>>> * in-memory bitmap:
>>>> @@ -135,7 +137,8 @@ typedef struct bitmap_super_s {
>>>> * reserved for the bitmap. */
>>>> __le32 nodes; /* 68 the maximum number of nodes in cluster. */
>>>> __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
>>>> - __u8 pad[256 - 136]; /* set to zero */
>>>> + __le32 daemon_flush_chunks; /* 136 dirty chunks between flushes */
>>>> + __u8 pad[256 - 140]; /* set to zero */
>>>> } bitmap_super_t;
>>>
>>> Do we really need this to be persistent? How about we configure it at run
>>> time via a sysfs file?
>>>
>>> Also, please share more data on the performance benefit of the set.
>>>
>>> Thanks,
>>> Song
>>>
>> Hi Song,
>>
>> Patch 1/2 changes default behavior, which patch 2/2 tries to address.
>
> Have you tried to evaluate the impact on the accuracy of the bitmap?
> Specifically, if we power off the system during writes, do we see data
> or parity mismatch that is not covered by the bitmap?
Fair. I'm assuming this has to do with md_bitmap_init_from_disk()'s
outofdate BITMAP_STALE check? My patch 1/2 would likely guarantee
a full resync unless the system was lost just after the daemon wake
time. However, patch 2/2 increases the likelihood of reading a good
bitmap.


>
>> I can change it to be configurable via sysfs instead.
>> Should there be a default?
>
> If there is any impact on bitmap accuracy, I think the default should
> work identically to how it worked before the set. IOW, we should not
> delay the bitmap update.
With results like mine, I'm under the impression bitmap=internal is not
regularly used for write-heavy workloads [1].

The thing is that it's not very consistent right now. I've had runs
where the bitmap isn't updated for minutes until the run ends, and
most runs where it's updated every other I/O or so. It seems to depend
on the number of chunks relative to the device size (i.e., whether it
can fit in a single page).

I have v2 coming which should help fix a few of these inconsistencies.

[1] Similar results https://blog.liw.fi/posts/write-intent-bitmaps/

>
> Thanks,
> Song
>
>>
>>
>> Here are my observations via biosnoop and RAID1, 4M chunksize, 238436 chunks, bitmap=internal
>> fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --runtime=60
>> --percentile_list=1.0:25.0:50.0:75.0:90.0:95.0:99.0:99.9:99.99:99.999999:100.0
>>
>>
>> Default, bitmap updates happened concurrently with I/O:
>> bw ( KiB/s): min=18690, max=30618, per=99.94%, avg=23822.07, stdev=2522.73, samples=119
>> iops : min= 4672, max= 7654, avg=5955.20, stdev=630.71, samples=119
>>
>> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
>> 38.090366 md0_raid1 4800 nvme6n1 W 40 4096 0.01
>> 38.090423 md0_raid1 4800 nvme3n1 W 40 4096 0.07
>> 38.090442 md0_raid1 4800 nvme3n1 W 1016633184 4096 0.01
>> 38.090439 md0_raid1 4800 nvme6n1 W 1016633184 4096 0.01
>> 38.090479 md0_raid1 4800 nvme6n1 W 56 4096 0.01
>> 38.090493 md0_raid1 4800 nvme6n1 W 1449894256 4096 0.01
>> 38.090477 md0_raid1 4800 nvme3n1 W 56 4096 0.01
>> 38.090496 md0_raid1 4800 nvme3n1 W 1449894256 4096 0.01
>> 38.090530 md0_raid1 4800 nvme3n1 W 16 4096 0.01
>> 38.090555 md0_raid1 4800 nvme3n1 W 110493568 4096 0.01
>> 38.090538 md0_raid1 4800 nvme6n1 W 16 4096 0.01
>> 38.090551 md0_raid1 4800 nvme6n1 W 110493568 4096 0.01
>> 38.090596 md0_raid1 4800 nvme6n1 W 56 4096 0.01
>> 38.090647 md0_raid1 4800 nvme3n1 W 56 4096 0.06
>> 38.090666 md0_raid1 4800 nvme3n1 W 1455846976 4096 0.01
>> 38.090663 md0_raid1 4800 nvme6n1 W 1455846976 4096 0.01
>> 38.090707 md0_raid1 4800 nvme6n1 W 64 4096 0.01
>> 38.090699 md0_raid1 4800 nvme3n1 W 64 4096 0.01
>> 38.090723 md0_raid1 4800 nvme3n1 W 1665013728 4096 0.01
>> 38.090720 md0_raid1 4800 nvme6n1 W 1665013728 4096 0.01
>> 38.090764 md0_raid1 4800 nvme6n1 W 64 4096 0.01
>> 38.090812 md0_raid1 4800 nvme3n1 W 64 4096 0.06
>> 38.090832 md0_raid1 4800 nvme3n1 W 1637994296 4096 0.01
>> 38.090828 md0_raid1 4800 nvme6n1 W 1637994296 4096 0.01
>>
>>
>>
>>
>> With patch 1/2, bitmaps only update on the 'delay' parameter (default 5s):
>> bw ( KiB/s): min=135712, max=230938, per=100.00%, avg=209308.56, stdev=29254.31, samples=119
>> iops : min=33928, max=57734, avg=52326.78, stdev=7313.57, samples=119
>>
>> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
>> 16.292235 md0_raid1 4841 nvme6n1 W 297367432 4096 0.01
>> 16.292258 md0_raid1 4841 nvme6n1 W 16 4096 0.01
>> 16.292266 md0_raid1 4841 nvme6n1 W 24 4096 0.01
>> 16.292277 md0_raid1 4841 nvme6n1 W 32 4096 0.01
>> 16.292259 md0_raid1 4841 nvme3n1 W 16 4096 0.01
>> 16.292280 md0_raid1 4841 nvme3n1 W 32 4096 0.01
>> 16.292305 md0_raid1 4841 nvme3n1 W 56 4096 0.01
>> 16.292286 md0_raid1 4841 nvme6n1 W 40 4096 0.01
>> 16.292295 md0_raid1 4841 nvme6n1 W 48 4096 0.01
>> 16.292326 md0_raid1 4841 nvme3n1 W 72 1536 0.01
>> 16.292323 md0_raid1 4841 nvme6n1 W 64 4096 0.02
>> 16.292326 md0_raid1 4841 nvme6n1 W 56 4096 0.03
>> 16.292334 md0_raid1 4841 nvme6n1 W 72 1536 0.02
>> 16.300697 md0_raid1 4841 nvme3n1 W 1297533744 4096 0.01
>> 16.300702 md0_raid1 4841 nvme6n1 W 1297533744 4096 0.01
>> 16.300803 md0_raid1 4841 nvme6n1 W 1649080856 4096 0.01
>> 16.300798 md0_raid1 4841 nvme3n1 W 1649080856 4096 0.01
>> 16.300823 md0_raid1 4841 nvme3n1 W 1539317792 4096 0.01
>> 16.300845 md0_raid1 4841 nvme3n1 W 1634570232 4096 0.01
>> 16.300867 md0_raid1 4841 nvme3n1 W 579232208 4096 0.01
>> 16.300889 md0_raid1 4841 nvme3n1 W 1818140424 4096 0.01
>> 16.300922 md0_raid1 4841 nvme3n1 W 412971920 4096 0.02
>> ...
>> 21.293225 md0_raid1 4841 nvme3n1 W 1279122360 4096 0.01
>> 21.293242 md0_raid1 4841 nvme3n1 W 40326272 4096 0.01
>> 21.293223 md0_raid1 4841 nvme6n1 W 1279122360 4096 0.01
>> 21.293243 md0_raid1 4841 nvme6n1 W 40326272 4096 0.01
>> 21.293261 md0_raid1 4841 nvme6n1 W 16 4096 0.01
>> 21.293266 md0_raid1 4841 nvme6n1 W 24 4096 0.01
>> 21.293271 md0_raid1 4841 nvme6n1 W 32 4096 0.01
>> 21.293275 md0_raid1 4841 nvme3n1 W 32 4096 0.01
>> 21.293292 md0_raid1 4841 nvme3n1 W 48 4096 0.01
>> 21.293296 md0_raid1 4841 nvme3n1 W 56 4096 0.01
>> 21.293309 md0_raid1 4841 nvme3n1 W 72 1536 0.01
>> 21.293266 md0_raid1 4841 nvme3n1 W 24 4096 0.01
>> 21.293326 md0_raid1 4841 nvme6n1 W 48 4096 0.05
>> 21.293328 md0_raid1 4841 nvme6n1 W 40 4096 0.06
>> 21.293331 md0_raid1 4841 nvme6n1 W 72 1536 0.03
>> 21.293333 md0_raid1 4841 nvme6n1 W 64 4096 0.04
>> 21.293334 md0_raid1 4841 nvme6n1 W 56 4096 0.05
>> 21.298526 md0_raid1 4841 nvme3n1 W 681973000 4096 0.01
>>
>>
>>
>>
>> Good, but with the granularity of N seconds, it might be too infrequent.
>> Here is chunk-flush=512 (2GB threshold in 4MB chunk size):
>> bw ( KiB/s): min=92692, max=134904, per=100.00%, avg=125127.43, stdev=6758.51, samples=119
>> iops : min=23173, max=33726, avg=31281.55, stdev=1689.63, samples=119
>>
>> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
>> 13.193339 md0_raid1 5972 nvme6n1 W 16 4096 0.01
>> 13.193344 md0_raid1 5972 nvme6n1 W 32 4096 0.01
>> 13.193346 md0_raid1 5972 nvme6n1 W 24 4096 0.01
>> 13.193350 md0_raid1 5972 nvme6n1 W 40 4096 0.01
>> 13.193356 md0_raid1 5972 nvme6n1 W 48 4096 0.01
>> 13.193361 md0_raid1 5972 nvme6n1 W 64 4096 0.01
>> 13.193363 md0_raid1 5972 nvme6n1 W 56 4096 0.01
>> 13.193555 md0_raid1 5972 nvme6n1 W 72 1536 0.20
>> 13.193289 md0_raid1 5972 nvme3n1 W 1912285848 4096 0.01
>> 13.193306 md0_raid1 5972 nvme3n1 W 836455896 4096 0.01
>> 13.193323 md0_raid1 5972 nvme3n1 W 233728136 4096 0.01
>> 13.193339 md0_raid1 5972 nvme3n1 W 16 4096 0.01
>> 13.193344 md0_raid1 5972 nvme3n1 W 24 4096 0.01
>> 13.193362 md0_raid1 5972 nvme3n1 W 48 4096 0.01
>> 13.193365 md0_raid1 5972 nvme3n1 W 64 4096 0.01
>> 13.193366 md0_raid1 5972 nvme3n1 W 56 4096 0.01
>> 13.193574 md0_raid1 5972 nvme3n1 W 72 1536 0.21
>> 13.196759 md0_raid1 5972 nvme3n1 W 89571592 4096 0.01
>> 13.196810 md0_raid1 5972 nvme6n1 W 89571592 4096 0.06
>> 13.196913 md0_raid1 5972 nvme6n1 W 16 4096 0.01
>> 13.196910 md0_raid1 5972 nvme3n1 W 16 4096 0.01
>> 13.199444 md0_raid1 5972 nvme3n1 W 64 4096 0.01
>> 13.199447 md0_raid1 5972 nvme3n1 W 137126232 4096 0.01
>> 13.199515 md0_raid1 5972 nvme6n1 W 137126232 4096 0.08
>> 13.199519 md0_raid1 5972 nvme6n1 W 64 4096 0.08
>> 13.199617 md0_raid1 5972 nvme6n1 W 1216062808 4096 0.01
>> ... (508 ios later)
>> 13.208764 md0_raid1 5972 nvme6n1 W 16 4096 0.01
>> 13.208768 md0_raid1 5972 nvme6n1 W 32 4096 0.01
>> 13.208770 md0_raid1 5972 nvme6n1 W 24 4096 0.01
>> 13.208775 md0_raid1 5972 nvme6n1 W 40 4096 0.01
>> 13.208781 md0_raid1 5972 nvme6n1 W 48 4096 0.01
>> 13.208786 md0_raid1 5972 nvme6n1 W 56 4096 0.01
>> 13.208790 md0_raid1 5972 nvme6n1 W 64 4096 0.01
>> 13.208729 md0_raid1 5972 nvme3n1 W 1607847808 4096 0.01
>> 13.208747 md0_raid1 5972 nvme3n1 W 371214368 4096 0.01
>> 13.208770 md0_raid1 5972 nvme3n1 W 32 4096 0.01
>> 13.208789 md0_raid1 5972 nvme3n1 W 64 4096 0.01
>> 13.208952 md0_raid1 5972 nvme6n1 W 72 1536 0.17
>> 13.209079 md0_raid1 5972 nvme3n1 W 72 1536 0.29
>> 13.212216 md0_raid1 5972 nvme3n1 W 1146106480 4096 0.01
>> 13.212269 md0_raid1 5972 nvme6n1 W 1146106480 4096 0.06
>> 13.212368 md0_raid1 5972 nvme6n1 W 16 4096 0.01
>> 13.212365 md0_raid1 5972 nvme3n1 W 16 4096 0.01
>>
>>
>> Without 1/2: 6k iops
>> With 1/2: 52k iops
>> With 2/2 params as above: 31k iops
>>
>> The count calculation could use some improvement to close the iops gap to delay-based flushing
>>
>>>>
>>>> /* notes:
>>>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>>>> index b4e2d8b87b61..d25574e46283 100644
>>>> --- a/drivers/md/md.h
>>>> +++ b/drivers/md/md.h
>>>> @@ -497,6 +497,7 @@ struct mddev {
>>>> struct mutex mutex;
>>>> unsigned long chunksize;
>>>> unsigned long daemon_sleep; /* how many jiffies between updates? */
>>>> + unsigned int daemon_flush_chunks; /* how many dirty chunks between updates */
>>>> unsigned long max_write_behind; /* write-behind mode */
>>>> int external;
>>>> int nodes; /* Maximum number of nodes in the cluster */
>>>> --
>>>> 2.31.1
>>>>

2022-10-13 23:52:32

by Song Liu

Subject: Re: [PATCH 2/2] md/bitmap: Add chunk-count-based bitmap flushing

On Thu, Oct 13, 2022 at 3:19 PM Jonathan Derrick
<[email protected]> wrote:
>
>
>
> On 10/10/2022 12:18 PM, Song Liu wrote:
> > On Fri, Oct 7, 2022 at 11:58 AM Jonathan Derrick
> > <[email protected]> wrote:
> >>
> >>
> >>
> >> On 10/7/2022 11:50 AM, Song Liu wrote:
> >>> On Thu, Oct 6, 2022 at 3:09 PM Jonathan Derrick
> >>> <[email protected]> wrote:
> >>>
> >>> [...]
> >>>
> >>>> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
> >>>> index cfd7395de8fd..e0aeedbdde17 100644
> >>>> --- a/drivers/md/md-bitmap.h
> >>>> +++ b/drivers/md/md-bitmap.h
> >>>> @@ -11,10 +11,12 @@
> >>>> /* version 4 insists the bitmap is in little-endian order
> >>>> * with version 3, it is host-endian which is non-portable
> >>>> * Version 5 is currently set only for clustered devices
> >>>> + * Version 6 supports the flush-chunks threshold
> >>>> */
> >>>> #define BITMAP_MAJOR_HI 4
> >>>> #define BITMAP_MAJOR_CLUSTERED 5
> >>>> #define BITMAP_MAJOR_HOSTENDIAN 3
> >>>> +#define BITMAP_MAJOR_CHUNKFLUSH 6
> >>>>
> >>>> /*
> >>>> * in-memory bitmap:
> >>>> @@ -135,7 +137,8 @@ typedef struct bitmap_super_s {
> >>>> * reserved for the bitmap. */
> >>>> __le32 nodes; /* 68 the maximum number of nodes in cluster. */
> >>>> __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
> >>>> - __u8 pad[256 - 136]; /* set to zero */
> >>>> + __le32 daemon_flush_chunks; /* 136 dirty chunks between flushes */
> >>>> + __u8 pad[256 - 140]; /* set to zero */
> >>>> } bitmap_super_t;
> >>>
> >>> Do we really need this to be persistent? How about we configure it at run
> >>> time via a sysfs file?
> >>>
> >>> Also, please share more data on the performance benefit of the set.
> >>>
> >>> Thanks,
> >>> Song
> >>>
> >> Hi Song,
> >>
> >> Patch 1/2 changes default behavior, which patch 2/2 tries to address.
> >
> > Have you tried to evaluate the impact on the accuracy of the bitmap?
> > Specifically, if we power off the system during writes, do we see data
> > or parity mismatch that is not covered by the bitmap?
> Fair. I'm assuming this has to do with md_bitmap_init_from_disk()'s
> outofdate BITMAP_STALE check? My patch 1/2 would likely guarantee
> a full resync unless the system was lost just after the daemon wake
> time. However, patch 2/2 increases the likelihood of reading a good
> bitmap.

kernel test bot reported a failed mdadm test after 1/2. Could you please check
whether that's accurate?

>
>
> >
> >> I can change it to be configurable via sysfs instead.
> >> Should there be a default?
> >
> > If there is any impact on bitmap accuracy, I think the default should
> > work identically to how it worked before the set. IOW, we should not
> > delay the bitmap update.
> With results like mine, I'm under the impression bitmap=internal is not
> regularly used for write-heavy workloads [1].

It is pretty bad for really random writes, but it shouldn't be too bad
for normal workloads (where folks already optimize writes to be more
sequential).

>
> The thing is that it's not very consistent right now. I've had runs
> where the bitmap isn't updated for minutes until the run ends, and
> most runs where it's updated every other I/O or so. It seems to depend
> on the number of chunks relative to the device size (i.e., whether it
> can fit in a single page).
>
> I have v2 coming which should help fix a few of these inconsistencies.

Sounds great. Thanks!
Song

>
> [1] Similar results https://blog.liw.fi/posts/write-intent-bitmaps/
>
> >
> > Thanks,
> > Song
> >
> >>
> >>
> >> Here are my observations via biosnoop and RAID1, 4M chunksize, 238436 chunks, bitmap=internal
> >> fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --runtime=60
> >> --percentile_list=1.0:25.0:50.0:75.0:90.0:95.0:99.0:99.9:99.99:99.999999:100.0
> >>
> >>
> >> Default, bitmap updates happened concurrently with I/O:
> >> bw ( KiB/s): min=18690, max=30618, per=99.94%, avg=23822.07, stdev=2522.73, samples=119
> >> iops : min= 4672, max= 7654, avg=5955.20, stdev=630.71, samples=119
> >>
> >> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
> >> 38.090366 md0_raid1 4800 nvme6n1 W 40 4096 0.01
> >> 38.090423 md0_raid1 4800 nvme3n1 W 40 4096 0.07
> >> 38.090442 md0_raid1 4800 nvme3n1 W 1016633184 4096 0.01
> >> 38.090439 md0_raid1 4800 nvme6n1 W 1016633184 4096 0.01
> >> 38.090479 md0_raid1 4800 nvme6n1 W 56 4096 0.01
> >> 38.090493 md0_raid1 4800 nvme6n1 W 1449894256 4096 0.01
> >> 38.090477 md0_raid1 4800 nvme3n1 W 56 4096 0.01
> >> 38.090496 md0_raid1 4800 nvme3n1 W 1449894256 4096 0.01
> >> 38.090530 md0_raid1 4800 nvme3n1 W 16 4096 0.01
> >> 38.090555 md0_raid1 4800 nvme3n1 W 110493568 4096 0.01
> >> 38.090538 md0_raid1 4800 nvme6n1 W 16 4096 0.01
> >> 38.090551 md0_raid1 4800 nvme6n1 W 110493568 4096 0.01
> >> 38.090596 md0_raid1 4800 nvme6n1 W 56 4096 0.01
> >> 38.090647 md0_raid1 4800 nvme3n1 W 56 4096 0.06
> >> 38.090666 md0_raid1 4800 nvme3n1 W 1455846976 4096 0.01
> >> 38.090663 md0_raid1 4800 nvme6n1 W 1455846976 4096 0.01
> >> 38.090707 md0_raid1 4800 nvme6n1 W 64 4096 0.01
> >> 38.090699 md0_raid1 4800 nvme3n1 W 64 4096 0.01
> >> 38.090723 md0_raid1 4800 nvme3n1 W 1665013728 4096 0.01
> >> 38.090720 md0_raid1 4800 nvme6n1 W 1665013728 4096 0.01
> >> 38.090764 md0_raid1 4800 nvme6n1 W 64 4096 0.01
> >> 38.090812 md0_raid1 4800 nvme3n1 W 64 4096 0.06
> >> 38.090832 md0_raid1 4800 nvme3n1 W 1637994296 4096 0.01
> >> 38.090828 md0_raid1 4800 nvme6n1 W 1637994296 4096 0.01
> >>
> >>
> >>
> >>
> >> With patch 1/2, bitmaps only update on the 'delay' parameter (default 5s):
> >> bw ( KiB/s): min=135712, max=230938, per=100.00%, avg=209308.56, stdev=29254.31, samples=119
> >> iops : min=33928, max=57734, avg=52326.78, stdev=7313.57, samples=119
> >>
> >> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
> >> 16.292235 md0_raid1 4841 nvme6n1 W 297367432 4096 0.01
> >> 16.292258 md0_raid1 4841 nvme6n1 W 16 4096 0.01
> >> 16.292266 md0_raid1 4841 nvme6n1 W 24 4096 0.01
> >> 16.292277 md0_raid1 4841 nvme6n1 W 32 4096 0.01
> >> 16.292259 md0_raid1 4841 nvme3n1 W 16 4096 0.01
> >> 16.292280 md0_raid1 4841 nvme3n1 W 32 4096 0.01
> >> 16.292305 md0_raid1 4841 nvme3n1 W 56 4096 0.01
> >> 16.292286 md0_raid1 4841 nvme6n1 W 40 4096 0.01
> >> 16.292295 md0_raid1 4841 nvme6n1 W 48 4096 0.01
> >> 16.292326 md0_raid1 4841 nvme3n1 W 72 1536 0.01
> >> 16.292323 md0_raid1 4841 nvme6n1 W 64 4096 0.02
> >> 16.292326 md0_raid1 4841 nvme6n1 W 56 4096 0.03
> >> 16.292334 md0_raid1 4841 nvme6n1 W 72 1536 0.02
> >> 16.300697 md0_raid1 4841 nvme3n1 W 1297533744 4096 0.01
> >> 16.300702 md0_raid1 4841 nvme6n1 W 1297533744 4096 0.01
> >> 16.300803 md0_raid1 4841 nvme6n1 W 1649080856 4096 0.01
> >> 16.300798 md0_raid1 4841 nvme3n1 W 1649080856 4096 0.01
> >> 16.300823 md0_raid1 4841 nvme3n1 W 1539317792 4096 0.01
> >> 16.300845 md0_raid1 4841 nvme3n1 W 1634570232 4096 0.01
> >> 16.300867 md0_raid1 4841 nvme3n1 W 579232208 4096 0.01
> >> 16.300889 md0_raid1 4841 nvme3n1 W 1818140424 4096 0.01
> >> 16.300922 md0_raid1 4841 nvme3n1 W 412971920 4096 0.02
> >> ...
> >> 21.293225 md0_raid1 4841 nvme3n1 W 1279122360 4096 0.01
> >> 21.293242 md0_raid1 4841 nvme3n1 W 40326272 4096 0.01
> >> 21.293223 md0_raid1 4841 nvme6n1 W 1279122360 4096 0.01
> >> 21.293243 md0_raid1 4841 nvme6n1 W 40326272 4096 0.01
> >> 21.293261 md0_raid1 4841 nvme6n1 W 16 4096 0.01
> >> 21.293266 md0_raid1 4841 nvme6n1 W 24 4096 0.01
> >> 21.293271 md0_raid1 4841 nvme6n1 W 32 4096 0.01
> >> 21.293275 md0_raid1 4841 nvme3n1 W 32 4096 0.01
> >> 21.293292 md0_raid1 4841 nvme3n1 W 48 4096 0.01
> >> 21.293296 md0_raid1 4841 nvme3n1 W 56 4096 0.01
> >> 21.293309 md0_raid1 4841 nvme3n1 W 72 1536 0.01
> >> 21.293266 md0_raid1 4841 nvme3n1 W 24 4096 0.01
> >> 21.293326 md0_raid1 4841 nvme6n1 W 48 4096 0.05
> >> 21.293328 md0_raid1 4841 nvme6n1 W 40 4096 0.06
> >> 21.293331 md0_raid1 4841 nvme6n1 W 72 1536 0.03
> >> 21.293333 md0_raid1 4841 nvme6n1 W 64 4096 0.04
> >> 21.293334 md0_raid1 4841 nvme6n1 W 56 4096 0.05
> >> 21.298526 md0_raid1 4841 nvme3n1 W 681973000 4096 0.01
> >>
> >>
> >>
> >>
> >> Good, but with the granularity of N seconds, it might be too infrequent.
> >> Here is chunk-flush=512 (2GB threshold in 4MB chunk size):
> >> bw ( KiB/s): min=92692, max=134904, per=100.00%, avg=125127.43, stdev=6758.51, samples=119
> >> iops : min=23173, max=33726, avg=31281.55, stdev=1689.63, samples=119
> >>
> >> TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
> >> 13.193339 md0_raid1 5972 nvme6n1 W 16 4096 0.01
> >> 13.193344 md0_raid1 5972 nvme6n1 W 32 4096 0.01
> >> 13.193346 md0_raid1 5972 nvme6n1 W 24 4096 0.01
> >> 13.193350 md0_raid1 5972 nvme6n1 W 40 4096 0.01
> >> 13.193356 md0_raid1 5972 nvme6n1 W 48 4096 0.01
> >> 13.193361 md0_raid1 5972 nvme6n1 W 64 4096 0.01
> >> 13.193363 md0_raid1 5972 nvme6n1 W 56 4096 0.01
> >> 13.193555 md0_raid1 5972 nvme6n1 W 72 1536 0.20
> >> 13.193289 md0_raid1 5972 nvme3n1 W 1912285848 4096 0.01
> >> 13.193306 md0_raid1 5972 nvme3n1 W 836455896 4096 0.01
> >> 13.193323 md0_raid1 5972 nvme3n1 W 233728136 4096 0.01
> >> 13.193339 md0_raid1 5972 nvme3n1 W 16 4096 0.01
> >> 13.193344 md0_raid1 5972 nvme3n1 W 24 4096 0.01
> >> 13.193362 md0_raid1 5972 nvme3n1 W 48 4096 0.01
> >> 13.193365 md0_raid1 5972 nvme3n1 W 64 4096 0.01
> >> 13.193366 md0_raid1 5972 nvme3n1 W 56 4096 0.01
> >> 13.193574 md0_raid1 5972 nvme3n1 W 72 1536 0.21
> >> 13.196759 md0_raid1 5972 nvme3n1 W 89571592 4096 0.01
> >> 13.196810 md0_raid1 5972 nvme6n1 W 89571592 4096 0.06
> >> 13.196913 md0_raid1 5972 nvme6n1 W 16 4096 0.01
> >> 13.196910 md0_raid1 5972 nvme3n1 W 16 4096 0.01
> >> 13.199444 md0_raid1 5972 nvme3n1 W 64 4096 0.01
> >> 13.199447 md0_raid1 5972 nvme3n1 W 137126232 4096 0.01
> >> 13.199515 md0_raid1 5972 nvme6n1 W 137126232 4096 0.08
> >> 13.199519 md0_raid1 5972 nvme6n1 W 64 4096 0.08
> >> 13.199617 md0_raid1 5972 nvme6n1 W 1216062808 4096 0.01
> >> ... (508 ios later)
> >> 13.208764 md0_raid1 5972 nvme6n1 W 16 4096 0.01
> >> 13.208768 md0_raid1 5972 nvme6n1 W 32 4096 0.01
> >> 13.208770 md0_raid1 5972 nvme6n1 W 24 4096 0.01
> >> 13.208775 md0_raid1 5972 nvme6n1 W 40 4096 0.01
> >> 13.208781 md0_raid1 5972 nvme6n1 W 48 4096 0.01
> >> 13.208786 md0_raid1 5972 nvme6n1 W 56 4096 0.01
> >> 13.208790 md0_raid1 5972 nvme6n1 W 64 4096 0.01
> >> 13.208729 md0_raid1 5972 nvme3n1 W 1607847808 4096 0.01
> >> 13.208747 md0_raid1 5972 nvme3n1 W 371214368 4096 0.01
> >> 13.208770 md0_raid1 5972 nvme3n1 W 32 4096 0.01
> >> 13.208789 md0_raid1 5972 nvme3n1 W 64 4096 0.01
> >> 13.208952 md0_raid1 5972 nvme6n1 W 72 1536 0.17
> >> 13.209079 md0_raid1 5972 nvme3n1 W 72 1536 0.29
> >> 13.212216 md0_raid1 5972 nvme3n1 W 1146106480 4096 0.01
> >> 13.212269 md0_raid1 5972 nvme6n1 W 1146106480 4096 0.06
> >> 13.212368 md0_raid1 5972 nvme6n1 W 16 4096 0.01
> >> 13.212365 md0_raid1 5972 nvme3n1 W 16 4096 0.01
> >>
> >>
> >> Without 1/2: 6k iops
> >> With 1/2: 52k iops
> >> With 2/2 params as above: 31k iops
> >>
> >> The count calculation could use some improvement to close the iops gap to delay-based flushing
> >>
> >>>>
> >>>> /* notes:
> >>>> diff --git a/drivers/md/md.h b/drivers/md/md.h
> >>>> index b4e2d8b87b61..d25574e46283 100644
> >>>> --- a/drivers/md/md.h
> >>>> +++ b/drivers/md/md.h
> >>>> @@ -497,6 +497,7 @@ struct mddev {
> >>>> struct mutex mutex;
> >>>> unsigned long chunksize;
> >>>> unsigned long daemon_sleep; /* how many jiffies between updates? */
> >>>> + unsigned int daemon_flush_chunks; /* how many dirty chunks between updates */
> >>>> unsigned long max_write_behind; /* write-behind mode */
> >>>> int external;
> >>>> int nodes; /* Maximum number of nodes in the cluster */
> >>>> --
> >>>> 2.31.1
> >>>>