2020-02-20 18:05:30

by Lukas Straub

Subject: [PATCH] dm-integrity: Prevent RMW for full tag area writes

If a full tag area is being written, don't read it first. This prevents a
read-modify-write cycle and increases performance on HDDs considerably.

To do this, we now calculate the checksums for all sectors in the bio in one
go in integrity_metadata and then pass the result to dm_integrity_rw_tag,
which now checks whether we overwrite the whole tag area.

Benchmarking with a 5400RPM HDD in bitmap mode:
integritysetup format --no-wipe --batch-mode --interleave-sectors $((64*1024)) -t 4 -s 512 -I crc32c -B /dev/sdc
integritysetup open -I crc32c -B /dev/sdc hdda_integ
dd if=/dev/zero of=/dev/mapper/hdda_integ bs=64K count=$((16*1024*4)) conv=fsync oflag=direct status=progress

Without patch:
4294967296 bytes (4.3 GB, 4.0 GiB) copied, 400.326 s, 10.7 MB/s

With patch:
4294967296 bytes (4.3 GB, 4.0 GiB) copied, 41.2057 s, 104 MB/s

Signed-off-by: Lukas Straub <[email protected]>
---
drivers/md/dm-integrity.c | 80 ++++++++++++++++++++++-----------------
1 file changed, 46 insertions(+), 34 deletions(-)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index b225b3e445fa..0e5ddcf44935 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1309,9 +1309,16 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
if (unlikely(r))
return r;

- data = dm_bufio_read(ic->bufio, *metadata_block, &b);
- if (IS_ERR(data))
- return PTR_ERR(data);
+ /* Don't read tag area from disk if we're going to overwrite it completely */
+ if (op == TAG_WRITE && *metadata_offset == 0 && total_size >= ic->metadata_run) {
+ data = dm_bufio_new(ic->bufio, *metadata_block, &b);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ } else {
+ data = dm_bufio_read(ic->bufio, *metadata_block, &b);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ }

to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
dp = data + *metadata_offset;
@@ -1514,6 +1521,8 @@ static void integrity_metadata(struct work_struct *w)
{
struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
struct dm_integrity_c *ic = dio->ic;
+ unsigned sectors_to_process = dio->range.n_sectors;
+ sector_t sector = dio->range.logical_sector;

int r;

@@ -1522,16 +1531,14 @@ static void integrity_metadata(struct work_struct *w)
struct bio_vec bv;
unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
- char *checksums;
+ char *checksums, *checksums_ptr;
unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
char checksums_onstack[HASH_MAX_DIGESTSIZE];
- unsigned sectors_to_process = dio->range.n_sectors;
- sector_t sector = dio->range.logical_sector;

if (unlikely(ic->mode == 'R'))
goto skip_io;

- checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
+ checksums = kmalloc((dio->range.n_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
if (!checksums) {
checksums = checksums_onstack;
@@ -1542,49 +1549,45 @@ static void integrity_metadata(struct work_struct *w)
}
}

+ checksums_ptr = checksums;
__bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
unsigned pos;
- char *mem, *checksums_ptr;
-
-again:
+ char *mem;
mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset;
pos = 0;
- checksums_ptr = checksums;
do {
integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
- checksums_ptr += ic->tag_size;
- sectors_to_process -= ic->sectors_per_block;
+
+ if (likely(checksums != checksums_onstack)) {
+ checksums_ptr += ic->tag_size;
+ } else {
+ r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
+ ic->tag_size, !dio->write ? TAG_CMP : TAG_WRITE);
+ if (unlikely(r))
+ goto internal_hash_error;
+ }
+
pos += ic->sectors_per_block << SECTOR_SHIFT;
sector += ic->sectors_per_block;
- } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
+ sectors_to_process -= ic->sectors_per_block;
+ } while (pos < bv.bv_len && sectors_to_process);
kunmap_atomic(mem);

- r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
- checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
- if (unlikely(r)) {
- if (r > 0) {
- DMERR_LIMIT("Checksum failed at sector 0x%llx",
- (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
- r = -EILSEQ;
- atomic64_inc(&ic->number_of_mismatches);
- }
- if (likely(checksums != checksums_onstack))
- kfree(checksums);
- goto error;
- }
-
if (!sectors_to_process)
break;
+ }

- if (unlikely(pos < bv.bv_len)) {
- bv.bv_offset += pos;
- bv.bv_len -= pos;
- goto again;
+ if (likely(checksums != checksums_onstack)) {
+ r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
+ (dio->range.n_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size,
+ !dio->write ? TAG_CMP : TAG_WRITE);
+ if (unlikely(r)) {
+ kfree(checksums);
+ goto internal_hash_error;
}
+ kfree(checksums);
}

- if (likely(checksums != checksums_onstack))
- kfree(checksums);
} else {
struct bio_integrity_payload *bip = dio->orig_bi_integrity;

@@ -1615,6 +1618,13 @@ static void integrity_metadata(struct work_struct *w)
skip_io:
dec_in_flight(dio);
return;
+internal_hash_error:
+ if (r > 0) {
+ DMERR_LIMIT("Checksum failed at sector 0x%llx",
+ (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
+ r = -EILSEQ;
+ atomic64_inc(&ic->number_of_mismatches);
+ }
error:
dio->bi_status = errno_to_blk_status(r);
dec_in_flight(dio);
@@ -3019,6 +3029,8 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim
limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT);
}
+
+ blk_limits_io_opt(limits, (1U << ic->sb->log2_interleave_sectors));
}

static void calculate_journal_section_size(struct dm_integrity_c *ic)
--
2.20.1


2020-02-25 17:04:44

by Mikulas Patocka

Subject: Re: [dm-devel] [PATCH] dm-integrity: Prevent RMW for full tag area writes



On Thu, 20 Feb 2020, Lukas Straub wrote:

> @@ -1309,9 +1309,16 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
> if (unlikely(r))
> return r;
>
> - data = dm_bufio_read(ic->bufio, *metadata_block, &b);
> - if (IS_ERR(data))
> - return PTR_ERR(data);
> + /* Don't read tag area from disk if we're going to overwrite it completely */
> + if (op == TAG_WRITE && *metadata_offset == 0 && total_size >= ic->metadata_run) {

Hi

This is incorrect logic because ic->metadata_run is in units of
512-byte sectors and total_size is in bytes.

If I correct the bug and change it to "if (op == TAG_WRITE &&
*metadata_offset == 0 && total_size >= ic->metadata_run << SECTOR_SHIFT)",
then the benchmark doesn't show any performance advantage at all.
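
I.e., the corrected block would look roughly like this (untested sketch;
it also folds the duplicated IS_ERR handling into one path):

        /* total_size is in bytes, ic->metadata_run is in 512-byte sectors */
        if (op == TAG_WRITE && *metadata_offset == 0 &&
            total_size >= ic->metadata_run << SECTOR_SHIFT)
                data = dm_bufio_new(ic->bufio, *metadata_block, &b);
        else
                data = dm_bufio_read(ic->bufio, *metadata_block, &b);
        if (IS_ERR(data))
                return PTR_ERR(data);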

You would need much bigger bios to take advantage of this - for example,
if we have 4k block size and 64k metadata buffer size and 4-byte crc32,
there are 65536/4=16384 tags in one metadata buffer and we would need
16384*4096=64MiB bio to completely overwrite the metadata buffer. Such big
bios are not realistic.

Mikulas



2020-02-26 08:27:37

by Lukas Straub

Subject: Re: [dm-devel] [PATCH] dm-integrity: Prevent RMW for full tag area writes

On Tue, 25 Feb 2020 11:41:45 -0500 (EST)
Mikulas Patocka <[email protected]> wrote:

> On Thu, 20 Feb 2020, Lukas Straub wrote:
>
> > @@ -1309,9 +1309,16 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
> > if (unlikely(r))
> > return r;
> >
> > - data = dm_bufio_read(ic->bufio, *metadata_block, &b);
> > - if (IS_ERR(data))
> > - return PTR_ERR(data);
> > + /* Don't read tag area from disk if we're going to overwrite it completely */
> > + if (op == TAG_WRITE && *metadata_offset == 0 && total_size >= ic->metadata_run) {
>
> Hi
>
> This is incorrect logic because ic->metadata_run is in units of
> 512-byte sectors and total_size is in bytes.
>
> If I correct the bug and change it to "if (op == TAG_WRITE &&
> *metadata_offset == 0 && total_size >= ic->metadata_run << SECTOR_SHIFT)",
> then the benchmark doesn't show any performance advantage at all.

Uh oh, looking at it again I have mixed up sectors/bytes elsewhere too.
Actually, could we rewrite this check as
total_size >= (1U << SECTOR_SHIFT << ic->log2_buffer_sectors)?
That should work, right?
Then we would only need to overwrite part of the tag area, as long as we
overwrite whole metadata buffers.
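
On top of the patch, that would be something like (untested):

-	if (op == TAG_WRITE && *metadata_offset == 0 && total_size >= ic->metadata_run) {
+	if (op == TAG_WRITE && *metadata_offset == 0 &&
+	    total_size >= (1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {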

> You would need much bigger bios to take advantage of this - for example,
> if we have 4k block size and 64k metadata buffer size and 4-byte crc32,
> there are 65536/4=16384 tags in one metadata buffer and we would need
> 16384*4096=64MiB bio to completely overwrite the metadata buffer. Such big
> bios are not realistic.

What prevents us from using a single sector as the tag area? (Which was my assumption with this patch)
Then we would have (with 512b sectors) 512/4 = 128 tags = 64k bio, which is still below the optimal write
size of raid5/6.
I just tried to accomplish this, but there seems to be a minimum limit on interleave_sectors.

Regards,
Lukas Straub


2020-02-26 14:15:19

by Mikulas Patocka

Subject: Re: [dm-devel] [PATCH] dm-integrity: Prevent RMW for full tag area writes



On Wed, 26 Feb 2020, Lukas Straub wrote:

> > > - data = dm_bufio_read(ic->bufio, *metadata_block, &b);
> > > - if (IS_ERR(data))
> > > - return PTR_ERR(data);
> > > + /* Don't read tag area from disk if we're going to overwrite it completely */
> > > + if (op == TAG_WRITE && *metadata_offset == 0 && total_size >= ic->metadata_run) {
> >
> > Hi
> >
> > This is incorrect logic because ic->metadata_run is in units of
> > 512-byte sectors and total_size is in bytes.
> >
> > If I correct the bug and change it to "if (op == TAG_WRITE &&
> > *metadata_offset == 0 && total_size >= ic->metadata_run << SECTOR_SHIFT)",
> > then the benchmark doesn't show any performance advantage at all.
>
> Uh oh, looking at it again I have mixed up sectors/bytes elsewhere too.
> Actually, could we rewrite this check as
> total_size >= (1U << SECTOR_SHIFT << ic->log2_buffer_sectors)?
> That should work, right?
> Then we would only need to overwrite part of the tag area, as long as
> we overwrite whole metadata buffers.
>
> > You would need much bigger bios to take advantage of this - for example,
> > if we have 4k block size and 64k metadata buffer size and 4-byte crc32,
> > there are 65536/4=16384 tags in one metadata buffer and we would need
> > 16384*4096=64MiB bio to completely overwrite the metadata buffer. Such big
> > bios are not realistic.
>
> What prevents us from using a single sector as the tag area? (Which was

Single sector writes perform badly on SSDs (and on disks with 4k physical
sector size). We would need at least 4k.

There's another problem - using smaller metadata blocks will degrade read
performance, because we would need to issue more requests to read the
metadata.

> my assumption with this patch) Then we would have (with 512b sectors)
> 512/4 = 128 tags = 64k bio, which is still below the optimal write size

4096/4*4096 = 4MiB - it may be possible, but it's still large.

> of raid5/6. I just tried to accomplish this, but there seems to be
> a minimum limit on interleave_sectors.
>
> Regards,
> Lukas Straub

Mikulas

2020-02-27 12:09:07

by Lukas Straub

Subject: Re: [dm-devel] [PATCH] dm-integrity: Prevent RMW for full tag area writes

On Wed, 26 Feb 2020 09:14:31 -0500 (EST)
Mikulas Patocka <[email protected]> wrote:

> On Wed, 26 Feb 2020, Lukas Straub wrote:
>
> > > > - data = dm_bufio_read(ic->bufio, *metadata_block, &b);
> > > > - if (IS_ERR(data))
> > > > - return PTR_ERR(data);
> > > > + /* Don't read tag area from disk if we're going to overwrite it completely */
> > > > + if (op == TAG_WRITE && *metadata_offset == 0 && total_size >= ic->metadata_run) {
> > >
> > > Hi
> > >
> > > This is incorrect logic because ic->metadata_run is in units of
> > > 512-byte sectors and total_size is in bytes.
> > >
> > > If I correct the bug and change it to "if (op == TAG_WRITE &&
> > > *metadata_offset == 0 && total_size >= ic->metadata_run << SECTOR_SHIFT)",
> > > then the benchmark doesn't show any performance advantage at all.
> >
> > Uh oh, looking at it again I have mixed up sectors/bytes elsewhere too.
> > Actually, could we rewrite this check as
> > total_size >= (1U << SECTOR_SHIFT << ic->log2_buffer_sectors)?
> > That should work, right?
> > Then we would only need to overwrite part of the tag area, as long as
> > we overwrite whole metadata buffers.
> >
> > > You would need much bigger bios to take advantage of this - for example,
> > > if we have 4k block size and 64k metadata buffer size and 4-byte crc32,
> > > there are 65536/4=16384 tags in one metadata buffer and we would need
> > > 16384*4096=64MiB bio to completely overwrite the metadata buffer. Such big
> > > bios are not realistic.
> >
> > What prevents us from using a single sector as the tag area? (Which was
>
> Single sector writes perform badly on SSDs (and on disks with 4k physical
> sector size). We would need at least 4k.

People with SSDs can still use a large tag area.

> There's another problem - using smaller metadata blocks will degrade read
> performance, because we would need to issue more requests to read the
> metadata.

We can use prefetching; dm-bufio supports that.
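
A minimal sketch - the range variables here are made up for
illustration, only the dm_bufio_prefetch() call itself is the existing
API:

        /* hint dm-bufio to read ahead the metadata blocks for this bio */
        dm_bufio_prefetch(ic->bufio, first_metadata_block, n_metadata_blocks);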

> > my assumption with this patch) Then we would have (with 512b sectors)
> > 512/4 = 128 tags = 64k bio, which is still below the optimal write size
>
> 4096/4*4096 = 4MiB - it may be possible, but it's still large.

We don't have to fill the whole sector with metadata; we can, for
example, use just the first 512 bytes (giving 512/4*4096 = 512k writes).
The space overhead is negligible: for 1T of data we have 1G of metadata
if we fill the whole sector, and 8G of metadata if we use just the
first 512 bytes.
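
(Worked out, assuming 4k data blocks and 4-byte crc32 tags: 1TiB / 4KiB
= 2^28 blocks, and 2^28 * 4 bytes = 1GiB of tags when the metadata
sector is fully used; using only 512 of its 4096 bytes multiplies that
by 8, giving 8GiB.)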

Regards,
Lukas Straub
