2020-10-23 11:30:12

by Artem Blagodarenko

[permalink] [raw]
Subject: [PATCH] libfs: Fix DIO mode aligment

From: Alexey Lyashkov <[email protected]>

Bounce buffer must be extended to hold a whole disk block size.
Read/write operations with unaligned offsets is prohobined
in the DIO mode, so read/write blocks should be adjusted to
reflect it.

Change-Id: Ic573c9ff0d476028dd2293f8b814c6112705db0e
Signed-off-by: Alexey Lyashkov <[email protected]>
HPE-bug-id: LUS-9241
---
lib/ext2fs/io_manager.c | 5 +-
lib/ext2fs/unix_io.c | 296 +++++++++++++++++++++++++---------------
2 files changed, 190 insertions(+), 111 deletions(-)

diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index c395d615..84399a12 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -20,6 +20,9 @@
#include "ext2_fs.h"
#include "ext2fs.h"

+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+
errcode_t io_channel_set_options(io_channel channel, const char *opts)
{
errcode_t retval = 0;
@@ -128,7 +131,7 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
size_t size;

if (count == 0)
- size = io->block_size;
+ size = max(io->block_size, io->align);
else if (count > 0)
size = io->block_size * count;
else
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 628e60c3..53647a22 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -154,46 +154,30 @@ static char *safe_getenv(const char *arg)
/*
* Here are the raw I/O functions
*/
-static errcode_t raw_read_blk(io_channel channel,
+static errcode_t raw_aligned_read_blk(io_channel channel,
struct unix_private_data *data,
- unsigned long long block,
- int count, void *bufv)
+ ext2_loff_t location,
+ ssize_t size, void *bufv,
+ int *asize)
{
- errcode_t retval;
- ssize_t size;
- ext2_loff_t location;
int actual = 0;
+ errcode_t retval;
unsigned char *buf = bufv;
- ssize_t really_read = 0;
-
- size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
- data->io_stats.bytes_read += size;
- location = ((ext2_loff_t) block * channel->block_size) + data->offset;

- if (data->flags & IO_FLAG_FORCE_BOUNCE) {
- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
- retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
- goto error_out;
- }
- goto bounce_read;
- }
+#ifdef ALIGN_DEBUG
+ printf("raw_aligned_read_blk: %p %lu<>%lu\n", buf,
+ location, (unsigned long) size);
+#endif

#ifdef HAVE_PREAD64
/* Try an aligned pread */
- if ((channel->align == 0) ||
- (IS_ALIGNED(buf, channel->align) &&
- IS_ALIGNED(size, channel->align))) {
- actual = pread64(data->dev, buf, size, location);
- if (actual == size)
- return 0;
- actual = 0;
- }
+ actual = pread64(data->dev, buf, size, location);
+ if (actual == size)
+ return 0;
+ actual = 0;
#elif HAVE_PREAD
/* Try an aligned pread */
- if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
- ((channel->align == 0) ||
- (IS_ALIGNED(buf, channel->align) &&
- IS_ALIGNED(size, channel->align)))) {
+ if (sizeof(off_t) >= sizeof(ext2_loff_t)) {
actual = pread(data->dev, buf, size, location);
if (actual == size)
return 0;
@@ -205,47 +189,100 @@ static errcode_t raw_read_blk(io_channel channel,
retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
goto error_out;
}
+
+ actual = read(data->dev, buf, size);
+ if (actual != size) {
+ if (actual < 0) {
+ retval = errno;
+ actual = 0;
+ } else {
+ retval = EXT2_ET_SHORT_READ;
+ }
+ }
+
+error_out:
+ *asize = actual;
+ return retval;
+}
+
+
+/*
+ * Here are the raw I/O functions
+ */
+static errcode_t raw_read_blk(io_channel channel,
+ struct unix_private_data *data,
+ unsigned long long block,
+ int count, void *bufv)
+{
+ errcode_t retval;
+ ssize_t size;
+ ext2_loff_t location;
+ int actual = 0;
+ unsigned char *buf = bufv;
+ unsigned int align = channel->align;
+ ssize_t really_read = 0;
+ blk64_t blk;
+ loff_t offset;
+
+ size = (count < 0) ? -count : count * channel->block_size;
+ data->io_stats.bytes_read += size;
+ location = ((ext2_loff_t) block * channel->block_size) + data->offset;
+
+ if (data->flags & IO_FLAG_FORCE_BOUNCE) {
+ align = channel->block_size;
+ goto bounce_read;
+ }
+
if ((channel->align == 0) ||
- (IS_ALIGNED(buf, channel->align) &&
+ (IS_ALIGNED(location, channel->align) &&
+ IS_ALIGNED(buf, channel->align) &&
IS_ALIGNED(size, channel->align))) {
- actual = read(data->dev, buf, size);
- if (actual != size) {
- short_read:
- if (actual < 0) {
- retval = errno;
- actual = 0;
- } else
- retval = EXT2_ET_SHORT_READ;
+ retval = raw_aligned_read_blk(channel, data, location,
+ size, bufv, &actual);
+ if (retval != 0)
goto error_out;
- }
- return 0;
+
+ return retval;
}

+bounce_read:
#ifdef ALIGN_DEBUG
- printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
- (unsigned long) size);
+ printf("raw_read_blk: O_DIRECT fallback: %p %lu<>%lu\n", buf,
+ location, (unsigned long) size);
#endif
-
/*
* The buffer or size which we're trying to read isn't aligned
* to the O_DIRECT rules, so we need to do this the hard way...
+ * read / write must be aligned to the block device sector size
*/
-bounce_read:
+
+ blk = location / align;
+ offset = location % align;
+
+ if (lseek(data->dev, blk * align, SEEK_SET) < 0)
+ return errno;
+
while (size > 0) {
- actual = read(data->dev, data->bounce, channel->block_size);
- if (actual != channel->block_size) {
+ actual = read(data->dev, data->bounce, align);
+ if (actual != align) {
actual = really_read;
buf -= really_read;
size += really_read;
- goto short_read;
+ retval = EXT2_ET_SHORT_READ;
+ goto error_out;
}
actual = size;
- if (size > channel->block_size)
- actual = channel->block_size;
- memcpy(buf, data->bounce, actual);
+ if ((actual + offset) > align)
+ actual = align - offset;
+ if (actual > size)
+ actual = size;
+
+ memcpy(buf, data->bounce + offset, actual);
really_read += actual;
size -= actual;
buf += actual;
+ offset = 0;
+ blk++;
}
return 0;

@@ -258,6 +295,58 @@ error_out:
return retval;
}

+static errcode_t raw_aligned_write_blk(io_channel channel,
+ struct unix_private_data *data,
+ ext2_loff_t location,
+ ssize_t size, const void *bufv,
+ int *asize)
+
+{
+ int actual = 0;
+ errcode_t retval;
+ const unsigned char *buf = (void *)bufv;
+
+#ifdef ALIGN_DEBUG
+ printf("raw_aligned_write_blk: %p %lu %lu\n", buf,
+ location, (unsigned long) size);
+#endif
+
+#ifdef HAVE_PWRITE64
+ /* Try an aligned pwrite */
+ actual = pwrite64(data->dev, buf, size, location);
+ if (actual == size)
+ return 0;
+#elif HAVE_PWRITE
+ /* Try an aligned pwrite */
+ if ((sizeof(off_t) >= sizeof(ext2_loff_t)) {
+ actual = pwrite(data->dev, buf, size, location);
+ if (actual == size)
+ return 0;
+ }
+#endif /* HAVE_PWRITE */
+
+ if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+ retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+ goto error_out;
+ }
+
+ actual = write(data->dev, buf, size);
+ if (actual < 0) {
+ retval = errno;
+ goto error_out;
+ }
+ if (actual != size) {
+ retval = EXT2_ET_SHORT_WRITE;
+ goto error_out;
+ }
+ retval = 0;
+error_out:
+ *asize = actual;
+ return retval;
+
+}
+
+
static errcode_t raw_write_blk(io_channel channel,
struct unix_private_data *data,
unsigned long long block,
@@ -268,6 +357,9 @@ static errcode_t raw_write_blk(io_channel channel,
int actual = 0;
errcode_t retval;
const unsigned char *buf = bufv;
+ unsigned int align = channel->align;
+ blk64_t blk;
+ loff_t offset;

if (count == 1)
size = channel->block_size;
@@ -282,95 +374,79 @@ static errcode_t raw_write_blk(io_channel channel,
location = ((ext2_loff_t) block * channel->block_size) + data->offset;

if (data->flags & IO_FLAG_FORCE_BOUNCE) {
- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
- retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
- goto error_out;
- }
+ align = channel->block_size;
goto bounce_write;
}

-#ifdef HAVE_PWRITE64
- /* Try an aligned pwrite */
if ((channel->align == 0) ||
- (IS_ALIGNED(buf, channel->align) &&
+ (IS_ALIGNED(location, channel->align) &&
+ IS_ALIGNED(buf, channel->align) &&
IS_ALIGNED(size, channel->align))) {
- actual = pwrite64(data->dev, buf, size, location);
- if (actual == size)
- return 0;
- }
-#elif HAVE_PWRITE
- /* Try an aligned pwrite */
- if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
- ((channel->align == 0) ||
- (IS_ALIGNED(buf, channel->align) &&
- IS_ALIGNED(size, channel->align)))) {
- actual = pwrite(data->dev, buf, size, location);
- if (actual == size)
- return 0;
- }
-#endif /* HAVE_PWRITE */
+ retval = raw_aligned_write_blk(channel, data, location,
+ size, bufv, &actual);
+ if (retval != 0)
+ goto error_out;

- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
- retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
- goto error_out;
+ return retval;
}

- if ((channel->align == 0) ||
- (IS_ALIGNED(buf, channel->align) &&
- IS_ALIGNED(size, channel->align))) {
- actual = write(data->dev, buf, size);
- if (actual < 0) {
- retval = errno;
- goto error_out;
- }
- if (actual != size) {
- short_write:
- retval = EXT2_ET_SHORT_WRITE;
- goto error_out;
- }
- return 0;
- }

+bounce_write:
#ifdef ALIGN_DEBUG
printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
(unsigned long) size);
#endif
+
+ /* logical offset may don't aligned with block device block size */
+ blk = location / align;
+ offset = location % align;
+
+ if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
+ retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+ goto error_out;
+ }
+
/*
* The buffer or size which we're trying to write isn't aligned
* to the O_DIRECT rules, so we need to do this the hard way...
*/
-bounce_write:
while (size > 0) {
- if (size < channel->block_size) {
- actual = read(data->dev, data->bounce,
- channel->block_size);
- if (actual != channel->block_size) {
- if (actual < 0) {
- retval = errno;
- goto error_out;
- }
- memset((char *) data->bounce + actual, 0,
- channel->block_size - actual);
+ int actual_w;
+
+ memset((char *) data->bounce, 0, align);
+ if (offset || (size < align)) {
+ actual = read(data->dev, data->bounce, align);
+ if (actual < 0) {
+ retval = errno;
+ goto error_out;
}
}
actual = size;
- if (size > channel->block_size)
- actual = channel->block_size;
- memcpy(data->bounce, buf, actual);
- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+ if ((actual + offset) > align)
+ actual = align - offset;
+ if (actual > size)
+ actual = size;
+ memcpy(((char *)data->bounce) + offset, buf, actual);
+
+ if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
goto error_out;
}
- actual = write(data->dev, data->bounce, channel->block_size);
- if (actual < 0) {
+
+ actual_w = write(data->dev, data->bounce, align);
+ if (actual_w < 0) {
retval = errno;
goto error_out;
}
- if (actual != channel->block_size)
- goto short_write;
+ if (actual_w != align) {
+ retval = EXT2_ET_SHORT_WRITE;
+ goto error_out;
+ }
size -= actual;
buf += actual;
location += actual;
+ blk ++;
+ offset = 0;
}
return 0;

--
2.18.4


2020-11-17 15:32:00

by Artem Blagodarenko

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment

Hello,

Any thoughts about this change? Thanks.

Best regards,
Artem Blagodarenko.

> On 23 Oct 2020, at 14:26, Artem Blagodarenko <[email protected]> wrote:
>
> From: Alexey Lyashkov <[email protected]>
>
> Bounce buffer must be extended to hold a whole disk block size.
> Read/write operations with unaligned offsets is prohobined
> in the DIO mode, so read/write blocks should be adjusted to
> reflect it.
>
> Change-Id: Ic573c9ff0d476028dd2293f8b814c6112705db0e
> Signed-off-by: Alexey Lyashkov <[email protected]>
> HPE-bug-id: LUS-9241
> ---
> lib/ext2fs/io_manager.c | 5 +-
> lib/ext2fs/unix_io.c | 296 +++++++++++++++++++++++++---------------
> 2 files changed, 190 insertions(+), 111 deletions(-)
>
> diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
> index c395d615..84399a12 100644
> --- a/lib/ext2fs/io_manager.c
> +++ b/lib/ext2fs/io_manager.c
> @@ -20,6 +20,9 @@
> #include "ext2_fs.h"
> #include "ext2fs.h"
>
> +#define max(a, b) ((a) > (b) ? (a) : (b))
> +
> +
> errcode_t io_channel_set_options(io_channel channel, const char *opts)
> {
> errcode_t retval = 0;
> @@ -128,7 +131,7 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
> size_t size;
>
> if (count == 0)
> - size = io->block_size;
> + size = max(io->block_size, io->align);
> else if (count > 0)
> size = io->block_size * count;
> else
> diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
> index 628e60c3..53647a22 100644
> --- a/lib/ext2fs/unix_io.c
> +++ b/lib/ext2fs/unix_io.c
> @@ -154,46 +154,30 @@ static char *safe_getenv(const char *arg)
> /*
> * Here are the raw I/O functions
> */
> -static errcode_t raw_read_blk(io_channel channel,
> +static errcode_t raw_aligned_read_blk(io_channel channel,
> struct unix_private_data *data,
> - unsigned long long block,
> - int count, void *bufv)
> + ext2_loff_t location,
> + ssize_t size, void *bufv,
> + int *asize)
> {
> - errcode_t retval;
> - ssize_t size;
> - ext2_loff_t location;
> int actual = 0;
> + errcode_t retval;
> unsigned char *buf = bufv;
> - ssize_t really_read = 0;
> -
> - size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
> - data->io_stats.bytes_read += size;
> - location = ((ext2_loff_t) block * channel->block_size) + data->offset;
>
> - if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> - retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> - goto error_out;
> - }
> - goto bounce_read;
> - }
> +#ifdef ALIGN_DEBUG
> + printf("raw_aligned_read_blk: %p %lu<>%lu\n", buf,
> + location, (unsigned long) size);
> +#endif
>
> #ifdef HAVE_PREAD64
> /* Try an aligned pread */
> - if ((channel->align == 0) ||
> - (IS_ALIGNED(buf, channel->align) &&
> - IS_ALIGNED(size, channel->align))) {
> - actual = pread64(data->dev, buf, size, location);
> - if (actual == size)
> - return 0;
> - actual = 0;
> - }
> + actual = pread64(data->dev, buf, size, location);
> + if (actual == size)
> + return 0;
> + actual = 0;
> #elif HAVE_PREAD
> /* Try an aligned pread */
> - if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> - ((channel->align == 0) ||
> - (IS_ALIGNED(buf, channel->align) &&
> - IS_ALIGNED(size, channel->align)))) {
> + if (sizeof(off_t) >= sizeof(ext2_loff_t)) {
> actual = pread(data->dev, buf, size, location);
> if (actual == size)
> return 0;
> @@ -205,47 +189,100 @@ static errcode_t raw_read_blk(io_channel channel,
> retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> goto error_out;
> }
> +
> + actual = read(data->dev, buf, size);
> + if (actual != size) {
> + if (actual < 0) {
> + retval = errno;
> + actual = 0;
> + } else {
> + retval = EXT2_ET_SHORT_READ;
> + }
> + }
> +
> +error_out:
> + *asize = actual;
> + return retval;
> +}
> +
> +
> +/*
> + * Here are the raw I/O functions
> + */
> +static errcode_t raw_read_blk(io_channel channel,
> + struct unix_private_data *data,
> + unsigned long long block,
> + int count, void *bufv)
> +{
> + errcode_t retval;
> + ssize_t size;
> + ext2_loff_t location;
> + int actual = 0;
> + unsigned char *buf = bufv;
> + unsigned int align = channel->align;
> + ssize_t really_read = 0;
> + blk64_t blk;
> + loff_t offset;
> +
> + size = (count < 0) ? -count : count * channel->block_size;
> + data->io_stats.bytes_read += size;
> + location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> +
> + if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> + align = channel->block_size;
> + goto bounce_read;
> + }
> +
> if ((channel->align == 0) ||
> - (IS_ALIGNED(buf, channel->align) &&
> + (IS_ALIGNED(location, channel->align) &&
> + IS_ALIGNED(buf, channel->align) &&
> IS_ALIGNED(size, channel->align))) {
> - actual = read(data->dev, buf, size);
> - if (actual != size) {
> - short_read:
> - if (actual < 0) {
> - retval = errno;
> - actual = 0;
> - } else
> - retval = EXT2_ET_SHORT_READ;
> + retval = raw_aligned_read_blk(channel, data, location,
> + size, bufv, &actual);
> + if (retval != 0)
> goto error_out;
> - }
> - return 0;
> +
> + return retval;
> }
>
> +bounce_read:
> #ifdef ALIGN_DEBUG
> - printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
> - (unsigned long) size);
> + printf("raw_read_blk: O_DIRECT fallback: %p %lu<>%lu\n", buf,
> + location, (unsigned long) size);
> #endif
> -
> /*
> * The buffer or size which we're trying to read isn't aligned
> * to the O_DIRECT rules, so we need to do this the hard way...
> + * read / write must be aligned to the block device sector size
> */
> -bounce_read:
> +
> + blk = location / align;
> + offset = location % align;
> +
> + if (lseek(data->dev, blk * align, SEEK_SET) < 0)
> + return errno;
> +
> while (size > 0) {
> - actual = read(data->dev, data->bounce, channel->block_size);
> - if (actual != channel->block_size) {
> + actual = read(data->dev, data->bounce, align);
> + if (actual != align) {
> actual = really_read;
> buf -= really_read;
> size += really_read;
> - goto short_read;
> + retval = EXT2_ET_SHORT_READ;
> + goto error_out;
> }
> actual = size;
> - if (size > channel->block_size)
> - actual = channel->block_size;
> - memcpy(buf, data->bounce, actual);
> + if ((actual + offset) > align)
> + actual = align - offset;
> + if (actual > size)
> + actual = size;
> +
> + memcpy(buf, data->bounce + offset, actual);
> really_read += actual;
> size -= actual;
> buf += actual;
> + offset = 0;
> + blk++;
> }
> return 0;
>
> @@ -258,6 +295,58 @@ error_out:
> return retval;
> }
>
> +static errcode_t raw_aligned_write_blk(io_channel channel,
> + struct unix_private_data *data,
> + ext2_loff_t location,
> + ssize_t size, const void *bufv,
> + int *asize)
> +
> +{
> + int actual = 0;
> + errcode_t retval;
> + const unsigned char *buf = (void *)bufv;
> +
> +#ifdef ALIGN_DEBUG
> + printf("raw_aligned_write_blk: %p %lu %lu\n", buf,
> + location, (unsigned long) size);
> +#endif
> +
> +#ifdef HAVE_PWRITE64
> + /* Try an aligned pwrite */
> + actual = pwrite64(data->dev, buf, size, location);
> + if (actual == size)
> + return 0;
> +#elif HAVE_PWRITE
> + /* Try an aligned pwrite */
> + if ((sizeof(off_t) >= sizeof(ext2_loff_t)) {
> + actual = pwrite(data->dev, buf, size, location);
> + if (actual == size)
> + return 0;
> + }
> +#endif /* HAVE_PWRITE */
> +
> + if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> + retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> + goto error_out;
> + }
> +
> + actual = write(data->dev, buf, size);
> + if (actual < 0) {
> + retval = errno;
> + goto error_out;
> + }
> + if (actual != size) {
> + retval = EXT2_ET_SHORT_WRITE;
> + goto error_out;
> + }
> + retval = 0;
> +error_out:
> + *asize = actual;
> + return retval;
> +
> +}
> +
> +
> static errcode_t raw_write_blk(io_channel channel,
> struct unix_private_data *data,
> unsigned long long block,
> @@ -268,6 +357,9 @@ static errcode_t raw_write_blk(io_channel channel,
> int actual = 0;
> errcode_t retval;
> const unsigned char *buf = bufv;
> + unsigned int align = channel->align;
> + blk64_t blk;
> + loff_t offset;
>
> if (count == 1)
> size = channel->block_size;
> @@ -282,95 +374,79 @@ static errcode_t raw_write_blk(io_channel channel,
> location = ((ext2_loff_t) block * channel->block_size) + data->offset;
>
> if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> - retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> - goto error_out;
> - }
> + align = channel->block_size;
> goto bounce_write;
> }
>
> -#ifdef HAVE_PWRITE64
> - /* Try an aligned pwrite */
> if ((channel->align == 0) ||
> - (IS_ALIGNED(buf, channel->align) &&
> + (IS_ALIGNED(location, channel->align) &&
> + IS_ALIGNED(buf, channel->align) &&
> IS_ALIGNED(size, channel->align))) {
> - actual = pwrite64(data->dev, buf, size, location);
> - if (actual == size)
> - return 0;
> - }
> -#elif HAVE_PWRITE
> - /* Try an aligned pwrite */
> - if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> - ((channel->align == 0) ||
> - (IS_ALIGNED(buf, channel->align) &&
> - IS_ALIGNED(size, channel->align)))) {
> - actual = pwrite(data->dev, buf, size, location);
> - if (actual == size)
> - return 0;
> - }
> -#endif /* HAVE_PWRITE */
> + retval = raw_aligned_write_blk(channel, data, location,
> + size, bufv, &actual);
> + if (retval != 0)
> + goto error_out;
>
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> - retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> - goto error_out;
> + return retval;
> }
>
> - if ((channel->align == 0) ||
> - (IS_ALIGNED(buf, channel->align) &&
> - IS_ALIGNED(size, channel->align))) {
> - actual = write(data->dev, buf, size);
> - if (actual < 0) {
> - retval = errno;
> - goto error_out;
> - }
> - if (actual != size) {
> - short_write:
> - retval = EXT2_ET_SHORT_WRITE;
> - goto error_out;
> - }
> - return 0;
> - }
>
> +bounce_write:
> #ifdef ALIGN_DEBUG
> printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
> (unsigned long) size);
> #endif
> +
> + /* logical offset may don't aligned with block device block size */
> + blk = location / align;
> + offset = location % align;
> +
> + if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
> + retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> + goto error_out;
> + }
> +
> /*
> * The buffer or size which we're trying to write isn't aligned
> * to the O_DIRECT rules, so we need to do this the hard way...
> */
> -bounce_write:
> while (size > 0) {
> - if (size < channel->block_size) {
> - actual = read(data->dev, data->bounce,
> - channel->block_size);
> - if (actual != channel->block_size) {
> - if (actual < 0) {
> - retval = errno;
> - goto error_out;
> - }
> - memset((char *) data->bounce + actual, 0,
> - channel->block_size - actual);
> + int actual_w;
> +
> + memset((char *) data->bounce, 0, align);
> + if (offset || (size < align)) {
> + actual = read(data->dev, data->bounce, align);
> + if (actual < 0) {
> + retval = errno;
> + goto error_out;
> }
> }
> actual = size;
> - if (size > channel->block_size)
> - actual = channel->block_size;
> - memcpy(data->bounce, buf, actual);
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> + if ((actual + offset) > align)
> + actual = align - offset;
> + if (actual > size)
> + actual = size;
> + memcpy(((char *)data->bounce) + offset, buf, actual);
> +
> + if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
> retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> goto error_out;
> }
> - actual = write(data->dev, data->bounce, channel->block_size);
> - if (actual < 0) {
> +
> + actual_w = write(data->dev, data->bounce, align);
> + if (actual_w < 0) {
> retval = errno;
> goto error_out;
> }
> - if (actual != channel->block_size)
> - goto short_write;
> + if (actual_w != align) {
> + retval = EXT2_ET_SHORT_WRITE;
> + goto error_out;
> + }
> size -= actual;
> buf += actual;
> location += actual;
> + blk ++;
> + offset = 0;
> }
> return 0;
>
> --
> 2.18.4
>

2020-11-17 19:21:22

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment

On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
> Hello,
>
> Any thoughts about this change? Thanks.

I'm trying to think of situations where this could actually trigger in
real life. The only one I can think of is if a file system with a 1k
block file system is located on an an Advanced FormatDrive with a 4k
sector size.

What was the use case where this was actually an issue?

- Ted

2020-11-19 12:29:36

by Lyashkov, Alexey

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment

Tso,

This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.
e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
(many other places exist, but it simplest).
>>
if (superblock) {
if (!block_size) {
retval = EXT2_ET_INVALID_ARGUMENT;
goto cleanup;
}
io_channel_set_blksize(fs->io, block_size);
group_block = superblock;
fs->orig_super = 0;
} else {
io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET); <<<<< this is problem
superblock = 1;
group_block = 0;
retval = ext2fs_get_mem(SUPERBLOCK_SIZE, &fs->orig_super);
if (retval)
goto cleanup;
}
retval = io_channel_read_blk(fs->io, superblock, -SUPERBLOCK_SIZE,
fs->super);
>>
It caused errors like
# e2image -Q /dev/md65 /tmp/node05_image_out
e2image 1.45.6.cr1 (14-Aug-2020)
e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/md65
Couldn’t find valid filesystem superblock.

It looks like I don't first person to found a bug, as someone was add

Alex

On 17/11/2020, 22:19, "Theodore Y. Ts'o" <[email protected]> wrote:

On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
> Hello,
>
> Any thoughts about this change? Thanks.

I'm trying to think of situations where this could actually trigger in
real life. The only one I can think of is if a file system with a 1k
block file system is located on an an Advanced FormatDrive with a 4k
sector size.

What was the use case where this was actually an issue?

- Ted

2020-12-08 08:37:01

by Artem Blagodarenko

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment

Hello Theodore,

Is this case important enough to fix the code?

Thanks.
Best regards,
Artem Blagodarenko.


> On 19 Nov 2020, at 15:26, Lyashkov, Alexey <[email protected]> wrote:
>
> Tso,
>
> This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.
> e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
> (many other places exist, but it simplest).
>>>
> if (superblock) {
> if (!block_size) {
> retval = EXT2_ET_INVALID_ARGUMENT;
> goto cleanup;
> }
> io_channel_set_blksize(fs->io, block_size);
> group_block = superblock;
> fs->orig_super = 0;
> } else {
> io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET); <<<<< this is problem
> superblock = 1;
> group_block = 0;
> retval = ext2fs_get_mem(SUPERBLOCK_SIZE, &fs->orig_super);
> if (retval)
> goto cleanup;
> }
> retval = io_channel_read_blk(fs->io, superblock, -SUPERBLOCK_SIZE,
> fs->super);
>>>
> It caused errors like
> # e2image -Q /dev/md65 /tmp/node05_image_out
> e2image 1.45.6.cr1 (14-Aug-2020)
> e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/md65
> Couldn’t find valid filesystem superblock.
>
> It looks like I don't first person to found a bug, as someone was add
>
> Alex
>
> On 17/11/2020, 22:19, "Theodore Y. Ts'o" <[email protected]> wrote:
>
> On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
>> Hello,
>>
>> Any thoughts about this change? Thanks.
>
> I'm trying to think of situations where this could actually trigger in
> real life. The only one I can think of is if a file system with a 1k
> block file system is located on an an Advanced FormatDrive with a 4k
> sector size.
>
> What was the use case where this was actually an issue?
>
> - Ted
>

2020-12-18 22:08:13

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment

On Nov 19, 2020, at 5:26 AM, Lyashkov, Alexey <[email protected]> wrote:
>
> Tso,
>
> This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.

It would be useful to include this patch for e2image as part of this submission,
so that this can be tested. I suspect that O_DIRECT would be useful for other
tools (e.g. e2fsck, debugfs, etc.) since the IO manager would avoid double
buffering the data in both the kernel and userspace.

> e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
> (many other places exist, but it simplest).

Are there actually other places where it is doing sub-block-size reads from disk?

It seems simpler to fix the superblock read at open to always read the first 4KB
into a buffer (and to make it easy to extend to 16KB or 64KB if sector sizes get
even larger), then find the superblock within the buffer to decide the blocksize.

That avoids the short/unaligned read from disk when opening the filesystem, without
the need to add complexity to the reading code to buffer all unaligned reads, for
a case that doesn't seem likely otherwise. The only other possibility I can think
that would need this is a small-block filesystem image (e.g. 1KB) copied to a
large-sector device? It isn't clear if the kernel would be able to mount that...

Cheers, Andreas

> if (superblock) {
> if (!block_size) {
> retval = EXT2_ET_INVALID_ARGUMENT;
> goto cleanup;
> }
> io_channel_set_blksize(fs->io, block_size);
> group_block = superblock;
> fs->orig_super = 0;
> } else {
> io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET); <<<<< this is problem
> superblock = 1;
> group_block = 0;
> retval = ext2fs_get_mem(SUPERBLOCK_SIZE, &fs->orig_super);
> if (retval)
> goto cleanup;
> }
> retval = io_channel_read_blk(fs->io, superblock, -SUPERBLOCK_SIZE,
> fs->super);
>
> It caused errors like
> # e2image -Q /dev/md65 /tmp/node05_image_out
> e2image 1.45.6.cr1 (14-Aug-2020)
> e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/md65
> Couldn’t find valid filesystem superblock.
>
> It looks like I don't first person to found a bug, as someone was add
>
> Alex
>
> On 17/11/2020, 22:19, "Theodore Y. Ts'o" <[email protected]> wrote:
>
> On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
>> Hello,
>>
>> Any thoughts about this change? Thanks.
>
> I'm trying to think of situations where this could actually trigger in
> real life. The only one I can think of is if a file system with a 1k
> block file system is located on an an Advanced FormatDrive with a 4k
> sector size.
>
> What was the use case where this was actually an issue?
>
> - Ted
>


Cheers, Andreas






Attachments:
signature.asc (890.00 B)
Message signed with OpenPGP

2020-12-19 04:34:11

by Lyashkov, Alexey

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment



On 19/12/2020, 01:03, "Andreas Dilger" <[email protected]> wrote:

On Nov 19, 2020, at 5:26 AM, Lyashkov, Alexey <[email protected]> wrote:
>
> Tso,
>
> This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.

> It would be useful to include this patch for e2image as part of this submission,
> so that this can be tested. I suspect that O_DIRECT would be useful for other
> tools (e.g. e2fsck, debugfs, etc.) since the IO manager would avoid double
> buffering the data in both the kernel and userspace.

debugfs have a -D option already. As about e2fsck have run in single user and several loops over FS exist.
So caching is good to have there. Don't forget - caching permits an readahead works - which is very usefull for the large filesystem open.



> e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
> (many other places exist, but it simplest).

> Are there actually other places where it is doing sub-block-size reads from disk?
Many places.

bash-3.2$ grep -rn io_channel_set_blksize * | grep SUPERBLOCK
lib/ext2fs/undo_io.c:223: io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);
lib/ext2fs/undo_io.c:506: io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);
lib/ext2fs/closefs.c:201: io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET);
lib/ext2fs/openfs.c:218: io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET);
misc/mke2fs.c:2573: io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);
misc/e2undo.c:168: io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);

and some places where set_blksize was called with other size different than block device size.
In theory we can create an FS with 1K block size, and tools should able to work with it.


> It seems simpler to fix the superblock read at open to always read the first 4KB
> into a buffer (and to make it easy to extend to 16KB or 64KB if sector sizes get
> even larger), then find the superblock within the buffer to decide the blocksize.

And make it on many places including an metadata reading in case FS block size is 1k.




2021-02-26 23:19:29

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment

I've rewritten the patch because it was simplest way to review the
code. The resulting code is simpler and has a smaller number of lines
of code. I don't have any 4k advanced format disks handy, so I'd
appreciate it if someone can test it. It passes the existing
regression tests, and I've run a number of manual tests to simulate a
advanced format HDD, with the tests being run with clang and UBSAN and
ADDRSAN enabled.

If someone with access to an advanced format disk can test running
debugfs -D on an advanced format disk, that would be great, thanks.

- Ted

commit c001596110e834a85b01a47a20811b318cb3b9e4
Author: Theodore Ts'o <[email protected]>
Date: Fri Feb 26 17:41:06 2021 -0500

libext2fs: fix unix_io's Direct I/O support

The previous Direct I/O support worked on HDD's with 512 byte logical
sector sizes, and on FreeBSD which required 4k aligned memory buffers.
However, it was incomplete and was not correctly working on HDD's with
4k logical sector sizes (aka Advanced Format Disks).

Based on a patch from Alexey Lyashkov <[email protected]> but
rewritten to work with the latest e2fsprogs and to minimize changes to
make it easier to review.

Signed-off-by: Theodore Ts'o <[email protected]>
Reported-by: Artem Blagodarenko <[email protected]>

diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index c395d615..996c31a1 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -134,9 +134,11 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
else
size = -count;

- if (io->align)
+ if (io->align) {
+ if (io->align > size)
+ size = io->align;
return ext2fs_get_memalign(size, io->align, ptr);
- else
+ } else
return ext2fs_get_mem(size, ptr);
}

diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 73f5667e..8965535c 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -218,6 +218,8 @@ static errcode_t raw_read_blk(io_channel channel,
int actual = 0;
unsigned char *buf = bufv;
ssize_t really_read = 0;
+ unsigned long long aligned_blk;
+ int align_size, offset;

size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
mutex_lock(data, STATS_MTX);
@@ -226,7 +228,7 @@ static errcode_t raw_read_blk(io_channel channel,
location = ((ext2_loff_t) block * channel->block_size) + data->offset;

if (data->flags & IO_FLAG_FORCE_BOUNCE) {
- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+ if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
goto error_out;
}
@@ -237,6 +239,7 @@ static errcode_t raw_read_blk(io_channel channel,
/* Try an aligned pread */
if ((channel->align == 0) ||
(IS_ALIGNED(buf, channel->align) &&
+ IS_ALIGNED(location, channel->align) &&
IS_ALIGNED(size, channel->align))) {
actual = pread64(data->dev, buf, size, location);
if (actual == size)
@@ -248,6 +251,7 @@ static errcode_t raw_read_blk(io_channel channel,
if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
((channel->align == 0) ||
(IS_ALIGNED(buf, channel->align) &&
+ IS_ALIGNED(location, channel->align) &&
IS_ALIGNED(size, channel->align)))) {
actual = pread(data->dev, buf, size, location);
if (actual == size)
@@ -256,12 +260,13 @@ static errcode_t raw_read_blk(io_channel channel,
}
#endif /* HAVE_PREAD */

- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+ if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
goto error_out;
}
if ((channel->align == 0) ||
(IS_ALIGNED(buf, channel->align) &&
+ IS_ALIGNED(location, channel->align) &&
IS_ALIGNED(size, channel->align))) {
actual = read(data->dev, buf, size);
if (actual != size) {
@@ -286,23 +291,39 @@ static errcode_t raw_read_blk(io_channel channel,
* to the O_DIRECT rules, so we need to do this the hard way...
*/
bounce_read:
+ if ((channel->block_size > channel->align) &&
+ (channel->block_size % channel->align) == 0)
+ align_size = channel->block_size;
+ else
+ align_size = channel->align;
+ aligned_blk = location / align_size;
+ offset = location % align_size;
+
+ if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
+ retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+ goto error_out;
+ }
while (size > 0) {
mutex_lock(data, BOUNCE_MTX);
- actual = read(data->dev, data->bounce, channel->block_size);
- if (actual != channel->block_size) {
+ actual = read(data->dev, data->bounce, align_size);
+ if (actual != align_size) {
mutex_unlock(data, BOUNCE_MTX);
actual = really_read;
buf -= really_read;
size += really_read;
goto short_read;
}
- actual = size;
- if (size > channel->block_size)
- actual = channel->block_size;
- memcpy(buf, data->bounce, actual);
+ if ((actual + offset) > align_size)
+ actual = align_size - offset;
+ if (actual > size)
+ actual = size;
+ memcpy(buf, data->bounce + offset, actual);
+
really_read += actual;
size -= actual;
buf += actual;
+ offset = 0;
+ aligned_blk++;
mutex_unlock(data, BOUNCE_MTX);
}
return 0;
@@ -326,6 +347,8 @@ static errcode_t raw_write_blk(io_channel channel,
int actual = 0;
errcode_t retval;
const unsigned char *buf = bufv;
+ unsigned long long aligned_blk;
+ int align_size, offset;

if (count == 1)
size = channel->block_size;
@@ -342,7 +365,7 @@ static errcode_t raw_write_blk(io_channel channel,
location = ((ext2_loff_t) block * channel->block_size) + data->offset;

if (data->flags & IO_FLAG_FORCE_BOUNCE) {
- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+ if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
goto error_out;
}
@@ -353,6 +376,7 @@ static errcode_t raw_write_blk(io_channel channel,
/* Try an aligned pwrite */
if ((channel->align == 0) ||
(IS_ALIGNED(buf, channel->align) &&
+ IS_ALIGNED(location, channel->align) &&
IS_ALIGNED(size, channel->align))) {
actual = pwrite64(data->dev, buf, size, location);
if (actual == size)
@@ -363,6 +387,7 @@ static errcode_t raw_write_blk(io_channel channel,
if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
((channel->align == 0) ||
(IS_ALIGNED(buf, channel->align) &&
+ IS_ALIGNED(location, channel->align) &&
IS_ALIGNED(size, channel->align)))) {
actual = pwrite(data->dev, buf, size, location);
if (actual == size)
@@ -370,13 +395,14 @@ static errcode_t raw_write_blk(io_channel channel,
}
#endif /* HAVE_PWRITE */

- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+ if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
goto error_out;
}

if ((channel->align == 0) ||
(IS_ALIGNED(buf, channel->align) &&
+ IS_ALIGNED(location, channel->align) &&
IS_ALIGNED(size, channel->align))) {
actual = write(data->dev, buf, size);
if (actual < 0) {
@@ -400,40 +426,59 @@ static errcode_t raw_write_blk(io_channel channel,
* to the O_DIRECT rules, so we need to do this the hard way...
*/
bounce_write:
+ if ((channel->block_size > channel->align) &&
+ (channel->block_size % channel->align) == 0)
+ align_size = channel->block_size;
+ else
+ align_size = channel->align;
+ aligned_blk = location / align_size;
+ offset = location % align_size;
+
while (size > 0) {
+ int actual_w;
+
mutex_lock(data, BOUNCE_MTX);
- if (size < channel->block_size) {
+ if (size < align_size || offset) {
+ if (ext2fs_llseek(data->dev, aligned_blk * align_size,
+ SEEK_SET) < 0) {
+ retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+ goto error_out;
+ }
actual = read(data->dev, data->bounce,
- channel->block_size);
- if (actual != channel->block_size) {
+ align_size);
+ if (actual != align_size) {
if (actual < 0) {
mutex_unlock(data, BOUNCE_MTX);
retval = errno;
goto error_out;
}
memset((char *) data->bounce + actual, 0,
- channel->block_size - actual);
+ align_size - actual);
}
}
actual = size;
- if (size > channel->block_size)
- actual = channel->block_size;
- memcpy(data->bounce, buf, actual);
- if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+ if ((actual + offset) > align_size)
+ actual = align_size - offset;
+ if (actual > size)
+ actual = size;
+ memcpy(((char *)data->bounce) + offset, buf, actual);
+ if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
goto error_out;
}
- actual = write(data->dev, data->bounce, channel->block_size);
+ actual_w = write(data->dev, data->bounce, align_size);
mutex_unlock(data, BOUNCE_MTX);
- if (actual < 0) {
+ if (actual_w < 0) {
retval = errno;
goto error_out;
}
- if (actual != channel->block_size)
+ if (actual_w != align_size)
goto short_write;
size -= actual;
buf += actual;
location += actual;
+ aligned_blk++;
+ offset = 0;
}
return 0;

2021-03-01 16:19:56

by Artem Blagodarenko

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment

Hello Ted!

Thank you very much for the new version of the fix.

The problem can be easily reproduced, using loop device.

Here is how I reproduced it originally (without the patch)

# truncate -s 512MB /tmp/lustre-ost
# losetup -b 4096 /dev/loop0 /tmp/lustre-ost
# mkfs.ext4 /dev/loop0
mke2fs 1.45.6.wc1 (20-Mar-2020)
detected raid stride 4096 too large, use optimum 512
Discarding device blocks: done
Creating filesystem with 125000 4k blocks and 125056 inodes
Filesystem UUID: 541ad524-556a-45be-84fe-5b68c1ca4562
Superblock backups stored on blocks:
32768, 98304

Allocating group tables: done
Writing inode tables: done
Creating journal (4096 blocks): done
Writing superblocks and filesystem accounting information: done

# git rev-parse HEAD
0f880f553dfa09dabe4cb03752b97a07b66eb998

[root@CO82 e2fsprogs-hpe]# misc/e2image /dev/loop0 /tmp/ost-image
e2image 1.45.2.cr2 (09-Apr-2020)
misc/e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/loop0
Couldn't find valid filesystem superblock.


With your patch e2image works fine.


[root@CO82 e2fsprogs-kernel]# truncate -s 512MB /tmp/lustre-ost
[root@CO82 e2fsprogs-kernel]# losetup -b 4096 /dev/loop0 /tmp/lustre-ost
[root@CO82 e2fsprogs-kernel]# mkfs.ext4 /dev/loop0
mke2fs 1.45.6.wc1 (20-Mar-2020)
detected raid stride 4096 too large, use optimum 512
Discarding device blocks: done
Creating filesystem with 125000 4k blocks and 125056 inodes
Filesystem UUID: 42f6fc7d-382a-4681-99b8-79f3fcd2b4bf
Superblock backups stored on blocks:
32768, 98304

Allocating group tables: done
Writing inode tables: done
Creating journal (4096 blocks): done
Writing superblocks and filesystem accounting information: done

[root@CO82 e2fsprogs-kernel]# git rev-parse HEAD
67f2b54667e65cf5a478fcea8b85722be9ee6e8d
[root@CO82 e2fsprogs-kernel]# misc/e2image /dev/loop0 /tmp/ost-image
e2image 1.46.1 (9-Feb-2021)

Thanks again.
Best regards,
Artem Blagodarenko.


> On 27 Feb 2021, at 02:17, Theodore Ts'o <[email protected]> wrote:
>
> I've rewritten the patch because it was simplest way to review the
> code. The resulting code is simpler and has a smaller number of lines
> of code. I don't have any 4k advanced format disks handy, so I'd
> appreciate it if someone can test it. It passes the existing
> regression tests, and I've run a number of manual tests to simulate a
> advanced format HDD, with the tests being run with clang and UBSAN and
> ADDRSAN enabled.
>
> If someone with access to an advanced format disk can test running
> debugfs -D on an advanced format disk, that would be great, thanks.
>
> - Ted
>
> commit c001596110e834a85b01a47a20811b318cb3b9e4
> Author: Theodore Ts'o <[email protected]>
> Date: Fri Feb 26 17:41:06 2021 -0500
>
> libext2fs: fix unix_io's Direct I/O support
>
> The previous Direct I/O support worked on HDD's with 512 byte logical
> sector sizes, and on FreeBSD which required 4k aligned memory buffers.
> However, it was incomplete and was not correctly working on HDD's with
> 4k logical sector sizes (aka Advanced Format Disks).
>
> Based on a patch from Alexey Lyashkov <[email protected]> but
> rewritten to work with the latest e2fsprogs and to minimize changes to
> make it easier to review.
>
> Signed-off-by: Theodore Ts'o <[email protected]>
> Reported-by: Artem Blagodarenko <[email protected]>
>
> diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
> index c395d615..996c31a1 100644
> --- a/lib/ext2fs/io_manager.c
> +++ b/lib/ext2fs/io_manager.c
> @@ -134,9 +134,11 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
> else
> size = -count;
>
> - if (io->align)
> + if (io->align) {
> + if (io->align > size)
> + size = io->align;
> return ext2fs_get_memalign(size, io->align, ptr);
> - else
> + } else
> return ext2fs_get_mem(size, ptr);
> }
>
> diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
> index 73f5667e..8965535c 100644
> --- a/lib/ext2fs/unix_io.c
> +++ b/lib/ext2fs/unix_io.c
> @@ -218,6 +218,8 @@ static errcode_t raw_read_blk(io_channel channel,
> int actual = 0;
> unsigned char *buf = bufv;
> ssize_t really_read = 0;
> + unsigned long long aligned_blk;
> + int align_size, offset;
>
> size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
> mutex_lock(data, STATS_MTX);
> @@ -226,7 +228,7 @@ static errcode_t raw_read_blk(io_channel channel,
> location = ((ext2_loff_t) block * channel->block_size) + data->offset;
>
> if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> + if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
> retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> goto error_out;
> }
> @@ -237,6 +239,7 @@ static errcode_t raw_read_blk(io_channel channel,
> /* Try an aligned pread */
> if ((channel->align == 0) ||
> (IS_ALIGNED(buf, channel->align) &&
> + IS_ALIGNED(location, channel->align) &&
> IS_ALIGNED(size, channel->align))) {
> actual = pread64(data->dev, buf, size, location);
> if (actual == size)
> @@ -248,6 +251,7 @@ static errcode_t raw_read_blk(io_channel channel,
> if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> ((channel->align == 0) ||
> (IS_ALIGNED(buf, channel->align) &&
> + IS_ALIGNED(location, channel->align) &&
> IS_ALIGNED(size, channel->align)))) {
> actual = pread(data->dev, buf, size, location);
> if (actual == size)
> @@ -256,12 +260,13 @@ static errcode_t raw_read_blk(io_channel channel,
> }
> #endif /* HAVE_PREAD */
>
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> + if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
> retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> goto error_out;
> }
> if ((channel->align == 0) ||
> (IS_ALIGNED(buf, channel->align) &&
> + IS_ALIGNED(location, channel->align) &&
> IS_ALIGNED(size, channel->align))) {
> actual = read(data->dev, buf, size);
> if (actual != size) {
> @@ -286,23 +291,39 @@ static errcode_t raw_read_blk(io_channel channel,
> * to the O_DIRECT rules, so we need to do this the hard way...
> */
> bounce_read:
> + if ((channel->block_size > channel->align) &&
> + (channel->block_size % channel->align) == 0)
> + align_size = channel->block_size;
> + else
> + align_size = channel->align;
> + aligned_blk = location / align_size;
> + offset = location % align_size;
> +
> + if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
> + retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> + goto error_out;
> + }
> while (size > 0) {
> mutex_lock(data, BOUNCE_MTX);
> - actual = read(data->dev, data->bounce, channel->block_size);
> - if (actual != channel->block_size) {
> + actual = read(data->dev, data->bounce, align_size);
> + if (actual != align_size) {
> mutex_unlock(data, BOUNCE_MTX);
> actual = really_read;
> buf -= really_read;
> size += really_read;
> goto short_read;
> }
> - actual = size;
> - if (size > channel->block_size)
> - actual = channel->block_size;
> - memcpy(buf, data->bounce, actual);
> + if ((actual + offset) > align_size)
> + actual = align_size - offset;
> + if (actual > size)
> + actual = size;
> + memcpy(buf, data->bounce + offset, actual);
> +
> really_read += actual;
> size -= actual;
> buf += actual;
> + offset = 0;
> + aligned_blk++;
> mutex_unlock(data, BOUNCE_MTX);
> }
> return 0;
> @@ -326,6 +347,8 @@ static errcode_t raw_write_blk(io_channel channel,
> int actual = 0;
> errcode_t retval;
> const unsigned char *buf = bufv;
> + unsigned long long aligned_blk;
> + int align_size, offset;
>
> if (count == 1)
> size = channel->block_size;
> @@ -342,7 +365,7 @@ static errcode_t raw_write_blk(io_channel channel,
> location = ((ext2_loff_t) block * channel->block_size) + data->offset;
>
> if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> + if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
> retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> goto error_out;
> }
> @@ -353,6 +376,7 @@ static errcode_t raw_write_blk(io_channel channel,
> /* Try an aligned pwrite */
> if ((channel->align == 0) ||
> (IS_ALIGNED(buf, channel->align) &&
> + IS_ALIGNED(location, channel->align) &&
> IS_ALIGNED(size, channel->align))) {
> actual = pwrite64(data->dev, buf, size, location);
> if (actual == size)
> @@ -363,6 +387,7 @@ static errcode_t raw_write_blk(io_channel channel,
> if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> ((channel->align == 0) ||
> (IS_ALIGNED(buf, channel->align) &&
> + IS_ALIGNED(location, channel->align) &&
> IS_ALIGNED(size, channel->align)))) {
> actual = pwrite(data->dev, buf, size, location);
> if (actual == size)
> @@ -370,13 +395,14 @@ static errcode_t raw_write_blk(io_channel channel,
> }
> #endif /* HAVE_PWRITE */
>
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> + if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
> retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> goto error_out;
> }
>
> if ((channel->align == 0) ||
> (IS_ALIGNED(buf, channel->align) &&
> + IS_ALIGNED(location, channel->align) &&
> IS_ALIGNED(size, channel->align))) {
> actual = write(data->dev, buf, size);
> if (actual < 0) {
> @@ -400,40 +426,59 @@ static errcode_t raw_write_blk(io_channel channel,
> * to the O_DIRECT rules, so we need to do this the hard way...
> */
> bounce_write:
> + if ((channel->block_size > channel->align) &&
> + (channel->block_size % channel->align) == 0)
> + align_size = channel->block_size;
> + else
> + align_size = channel->align;
> + aligned_blk = location / align_size;
> + offset = location % align_size;
> +
> while (size > 0) {
> + int actual_w;
> +
> mutex_lock(data, BOUNCE_MTX);
> - if (size < channel->block_size) {
> + if (size < align_size || offset) {
> + if (ext2fs_llseek(data->dev, aligned_blk * align_size,
> + SEEK_SET) < 0) {
> + retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> + goto error_out;
> + }
> actual = read(data->dev, data->bounce,
> - channel->block_size);
> - if (actual != channel->block_size) {
> + align_size);
> + if (actual != align_size) {
> if (actual < 0) {
> mutex_unlock(data, BOUNCE_MTX);
> retval = errno;
> goto error_out;
> }
> memset((char *) data->bounce + actual, 0,
> - channel->block_size - actual);
> + align_size - actual);
> }
> }
> actual = size;
> - if (size > channel->block_size)
> - actual = channel->block_size;
> - memcpy(data->bounce, buf, actual);
> - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> + if ((actual + offset) > align_size)
> + actual = align_size - offset;
> + if (actual > size)
> + actual = size;
> + memcpy(((char *)data->bounce) + offset, buf, actual);
> + if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
> retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> goto error_out;
> }
> - actual = write(data->dev, data->bounce, channel->block_size);
> + actual_w = write(data->dev, data->bounce, align_size);
> mutex_unlock(data, BOUNCE_MTX);
> - if (actual < 0) {
> + if (actual_w < 0) {
> retval = errno;
> goto error_out;
> }
> - if (actual != channel->block_size)
> + if (actual_w != align_size)
> goto short_write;
> size -= actual;
> buf += actual;
> location += actual;
> + aligned_blk++;
> + offset = 0;
> }
> return 0;
>

2021-03-01 17:47:19

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] libfs: Fix DIO mode aligment

On Mon, Mar 01, 2021 at 07:14:00PM +0300, Благодаренко Артём wrote:
> Here is how I reproduced it originally (without the patch)
>
> # truncate -s 512MB /tmp/lustre-ost
> # losetup -b 4096 /dev/loop0 /tmp/lustre-ost
> # mkfs.ext4 /dev/loop0

Thanks for pointing out that this can be tested using losetup.

> With your patch e2image works fine.
>
> [root@CO82 e2fsprogs-kernel]# git rev-parse HEAD
> 67f2b54667e65cf5a478fcea8b85722be9ee6e8d
> [root@CO82 e2fsprogs-kernel]# misc/e2image /dev/loop0 /tmp/ost-image
> e2image 1.46.1 (9-Feb-2021)

So commit 67f2b54667e6 is 1.46.2, so I'm not sure that you recompiled
e2image. More importantly, e2image in upstream doesn't use Direct
I/O. When I tried using debugfs -D, it looks like Direct I/O on an
Advanced Format HDD isn't working correctly:

# debugfs -D /dev/loop0
debugfs 1.46.2 (28-Feb-2021)
debugfs: Bad magic number in super-block while trying to open /dev/loop0
/dev/loop0 contains a ext4 file system
created on Mon Mar 1 12:31:43 2021

This isn't a regression since it was broken in upstream before, but it
would be good to get this fixed before e2fsprogs 1.46.3.

Cheers,

- Ted