2023-09-29 13:35:32

by John Garry

[permalink] [raw]
Subject: [PATCH 16/21] fs: iomap: Atomic write support

Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
bio is being created and all the rules there need to be followed.

It is the task of the FS iomap iter callbacks to ensure that the mapping
created adheres to those rules, like size is power-of-2, is at a
naturally-aligned offset, etc.

In iomap_dio_bio_iter(), ensure that for a non-dsync iocb that the mapping
is not dirty nor unmapped.

A write should only produce a single bio, so error when it doesn't.

Signed-off-by: John Garry <[email protected]>
---
fs/iomap/direct-io.c | 26 ++++++++++++++++++++++++--
fs/iomap/trace.h | 3 ++-
include/linux/iomap.h | 1 +
3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..6ef25e26f1a1 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -275,10 +275,11 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
struct iomap_dio *dio)
{
+ bool atomic_write = iter->flags & IOMAP_ATOMIC_WRITE;
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
- loff_t length = iomap_length(iter);
+ const loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
@@ -292,6 +293,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;

+ if (atomic_write && !iocb_is_dsync(dio->iocb)) {
+ if (iomap->flags & IOMAP_F_DIRTY)
+ return -EIO;
+ if (iomap->type != IOMAP_MAPPED)
+ return -EIO;
+ }
+
if (iomap->type == IOMAP_UNWRITTEN) {
dio->flags |= IOMAP_DIO_UNWRITTEN;
need_zeroout = true;
@@ -381,6 +389,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_ioprio = dio->iocb->ki_ioprio;
+ if (atomic_write)
+ bio->bi_opf |= REQ_ATOMIC;
+
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;

@@ -397,6 +408,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}

n = bio->bi_iter.bi_size;
+ if (atomic_write && n != length) {
+ /* This bio should have covered the complete length */
+ ret = -EINVAL;
+ bio_put(bio);
+ goto out;
+ }
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@@ -554,6 +571,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
struct blk_plug plug;
struct iomap_dio *dio;
loff_t ret = 0;
+ bool is_read = iov_iter_rw(iter) == READ;
+ bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;

trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);

@@ -579,7 +598,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;

- if (iov_iter_rw(iter) == READ) {
+ if (is_read) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;

@@ -605,6 +624,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
dio->flags |= IOMAP_DIO_CALLER_COMP;

+ if (atomic_write)
+ iomi.flags |= IOMAP_ATOMIC_WRITE;
+
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..f9932733c180 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
- { IOMAP_NOWAIT, "NOWAIT" }
+ { IOMAP_NOWAIT, "NOWAIT" }, \
+ { IOMAP_ATOMIC_WRITE, "ATOMIC" }

#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..5138cede54fc 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC_WRITE (1 << 9)

struct iomap_ops {
/*
--
2.31.1


2023-10-03 04:24:40

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH 16/21] fs: iomap: Atomic write support

On Fri, Sep 29, 2023 at 10:27:21AM +0000, John Garry wrote:
> Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
> bio is being created and all the rules there need to be followed.
>
> It is the task of the FS iomap iter callbacks to ensure that the mapping
> created adheres to those rules, like size is power-of-2, is at a
> naturally-aligned offset, etc.

The mapping being returned by the filesystem can span a much greater
range than the actual IO needs - the iomap itself is not guaranteed
to be aligned to anything in particular, but the IO location within
that map can still conform to atomic IO constraints. See how
iomap_sector() calculates the actual LBA address of the IO from
the iomap and the current file position the IO is being done at.

Hence I think saying "the filesystem should make sure all IO
alignment adheres to atomic IO rules" is probably wrong. The iomap
layer doesn't care what the filesystem does, all it cares about is
whether the IO can be done given the extent map that was returned to
it.

Indeed, iomap_dio_bio_iter() is doing all these alignment checks for
normal DIO reads and writes which must be logical block sized
aligned. i.e. this check:

if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;

Hence I think that atomic IO units, which are similarly defined by
the bdev, should be checked at the iomap layer, too. e.g, by
following up with:

if ((dio->iocb->ki_flags & IOCB_ATOMIC) &&
((pos | length) & (bdev_atomic_unit_min(iomap->bdev) - 1) ||
!bdev_iter_is_atomic_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;

At this point, filesystems don't really need to know anything about
atomic IO - if they've allocated a large contiguous extent (e.g. via
fallocate()), then RWF_ATOMIC will just work for the cases where the
block device supports it...

This then means that stuff like XFS extent size hints only need to
check when the hint is set that it is aligned to the underlying
device atomic IO constraints. Then when it sees the IOMAP_ATOMIC
modifier, it can fail allocation if it can't get extent size hint
aligned allocation.

IOWs, I'm starting to think this doesn't need any change to the
on-disk format for XFS - it can be driven entirely through two
dynamic mechanisms:

1. (IOMAP_WRITE | IOMAP_ATOMIC) requests from the direct IO layer
which causes mapping/allocation to fail if it can't allocate (or
map) atomic IO compatible extents for the IO.

2. FALLOC_FL_ATOMIC preallocation flag modifier to tell fallocate()
to force alignment of all preallocated extents to atomic IO
constraints.

This doesn't require extent size hints at all. The filesystem can
query the bdev at mount time, store the min/max atomic write sizes,
and then use them for all requests that have _ATOMIC modifiers set
on them.

With iomap doing the same "get the atomic constraints from the bdev"
style lookups for per-IO file offset and size checking, I don't
think we actually need extent size hints or an on-disk flag to force
extent size hint alignment.

That doesn't mean extent size hints can't be used - it just means
that extent size hints have to be constrained to being aligned to
atomic IOs (e.g. extent size hint must be an integer multiple of the
max atomic IO size). This then acts as a modifier for _ATOMIC
context allocations, much like it is a modifier for normal
allocations now.

> In iomap_dio_bio_iter(), ensure that for a non-dsync iocb that the mapping
> is not dirty nor unmapped.
>
> A write should only produce a single bio, so error when it doesn't.

I comment on both these things below.

>
> Signed-off-by: John Garry <[email protected]>
> ---
> fs/iomap/direct-io.c | 26 ++++++++++++++++++++++++--
> fs/iomap/trace.h | 3 ++-
> include/linux/iomap.h | 1 +
> 3 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index bcd3f8cf5ea4..6ef25e26f1a1 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -275,10 +275,11 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> struct iomap_dio *dio)
> {
> + bool atomic_write = iter->flags & IOMAP_ATOMIC_WRITE;
> const struct iomap *iomap = &iter->iomap;
> struct inode *inode = iter->inode;
> unsigned int fs_block_size = i_blocksize(inode), pad;
> - loff_t length = iomap_length(iter);
> + const loff_t length = iomap_length(iter);
> loff_t pos = iter->pos;
> blk_opf_t bio_opf;
> struct bio *bio;
> @@ -292,6 +293,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> return -EINVAL;
>
> + if (atomic_write && !iocb_is_dsync(dio->iocb)) {
> + if (iomap->flags & IOMAP_F_DIRTY)
> + return -EIO;
> + if (iomap->type != IOMAP_MAPPED)
> + return -EIO;
> + }

How do we get here without space having been allocated for the
write?

Perhaps what this is trying to do is make RWF_ATOMIC only be valid
into written space? I mean, this will fail with preallocated space
(IOMAP_UNWRITTEN) even though we still have exactly the RWF_ATOMIC
all-or-nothing behaviour guaranteed after a crash because of journal
recovery behaviour. i.e. if the unwritten conversion gets written to
the journal, the data will be there. If it isn't written to the
journal, then the space remains unwritten and there's no data across
that entire range....

So I'm not really sure that either of these checks are valid or why
they are actually needed....

> +
> if (iomap->type == IOMAP_UNWRITTEN) {
> dio->flags |= IOMAP_DIO_UNWRITTEN;
> need_zeroout = true;
> @@ -381,6 +389,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> GFP_KERNEL);
> bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
> bio->bi_ioprio = dio->iocb->ki_ioprio;
> + if (atomic_write)
> + bio->bi_opf |= REQ_ATOMIC;
> +
> bio->bi_private = dio;
> bio->bi_end_io = iomap_dio_bio_end_io;
>
> @@ -397,6 +408,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> }
>
> n = bio->bi_iter.bi_size;
> + if (atomic_write && n != length) {
> + /* This bio should have covered the complete length */
> + ret = -EINVAL;
> + bio_put(bio);
> + goto out;

Why? The actual bio can be any length that meets the aligned
criteria between min and max, yes? So it's valid to split a
RWF_ATOMIC write request up into multiple min unit sized bios, is it
not? I mean, that's the whole point of the min/max unit setup, isn't
it? That the max sized write only guarantees that it will tear at
min unit boundaries, not within those min unit boundaries? If
I've understood this correctly, then why does this "single bio for
large atomic write" constraint need to exist?


> + }
> if (dio->flags & IOMAP_DIO_WRITE) {
> task_io_account_write(n);
> } else {
> @@ -554,6 +571,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> struct blk_plug plug;
> struct iomap_dio *dio;
> loff_t ret = 0;
> + bool is_read = iov_iter_rw(iter) == READ;
> + bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;

This does not need to be done here, because....

>
> trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
>
> @@ -579,7 +598,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> if (iocb->ki_flags & IOCB_NOWAIT)
> iomi.flags |= IOMAP_NOWAIT;
>
> - if (iov_iter_rw(iter) == READ) {
> + if (is_read) {
> /* reads can always complete inline */
> dio->flags |= IOMAP_DIO_INLINE_COMP;
>
> @@ -605,6 +624,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
> dio->flags |= IOMAP_DIO_CALLER_COMP;
>
> + if (atomic_write)
> + iomi.flags |= IOMAP_ATOMIC_WRITE;

.... it is only checked once in the write path, so

if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC;

> +
> if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
> ret = -EAGAIN;
> if (iomi.pos >= dio->i_size ||
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index c16fd55f5595..f9932733c180 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
> { IOMAP_REPORT, "REPORT" }, \
> { IOMAP_FAULT, "FAULT" }, \
> { IOMAP_DIRECT, "DIRECT" }, \
> - { IOMAP_NOWAIT, "NOWAIT" }
> + { IOMAP_NOWAIT, "NOWAIT" }, \
> + { IOMAP_ATOMIC_WRITE, "ATOMIC" }

We already have an IOMAP_WRITE flag, so IOMAP_ATOMIC is the modifier
for the write IO behaviour (like NOWAIT), not a replacement write
flag.

-Dave.
--
Dave Chinner
[email protected]

2023-10-03 12:56:34

by John Garry

[permalink] [raw]
Subject: Re: [PATCH 16/21] fs: iomap: Atomic write support

On 03/10/2023 05:24, Dave Chinner wrote:
> On Fri, Sep 29, 2023 at 10:27:21AM +0000, John Garry wrote:
>> Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
>> bio is being created and all the rules there need to be followed.
>>
>> It is the task of the FS iomap iter callbacks to ensure that the mapping
>> created adheres to those rules, like size is power-of-2, is at a
>> naturally-aligned offset, etc.
>
> The mapping being returned by the filesystem can span a much greater
> range than the actual IO needs - the iomap itself is not guaranteed
> to be aligned to anything in particular, but the IO location within
> that map can still conform to atomic IO constraints. See how
> iomap_sector() calculates the actual LBA address of the IO from
> the iomap and the current file position the IO is being done at.

I see, but I was working on the basis that the filesystem produces an
iomap which itself conforms to all the rules. And that is because the
atomic write unit min and max for the file depend on the extent
alignment, which only the filesystem is aware of.

>
> Hence I think saying "the filesystem should make sure all IO
> alignment adheres to atomic IO rules" is probably wrong. The iomap
> layer doesn't care what the filesystem does, all it cares about is
> whether the IO can be done given the extent map that was returned to
> it.
>
> Indeed, iomap_dio_bio_iter() is doing all these alignment checks for
> normal DIO reads and writes which must be logical block sized
> aligned. i.e. this check:
>
> if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
> !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> return -EINVAL;
>
> Hence I think that atomic IO units, which are similarly defined by
> the bdev, should be checked at the iomap layer, too. e.g, by
> following up with:
>
> if ((dio->iocb->ki_flags & IOCB_ATOMIC) &&
> ((pos | length) & (bdev_atomic_unit_min(iomap->bdev) - 1) ||
> !bdev_iter_is_atomic_aligned(iomap->bdev, dio->submit.iter))
> return -EINVAL;

Seems ok for at least enforcing alignment for the bdev. Again,
filesystem extent alignment is my concern.

>
> At this point, filesystems don't really need to know anything about
> atomic IO - if they've allocated a large contiguous extent (e.g. via
> fallocate()), then RWF_ATOMIC will just work for the cases where the
> block device supports it...
>
> This then means that stuff like XFS extent size hints only need to
> check when the hint is set that it is aligned to the underlying
> device atomic IO constraints. Then when it sees the IOMAP_ATOMIC
> modifier, it can fail allocation if it can't get extent size hint
> aligned allocation.

I am not sure what you mean by allocation in this context. I assume that
fallocate allocates the extents, but they remain unwritten. So if we
then dd into that file to zero it or init it any other way, they become
written and the extent size hint or bdev atomic write constraints would
be just ignored then.

BTW, if you remember, we did propose an XFS fallocate extension for
extent alignment in the initial RFC, but decided to drop it.

>
> IOWs, I'm starting to think this doesn't need any change to the
> on-disk format for XFS - it can be driven entirely through two
> dynamic mechanisms:
>
> 1. (IOMAP_WRITE | IOMAP_ATOMIC) requests from the direct IO layer
> which causes mapping/allocation to fail if it can't allocate (or
> map) atomic IO compatible extents for the IO.
>
> 2. FALLOC_FL_ATOMIC preallocation flag modifier to tell fallocate()
> to force alignment of all preallocated extents to atomic IO
> constraints.

Would that be a sticky flag? What stops the extents mutating before the
atomic write?

>
> This doesn't require extent size hints at all. The filesystem can
> query the bdev at mount time, store the min/max atomic write sizes,
> and then use them for all requests that have _ATOMIC modifiers set
> on them.

A drawback is that the storage device may support atomic write unit max
much bigger than the user requires and cause inefficient alignment, e.g.
bdev atomic write unit max = 1M, and we only ever want 8KB atomic
writes. But you are mentioning extent size hints can be paid attention
to, below.

>
> With iomap doing the same "get the atomic constraints from the bdev"
> style lookups for per-IO file offset and size checking, I don't
> think we actually need extent size hints or an on-disk flag to force
> extent size hint alignment.
>
> That doesn't mean extent size hints can't be used - it just means
> that extent size hints have to be constrained to being aligned to
> atomic IOs (e.g. extent size hint must be an integer multiple of the
> max atomic IO size).

Yeah, well I think that we already agreed something like this.

> This then acts as a modifier for _ATOMIC
> context allocations, much like it is a modifier for normal
> allocations now.
>
>> In iomap_dio_bio_iter(), ensure that for a non-dsync iocb that the mapping
>> is not dirty nor unmapped.
>>
>> A write should only produce a single bio, so error when it doesn't.
>
> I comment on both these things below.
>
>>
>> Signed-off-by: John Garry <[email protected]>
>> ---
>> fs/iomap/direct-io.c | 26 ++++++++++++++++++++++++--
>> fs/iomap/trace.h | 3 ++-
>> include/linux/iomap.h | 1 +
>> 3 files changed, 27 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index bcd3f8cf5ea4..6ef25e26f1a1 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -275,10 +275,11 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>> static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>> struct iomap_dio *dio)
>> {
>> + bool atomic_write = iter->flags & IOMAP_ATOMIC_WRITE;
>> const struct iomap *iomap = &iter->iomap;
>> struct inode *inode = iter->inode;
>> unsigned int fs_block_size = i_blocksize(inode), pad;
>> - loff_t length = iomap_length(iter);
>> + const loff_t length = iomap_length(iter);
>> loff_t pos = iter->pos;
>> blk_opf_t bio_opf;
>> struct bio *bio;
>> @@ -292,6 +293,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>> !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
>> return -EINVAL;
>>
>> + if (atomic_write && !iocb_is_dsync(dio->iocb)) {
>> + if (iomap->flags & IOMAP_F_DIRTY)
>> + return -EIO;
>> + if (iomap->type != IOMAP_MAPPED)
>> + return -EIO;
>> + }
>
> How do we get here without space having been allocated for the
> write?

I don't think that we can, but we are checking that the space is also
written.

>
> Perhaps what this is trying to do is make RWF_ATOMIC only be valid
> into written space?

Yes, and we now detail this in the man pages.

> I mean, this will fail with preallocated space
> (IOMAP_UNWRITTEN) even though we still have exactly the RWF_ATOMIC
> all-or-nothing behaviour guaranteed after a crash because of journal
> recovery behaviour. i.e. if the unwritten conversion gets written to
> the journal, the data will be there. If it isn't written to the
> journal, then the space remains unwritten and there's no data across
> that entire range....
>
> So I'm not really sure that either of these checks are valid or why
> they are actually needed....

I think that the idea is that the space is already written and the
metadata for the space is persisted or going to be. Darrick guided me on
this, so hopefully can comment more.

>
>> +
>> if (iomap->type == IOMAP_UNWRITTEN) {
>> dio->flags |= IOMAP_DIO_UNWRITTEN;
>> need_zeroout = true;
>> @@ -381,6 +389,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>> GFP_KERNEL);
>> bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>> bio->bi_ioprio = dio->iocb->ki_ioprio;
>> + if (atomic_write)
>> + bio->bi_opf |= REQ_ATOMIC;
>> +
>> bio->bi_private = dio;
>> bio->bi_end_io = iomap_dio_bio_end_io;
>>
>> @@ -397,6 +408,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>> }
>>
>> n = bio->bi_iter.bi_size;
>> + if (atomic_write && n != length) {
>> + /* This bio should have covered the complete length */
>> + ret = -EINVAL;
>> + bio_put(bio);
>> + goto out;
>
> Why? The actual bio can be any length that meets the aligned
> criteria between min and max, yes?

The write also needs to be a power-of-2 in length. atomic write min and
max will always be a power-of-2.

> So it's valid to split a
> RWF_ATOMIC write request up into multiple min unit sized bios, is it
> not?

It is not. In the RFC we sent in May there was a scheme to break up the
atomic write into multiple userspace block-sized bios, but that is no
longer supported.

Now an atomic write only produces a single bio. So userspace may do a
16KB atomic write, for example, and we only ever issue that as a single
16KB operation to the storage device.

> I mean, that's the whole point of the min/max unit setup, isn't
> it?

The point of min/max is to ensure that userspace executes an atomic
write which is guaranteed to be only ever issued as a single write to
the storage device. In addition, the length and position for that write
conforms to the storage device atomic write constraints.

> That the max sized write only guarantees that it will tear at
> min unit boundaries, not within those min unit boundaries?

There is no tearing. As mentioned, the RFC in May did support some
splitting but we decided to drop it.

> If
> I've understood this correctly, then why does this "single bio for
> large atomic write" constraint need to exist?

An atomic write means that a write will never be torn.

>
>
>> + }
>> if (dio->flags & IOMAP_DIO_WRITE) {
>> task_io_account_write(n);
>> } else {
>> @@ -554,6 +571,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>> struct blk_plug plug;
>> struct iomap_dio *dio;
>> loff_t ret = 0;
>> + bool is_read = iov_iter_rw(iter) == READ;
>> + bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;
>
> This does not need to be done here, because....
>
>>
>> trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
>>
>> @@ -579,7 +598,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>> if (iocb->ki_flags & IOCB_NOWAIT)
>> iomi.flags |= IOMAP_NOWAIT;
>>
>> - if (iov_iter_rw(iter) == READ) {
>> + if (is_read) {
>> /* reads can always complete inline */
>> dio->flags |= IOMAP_DIO_INLINE_COMP;
>>
>> @@ -605,6 +624,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>> if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
>> dio->flags |= IOMAP_DIO_CALLER_COMP;
>>
>> + if (atomic_write)
>> + iomi.flags |= IOMAP_ATOMIC_WRITE;
>
> .... it is only checked once in the write path, so

ok

>
> if (iocb->ki_flags & IOCB_ATOMIC)
> iomi.flags |= IOMAP_ATOMIC;
>
>> +
>> if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
>> ret = -EAGAIN;
>> if (iomi.pos >= dio->i_size ||
>> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
>> index c16fd55f5595..f9932733c180 100644
>> --- a/fs/iomap/trace.h
>> +++ b/fs/iomap/trace.h
>> @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
>> { IOMAP_REPORT, "REPORT" }, \
>> { IOMAP_FAULT, "FAULT" }, \
>> { IOMAP_DIRECT, "DIRECT" }, \
>> - { IOMAP_NOWAIT, "NOWAIT" }
>> + { IOMAP_NOWAIT, "NOWAIT" }, \
>> + { IOMAP_ATOMIC_WRITE, "ATOMIC" }
>
> We already have an IOMAP_WRITE flag, so IOMAP_ATOMIC is the modifier
> for the write IO behaviour (like NOWAIT), not a replacement write
> flag.

The name IOMAP_ATOMIC_WRITE is the issue then. The iomap trace still
just has "ATOMIC" as the trace modifier.

Thanks,
John

2023-10-03 16:48:27

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 16/21] fs: iomap: Atomic write support

On Tue, Oct 03, 2023 at 03:24:23PM +1100, Dave Chinner wrote:
> On Fri, Sep 29, 2023 at 10:27:21AM +0000, John Garry wrote:
> > Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
> > bio is being created and all the rules there need to be followed.
> >
> > It is the task of the FS iomap iter callbacks to ensure that the mapping
> > created adheres to those rules, like size is power-of-2, is at a
> > naturally-aligned offset, etc.
>
> The mapping being returned by the filesystem can span a much greater
> range than the actual IO needs - the iomap itself is not guaranteed
> to be aligned to anything in particular, but the IO location within
> that map can still conform to atomic IO constraints. See how
> iomap_sector() calculates the actual LBA address of the IO from
> the iomap and the current file position the IO is being done at.
>
> Hence I think saying "the filesystem should make sure all IO
> alignment adheres to atomic IO rules" is probably wrong. The iomap
> layer doesn't care what the filesystem does, all it cares about is
> whether the IO can be done given the extent map that was returned to
> it.
>
> Indeed, iomap_dio_bio_iter() is doing all these alignment checks for
> normal DIO reads and writes which must be logical block sized
> aligned. i.e. this check:
>
> if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
> !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> return -EINVAL;
>
> Hence I think that atomic IO units, which are similarly defined by
> the bdev, should be checked at the iomap layer, too. e.g, by
> following up with:
>
> if ((dio->iocb->ki_flags & IOCB_ATOMIC) &&
> ((pos | length) & (bdev_atomic_unit_min(iomap->bdev) - 1) ||
> !bdev_iter_is_atomic_aligned(iomap->bdev, dio->submit.iter))
> return -EINVAL;
>
> At this point, filesystems don't really need to know anything about
> atomic IO - if they've allocated a large contiguous extent (e.g. via
> fallocate()), then RWF_ATOMIC will just work for the cases where the
> block device supports it...
>
> This then means that stuff like XFS extent size hints only need to
> check when the hint is set that it is aligned to the underlying
> device atomic IO constraints. Then when it sees the IOMAP_ATOMIC
> modifier, it can fail allocation if it can't get extent size hint
> aligned allocation.
>
> IOWs, I'm starting to think this doesn't need any change to the
> on-disk format for XFS - it can be driven entirely through two
> dynamic mechanisms:
>
> 1. (IOMAP_WRITE | IOMAP_ATOMIC) requests from the direct IO layer
> which causes mapping/allocation to fail if it can't allocate (or
> map) atomic IO compatible extents for the IO.
>
> 2. FALLOC_FL_ATOMIC preallocation flag modifier to tell fallocate()
> to force alignment of all preallocated extents to atomic IO
> constraints.

Ugh, let's not relitigate problems that you (Dave) and I have already
solved.

Back in 2018, our internal proto-users of pmem asked for aligned
allocations so they could use PMD mappings to reduce TLB pressure. At
the time, you and I talked on IRC about whether that should be done via
fallocate flag or setting extszinherit+sunit at mkfs time. We decided
against adding fallocate flags because linux-api bikeshed hell.

Ever since, we've been shipping UEK with a mkfs.xmem scripts that
automates computing the mkfs.xfs geometry CLI options. It works,
mostly, except for the unaligned allocations that one gets when the free
space gets fragmented. The xfsprogs side of the forcealign patchset
moves most of the mkfs.xmem cli option setting logic into mkfs itself,
and the kernel side shuts off the lowspace allocator to fix the
fragmentation problem.

I'd rather fix the remaining quirks and not reinvent solved solutions,
as popular as that is in programming circles.

Why is mandatory allocation alignment for atomic writes different?
Forcealign solves the problem for NVME/SCSI AWU and pmem PMD in the same
way with the same control knobs for sysadmins. I don't want to have
totally separate playbooks for accomplishing nearly the same things.

I don't like encoding hardware details in the fallocate uapi either.
That implies adding FALLOC_FL_HUGEPAGE for pmem, and possibly
FALLOC_FL_{SUNIT,SWIDTH} for users with RAIDs.

> This doesn't require extent size hints at all. The filesystem can
> query the bdev at mount time, store the min/max atomic write sizes,
> and then use them for all requests that have _ATOMIC modifiers set
> on them.
>
> With iomap doing the same "get the atomic constraints from the bdev"
> style lookups for per-IO file offset and size checking, I don't
> think we actually need extent size hints or an on-disk flag to force
> extent size hint alignment.
>
> That doesn't mean extent size hints can't be used - it just means
> that extent size hints have to be constrained to being aligned to
> atomic IOs (e.g. extent size hint must be an integer multiple of the
> max atomic IO size). This then acts as a modifier for _ATOMIC
> context allocations, much like it is a modifier for normal
> allocations now.

(One behavior change that comes with FORCEALIGN is that without it,
extent size hints affect only the alignment of the file range mappings.
With FORCEALIGN, the space allocation itself *and* the mapping are
aligned.)

The one big downside of FORCEALIGN is that the extent size hint can
become misaligned with the AWU (or pagetable) geometry if the fs is
moved to a different computing environment. I prefer not to couple the
interface to the hardware because that leaves open the possibility for
users to discover more use cases.

>
> > In iomap_dio_bio_iter(), ensure that for a non-dsync iocb that the mapping
> > is not dirty nor unmapped.
> >
> > A write should only produce a single bio, so error when it doesn't.
>
> I comment on both these things below.
>
> >
> > Signed-off-by: John Garry <[email protected]>
> > ---
> > fs/iomap/direct-io.c | 26 ++++++++++++++++++++++++--
> > fs/iomap/trace.h | 3 ++-
> > include/linux/iomap.h | 1 +
> > 3 files changed, 27 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> > index bcd3f8cf5ea4..6ef25e26f1a1 100644
> > --- a/fs/iomap/direct-io.c
> > +++ b/fs/iomap/direct-io.c
> > @@ -275,10 +275,11 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> > static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > struct iomap_dio *dio)
> > {
> > + bool atomic_write = iter->flags & IOMAP_ATOMIC_WRITE;
> > const struct iomap *iomap = &iter->iomap;
> > struct inode *inode = iter->inode;
> > unsigned int fs_block_size = i_blocksize(inode), pad;
> > - loff_t length = iomap_length(iter);
> > + const loff_t length = iomap_length(iter);
> > loff_t pos = iter->pos;
> > blk_opf_t bio_opf;
> > struct bio *bio;
> > @@ -292,6 +293,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> > return -EINVAL;
> >
> > + if (atomic_write && !iocb_is_dsync(dio->iocb)) {
> > + if (iomap->flags & IOMAP_F_DIRTY)
> > + return -EIO;
> > + if (iomap->type != IOMAP_MAPPED)
> > + return -EIO;
> > + }
>
> How do we get here without space having been allocated for the
> write?
>
> Perhaps what this is trying to do is make RWF_ATOMIC only be valid
> into written space? I mean, this will fail with preallocated space
> (IOMAP_UNWRITTEN) even though we still have exactly the RWF_ATOMIC
> all-or-nothing behaviour guaranteed after a crash because of journal
> recovery behaviour. i.e. if the unwritten conversion gets written to
> the journal, the data will be there. If it isn't written to the
> journal, then the space remains unwritten and there's no data across
> that entire range....
>
> So I'm not really sure that either of these checks are valid or why
> they are actually needed....

This requires O_DSYNC (or RWF_DSYNC) for atomic writes to unwritten or
COW space. We want failures in forcing the log transactions for the
endio processing to be reported to the pwrite caller as EIO, right?

--D

> > +
> > if (iomap->type == IOMAP_UNWRITTEN) {
> > dio->flags |= IOMAP_DIO_UNWRITTEN;
> > need_zeroout = true;
> > @@ -381,6 +389,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > GFP_KERNEL);
> > bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
> > bio->bi_ioprio = dio->iocb->ki_ioprio;
> > + if (atomic_write)
> > + bio->bi_opf |= REQ_ATOMIC;
> > +
> > bio->bi_private = dio;
> > bio->bi_end_io = iomap_dio_bio_end_io;
> >
> > @@ -397,6 +408,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > }
> >
> > n = bio->bi_iter.bi_size;
> > + if (atomic_write && n != length) {
> > + /* This bio should have covered the complete length */
> > + ret = -EINVAL;
> > + bio_put(bio);
> > + goto out;
>
> Why? The actual bio can be any length that meets the aligned
> criteria between min and max, yes? So it's valid to split a
> RWF_ATOMIC write request up into multiple min unit sized bios, is it
> not? I mean, that's the whole point of the min/max unit setup, isn't
> it? That the max sized write only guarantees that it will tear at
> min unit boundaries, not within those min unit boundaries? If
> I've understood this correctly, then why does this "single bio for
> large atomic write" constraint need to exist?
>
>
> > + }
> > if (dio->flags & IOMAP_DIO_WRITE) {
> > task_io_account_write(n);
> > } else {
> > @@ -554,6 +571,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> > struct blk_plug plug;
> > struct iomap_dio *dio;
> > loff_t ret = 0;
> > + bool is_read = iov_iter_rw(iter) == READ;
> > + bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;
>
> This does not need to be done here, because....
>
> >
> > trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
> >
> > @@ -579,7 +598,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> > if (iocb->ki_flags & IOCB_NOWAIT)
> > iomi.flags |= IOMAP_NOWAIT;
> >
> > - if (iov_iter_rw(iter) == READ) {
> > + if (is_read) {
> > /* reads can always complete inline */
> > dio->flags |= IOMAP_DIO_INLINE_COMP;
> >
> > @@ -605,6 +624,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> > if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
> > dio->flags |= IOMAP_DIO_CALLER_COMP;
> >
> > + if (atomic_write)
> > + iomi.flags |= IOMAP_ATOMIC_WRITE;
>
> .... it is only checked once in the write path, so
>
> if (iocb->ki_flags & IOCB_ATOMIC)
> iomi.flags |= IOMAP_ATOMIC;
>
> > +
> > if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
> > ret = -EAGAIN;
> > if (iomi.pos >= dio->i_size ||
> > diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> > index c16fd55f5595..f9932733c180 100644
> > --- a/fs/iomap/trace.h
> > +++ b/fs/iomap/trace.h
> > @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
> > { IOMAP_REPORT, "REPORT" }, \
> > { IOMAP_FAULT, "FAULT" }, \
> > { IOMAP_DIRECT, "DIRECT" }, \
> > - { IOMAP_NOWAIT, "NOWAIT" }
> > + { IOMAP_NOWAIT, "NOWAIT" }, \
> > + { IOMAP_ATOMIC_WRITE, "ATOMIC" }
>
> We already have an IOMAP_WRITE flag, so IOMAP_ATOMIC is the modifier
> for the write IO behaviour (like NOWAIT), not a replacement write
> flag.
>
> -Dave.
> --
> Dave Chinner
> [email protected]

2023-10-04 01:16:37

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH 16/21] fs: iomap: Atomic write support

On Tue, Oct 03, 2023 at 09:47:49AM -0700, Darrick J. Wong wrote:
> On Tue, Oct 03, 2023 at 03:24:23PM +1100, Dave Chinner wrote:
> > On Fri, Sep 29, 2023 at 10:27:21AM +0000, John Garry wrote:
> > > Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
> > > bio is being created and all the rules there need to be followed.
> > >
> > > It is the task of the FS iomap iter callbacks to ensure that the mapping
> > > created adheres to those rules, like size is power-of-2, is at a
> > > naturally-aligned offset, etc.
> >
> > The mapping being returned by the filesystem can span a much greater
> > range than the actual IO needs - the iomap itself is not guaranteed
> > to be aligned to anything in particular, but the IO location within
> > that map can still conform to atomic IO constraints. See how
> > iomap_sector() calculates the actual LBA address of the IO from
> > the iomap and the current file position the IO is being done at.
> >
hence I think saying "the filesystem should make sure all IO
alignment adheres to atomic IO rules" is probably wrong. The iomap
> > layer doesn't care what the filesystem does, all it cares about is
> > whether the IO can be done given the extent map that was returned to
> > it.
> >
> > Indeed, iomap_dio_bio_iter() is doing all these alignment checks for
> > normal DIO reads and writes which must be logical block sized
> > aligned. i.e. this check:
> >
> > if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
> > !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> > return -EINVAL;
> >
> > Hence I think that atomic IO units, which are similarly defined by
> > the bdev, should be checked at the iomap layer, too. e.g, by
> > following up with:
> >
> > if ((dio->iocb->ki_flags & IOCB_ATOMIC) &&
> > ((pos | length) & (bdev_atomic_unit_min(iomap->bdev) - 1) ||
> > !bdev_iter_is_atomic_aligned(iomap->bdev, dio->submit.iter))
> > return -EINVAL;
> >
> > At this point, filesystems don't really need to know anything about
> > atomic IO - if they've allocated a large contiguous extent (e.g. via
> > fallocate()), then RWF_ATOMIC will just work for the cases where the
> > block device supports it...
> >
> > This then means that stuff like XFS extent size hints only need to
> > check when the hint is set that it is aligned to the underlying
> > device atomic IO constraints. Then when it sees the IOMAP_ATOMIC
> > modifier, it can fail allocation if it can't get extent size hint
> > aligned allocation.
> >
> > IOWs, I'm starting to think this doesn't need any change to the
> > on-disk format for XFS - it can be driven entirely through two
> > dynamic mechanisms:
> >
> > 1. (IOMAP_WRITE | IOMAP_ATOMIC) requests from the direct IO layer
> > which causes mapping/allocation to fail if it can't allocate (or
> > map) atomic IO compatible extents for the IO.
> >
> > 2. FALLOC_FL_ATOMIC preallocation flag modifier to tell fallocate()
> > to force alignment of all preallocated extents to atomic IO
> > constraints.
>
> Ugh, let's not relitigate problems that you (Dave) and I have already
> solved.
>
> Back in 2018, our internal proto-users of pmem asked for aligned
> allocations so they could use PMD mappings to reduce TLB pressure. At
> the time, you and I talked on IRC about whether that should be done via
> fallocate flag or setting extszinherit+sunit at mkfs time. We decided
> against adding fallocate flags because linux-api bikeshed hell.

Ok, but I don't see how I'm supposed to correlate a discussion from
5 years ago on a different topic with this one. I can only comment
on what I see in front of me. And what is in front of me is
something that doesn't need on-disk changes to implement....

> Ever since, we've been shipping UEK with a mkfs.xmem scripts that
> automates computing the mkfs.xfs geometry CLI options. It works,
> mostly, except for the unaligned allocations that one gets when the free
> space gets fragmented. The xfsprogs side of the forcealign patchset
> moves most of the mkfs.xmem cli option setting logic into mkfs itself,
> and the kernel side shuts off the lowspace allocator to fix the
> fragmentation problem.
>
> I'd rather fix the remaining quirks and not reinvent solved solutions,
> as popular as that is in programming circles.
>
> Why is mandatory allocation alignment for atomic writes different?
> Forcealign solves the problem for NVME/SCSI AWU and pmem PMD in the same
> way with the same control knobs for sysadmins. I don't want to have
> totally separate playbooks for accomplishing nearly the same things.

Which is fair enough, but that's not the context under which this
has been presented.

Can we please get the forced-align stuff separated from atomic write
support - the atomic write requirements completely overwhelm the
small amount of change needed to support physical file offset
alignment....

> I don't like encoding hardware details in the fallocate uapi either.
> That implies adding FALLOC_FL_HUGEPAGE for pmem, and possibly
> FALLOC_FL_{SUNIT,SWIDTH} for users with RAIDs.

No, that's reading way too much into it. FALLOC_FL_ATOMIC would mean
"ensure preallocation is valid for RWF_ATOMIC based IO constraints",
nothing more, nothing less. This isn't -hardware specific-, it's
simply a flag to tell the filesystem to align file offsets to
physical storage constraints so the allocated space works
appropriately for a specific IO API.

IOWs, it is little different from the FALLOC_FL_NOHIDE_STALE flag
for modifying fallocate() behaviour...

> > This doesn't require extent size hints at all. The filesystem can
> > query the bdev at mount time, store the min/max atomic write sizes,
> > and then use them for all requests that have _ATOMIC modifiers set
> > on them.
> >
> > With iomap doing the same "get the atomic constraints from the bdev"
> > style lookups for per-IO file offset and size checking, I don't
> > think we actually need extent size hints or an on-disk flag to force
> > extent size hint alignment.
> >
> > That doesn't mean extent size hints can't be used - it just means
> > that extent size hints have to be constrained to being aligned to
> > atomic IOs (e.g. extent size hint must be an integer multiple of the
> > max atomic IO size). This then acts as a modifier for _ATOMIC
> > context allocations, much like it is a modifier for normal
> > allocations now.
>
> (One behavior change that comes with FORCEALIGN is that without it,
> extent size hints affect only the alignment of the file range mappings.
> With FORCEALIGN, the space allocation itself *and* the mapping are
> aligned.)
>
> The one big downside of FORCEALIGN is that the extent size hint can
> become misaligned with the AWU (or pagetable) geometry if the fs is
> moved to a different computing environment. I prefer not to couple the
> interface to the hardware because that leaves open the possibility for
> users to discover more use cases.

Sure, but this isn't really a "forced" alignment. This is a feature
that is providing "file offset is physically aligned to an
underlying hardware address space" instead of doing the normal thing
of abstracting file data away from the physical layout of the
storage.

If we can have user APIs that say "file data should be physically
aligned to storage" then we don't need on-disk flags to implement
this. Extent size hints could still be used to indicate the required
alignment, but we could also pull it straight from the hardware if
those aren't set. AFAICT only fallocate() and pwritev2() need these
flags for IO, but we could add a fadvise() command to set it on a
struct file, if mmap()/madvise is told to use hugepages we can use
PMD alignment rather than storage hardware alignment, etc.

IOWs actually having APIs that simply say "use physical offset
alignment" without actually saying exactly which hardware alignment
they want allows the filesystem to dynamically select the optimal
alignment for the given application use case rather than requiring
the admin to set up specific configuration at mkfs time....



> > >
> > > Signed-off-by: John Garry <[email protected]>
> > > ---
> > > fs/iomap/direct-io.c | 26 ++++++++++++++++++++++++--
> > > fs/iomap/trace.h | 3 ++-
> > > include/linux/iomap.h | 1 +
> > > 3 files changed, 27 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> > > index bcd3f8cf5ea4..6ef25e26f1a1 100644
> > > --- a/fs/iomap/direct-io.c
> > > +++ b/fs/iomap/direct-io.c
> > > @@ -275,10 +275,11 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> > > static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > > struct iomap_dio *dio)
> > > {
> > > + bool atomic_write = iter->flags & IOMAP_ATOMIC_WRITE;
> > > const struct iomap *iomap = &iter->iomap;
> > > struct inode *inode = iter->inode;
> > > unsigned int fs_block_size = i_blocksize(inode), pad;
> > > - loff_t length = iomap_length(iter);
> > > + const loff_t length = iomap_length(iter);
> > > loff_t pos = iter->pos;
> > > blk_opf_t bio_opf;
> > > struct bio *bio;
> > > @@ -292,6 +293,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > > !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> > > return -EINVAL;
> > >
> > > + if (atomic_write && !iocb_is_dsync(dio->iocb)) {
> > > + if (iomap->flags & IOMAP_F_DIRTY)
> > > + return -EIO;
> > > + if (iomap->type != IOMAP_MAPPED)
> > > + return -EIO;
> > > + }
> >
> > How do we get here without space having been allocated for the
> > write?
> >
> > Perhaps what this is trying to do is make RWF_ATOMIC only be valid
> > into written space? I mean, this will fail with preallocated space
> > (IOMAP_UNWRITTEN) even though we still have exactly the RWF_ATOMIC
> > all-or-nothing behaviour guaranteed after a crash because of journal
> > recovery behaviour. i.e. if the unwritten conversion gets written to
> > the journal, the data will be there. If it isn't written to the
> > journal, then the space remains unwritten and there's no data across
> > that entire range....
> >
> > So I'm not really sure that either of these checks are valid or why
> > they are actually needed....
>
> This requires O_DSYNC (or RWF_DSYNC) for atomic writes to unwritten or
> COW space.

COW, maybe - I haven't thought that far through it.

However, for unwritten extents we just don't need O_DSYNC to
guarantee all or nothing writes. The application still has to use
fdatasync() to determine if the IO succeeded, but the actual IO and
unwritten conversion transaction ordering guarantee the
"all-or-nothing" behaviour of a RWF_ATOMIC write that is not using
O_DSYNC.

i.e. It just doesn't matter when the conversion transaction hits
the journal. If it doesn't hit the journal before the crash, the
write never happened. If it does hit the journal, then the cache
flush before the journal write ensures all the data from the
RWF_ATOMIC write is present on disk before the unwritten conversion
hits the journal.

> We want failures in forcing the log transactions for the
> endio processing to be reported to the pwrite caller as EIO, right?

A failure to force the log will result in a filesystem shutdown. It
doesn't matter if that happens during IO completion or sometime
before or during the fdatasync() call the application would still
need to use to guarantee data integrity.

RWF_ATOMIC implies FUA semantics, right? i.e. if the RWF_ATOMIC
write is a pure overwrite, there are no journal or cache flushes
needed to complete the write. If so, batching up all the metadata
updates between data integrity checkpoints can still make
performance much better. If the filesystem flushes the journal
itself, it's no different from an application crash recovery
perspective to using RWF_DSYNC|RWF_ATOMIC and failing in the middle
of a multi-IO update....

Hence I just don't see why RWF_ATOMIC requires O_DSYNC semantics at
all; all RWF_ATOMIC provides is larger "non-tearing" IO granularity
and this doesn't change filesystem data integrity semantics at all.

-Dave.
--
Dave Chinner
[email protected]

2023-10-24 13:02:01

by John Garry

[permalink] [raw]
Subject: Re: [PATCH 16/21] fs: iomap: Atomic write support

On 03/10/2023 05:24, Dave Chinner wrote:

I don't think that this was ever responded to - apologies for that.

>> n = bio->bi_iter.bi_size;
>> + if (atomic_write && n != length) {
>> + /* This bio should have covered the complete length */
>> + ret = -EINVAL;
>> + bio_put(bio);
>> + goto out;
> Why? The actual bio can be any length that meets the aligned
> criteria between min and max, yes?
> So it's valid to split a
> RWF_ATOMIC write request up into multiple min unit sized bios, is it
> not?

It is not.

> I mean, that's the whole point of the min/max unit setup, isn't
> it?

atomic write unit min/max are lower and upper limits for the atomic
write length only.

> That the max sized write only guarantees that it will tear at
> min unit boundaries, not within those min unit boundaries?

We will never split an atomic write nor create multiple bios for an
atomic write. unit min is the minimum size supported for an atomic write
length. It is not also a boundary size which we may split a write. An
atomic write will only ever produce a maximum for a single IO operation.
We do support merging of atomic writes in the block layer, but this is
transparent to the user.

Please let me know if
https://lore.kernel.org/linux-api/[email protected]/T/#mb48328cf84b1643b651b5f1293f443e26f18fbb5
needs to be improved to make this clear.

> If
> I've understood this correctly, then why does this "single bio for
> large atomic write" constraint need to exist?
>
>

Thanks,
John