From: Zhang Yi <[email protected]>
This patch series fixes a problem on xfs where zeroed data can be exposed,
a consequence of the non-atomic clone operation. I found this problem
while developing the ext4 buffered IO iomap conversion (ext4 relies on
this fix [1]); for the root cause of the problem and the discussion about
the solution, please see [2]. With the problem fixed, iomap_zero_range()
no longer needs to update i_size, so ext4 can use it to zero a partial
block, e.g. the EOF block during truncate [3].
[1] https://lore.kernel.org/linux-ext4/[email protected]/
[2] https://lore.kernel.org/linux-ext4/[email protected]/
[3] https://lore.kernel.org/linux-ext4/[email protected]/
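To illustrate the last point, here is a minimal sketch (untested, the
helper name and the ops argument are just placeholders, not part of this
series) of how a filesystem could zero the post-EOF tail of its last
block at truncate time once iomap_zero_range() stops updating i_size:

#include <linux/fs.h>
#include <linux/iomap.h>

/* Zero from the new EOF to the end of the containing block. */
static int example_zero_eof_block(struct inode *inode,
				  const struct iomap_ops *ops)
{
	loff_t size = i_size_read(inode);
	unsigned int blocksize = i_blocksize(inode);
	loff_t partial = size & (blocksize - 1);
	bool did_zero = false;

	/* The new size is block aligned, nothing to zero. */
	if (!partial)
		return 0;

	/*
	 * The zeroed range lies entirely beyond EOF; after this series
	 * iomap_zero_range() will not move i_size for it.
	 */
	return iomap_zero_range(inode, size, blocksize - partial,
				&did_zero, ops);
}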
Thanks,
Yi.
Zhang Yi (4):
xfs: match lock mode in xfs_buffered_write_iomap_begin()
xfs: convert delayed extents to unwritten when zeroing post eof blocks
iomap: don't increase i_size if it's not a write operation
iomap: cleanup iomap_write_iter()
fs/iomap/buffered-io.c | 78 +++++++++++++++++++++---------------------
fs/xfs/xfs_iomap.c | 39 ++++++++++++++++++---
2 files changed, 73 insertions(+), 44 deletions(-)
--
2.39.2
From: Zhang Yi <[email protected]>
The status variable in iomap_write_iter() is confusing, and
iomap_write_end() always returns either 0 or the copied bytes. Replace
status with a new written variable that holds the bytes written in each
cycle, and do some related cleanup. No logic changes.
Signed-off-by: Zhang Yi <[email protected]>
---
fs/iomap/buffered-io.c | 31 +++++++++++++++----------------
1 file changed, 15 insertions(+), 16 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 19f91324c690..767af6e67ed4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -851,7 +851,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
loff_t length = iomap_length(iter);
size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
loff_t pos = iter->pos;
- ssize_t written = 0;
+ ssize_t total_written = 0;
long status = 0;
struct address_space *mapping = iter->inode->i_mapping;
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -862,6 +862,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
+ size_t written; /* Bytes have been written */
bytes = iov_iter_count(i);
retry:
@@ -906,7 +907,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
flush_dcache_folio(folio);
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
- status = iomap_write_end(iter, pos, bytes, copied, folio);
+ written = iomap_write_end(iter, pos, bytes, copied, folio);
/*
* Update the in-memory inode size after copying the data into
@@ -915,28 +916,26 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
* no stale data is exposed.
*/
old_size = iter->inode->i_size;
- if (pos + status > old_size) {
- i_size_write(iter->inode, pos + status);
+ if (pos + written > old_size) {
+ i_size_write(iter->inode, pos + written);
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
- __iomap_put_folio(iter, pos, status, folio);
+ __iomap_put_folio(iter, pos, written, folio);
if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
- if (status < bytes)
- iomap_write_failed(iter->inode, pos + status,
- bytes - status);
- if (unlikely(copied != status))
- iov_iter_revert(i, copied - status);
cond_resched();
- if (unlikely(status == 0)) {
+ if (unlikely(written == 0)) {
/*
* A short copy made iomap_write_end() reject the
* thing entirely. Might be memory poisoning
* halfway through, might be a race with munmap,
* might be severe memory pressure.
*/
+ iomap_write_failed(iter->inode, pos, bytes);
+ iov_iter_revert(i, copied);
+
if (chunk > PAGE_SIZE)
chunk /= 2;
if (copied) {
@@ -944,17 +943,17 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
goto retry;
}
} else {
- pos += status;
- written += status;
- length -= status;
+ pos += written;
+ total_written += written;
+ length -= written;
}
} while (iov_iter_count(i) && length);
if (status == -EAGAIN) {
- iov_iter_revert(i, written);
+ iov_iter_revert(i, total_written);
return -EAGAIN;
}
- return written ? written : status;
+ return total_written ? total_written : status;
}
ssize_t
--
2.39.2
On Mon, Mar 11, 2024 at 08:22:55PM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> The status variable in iomap_write_iter() is confusing, and
> iomap_write_end() always returns either 0 or the copied bytes. Replace
> status with a new written variable that holds the bytes written in each
> cycle, and do some related cleanup. No logic changes.
>
> Signed-off-by: Zhang Yi <[email protected]>
> ---
> fs/iomap/buffered-io.c | 31 +++++++++++++++----------------
> 1 file changed, 15 insertions(+), 16 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 19f91324c690..767af6e67ed4 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -851,7 +851,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> loff_t length = iomap_length(iter);
> size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
> loff_t pos = iter->pos;
> - ssize_t written = 0;
> + ssize_t total_written = 0;
> long status = 0;
> struct address_space *mapping = iter->inode->i_mapping;
> unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
> @@ -862,6 +862,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> size_t offset; /* Offset into folio */
> size_t bytes; /* Bytes to write to folio */
> size_t copied; /* Bytes copied from user */
> + size_t written; /* Bytes have been written */
>
> bytes = iov_iter_count(i);
> retry:
> @@ -906,7 +907,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> flush_dcache_folio(folio);
>
> copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> - status = iomap_write_end(iter, pos, bytes, copied, folio);
> + written = iomap_write_end(iter, pos, bytes, copied, folio);
>
> /*
> * Update the in-memory inode size after copying the data into
> @@ -915,28 +916,26 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> * no stale data is exposed.
> */
> old_size = iter->inode->i_size;
> - if (pos + status > old_size) {
> - i_size_write(iter->inode, pos + status);
> + if (pos + written > old_size) {
> + i_size_write(iter->inode, pos + written);
> iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> }
> - __iomap_put_folio(iter, pos, status, folio);
> + __iomap_put_folio(iter, pos, written, folio);
>
> if (old_size < pos)
> pagecache_isize_extended(iter->inode, old_size, pos);
> - if (status < bytes)
> - iomap_write_failed(iter->inode, pos + status,
> - bytes - status);
> - if (unlikely(copied != status))
> - iov_iter_revert(i, copied - status);
I wish you'd made the variable renaming and the function reorganization
separate patches. The renaming looks correct to me, but moving these
calls adds a logic bomb.
If at some point iomap_write_end actually starts returning partial write
completions (e.g. you wrote 250 bytes, but for some reason the pagecache
only acknowledges 100 bytes were written) then this code no longer
reverts the iter or truncates posteof pagecache correctly...
>
> cond_resched();
> - if (unlikely(status == 0)) {
> + if (unlikely(written == 0)) {
> /*
> * A short copy made iomap_write_end() reject the
> * thing entirely. Might be memory poisoning
> * halfway through, might be a race with munmap,
> * might be severe memory pressure.
> */
> + iomap_write_failed(iter->inode, pos, bytes);
> + iov_iter_revert(i, copied);
...because now we only do that if the pagecache refuses to acknowledge
any bytes written at all. I think it actually works correctly with
today's kernel since __iomap_write_end only returns copied or 0, but the
size_t return type implies that a short acknowledgement is theoretically
possible.
IOWs, doesn't this add a logic bomb?
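Totally untested sketch of what I mean: a hypothetical helper in
fs/iomap/buffered-io.c that would keep the old "trim and revert whatever
was not acknowledged" behaviour for any short return, not just zero:

/* Handle iomap_write_end() acknowledging fewer bytes than we copied. */
static void example_short_write_fixup(struct iomap_iter *iter,
				      struct iov_iter *i, loff_t pos,
				      size_t bytes, size_t copied,
				      size_t written)
{
	if (written < bytes)
		/* punch out the pagecache/delalloc we prepared but didn't use */
		iomap_write_failed(iter->inode, pos + written,
				   bytes - written);
	if (copied != written)
		/* hand the unacknowledged bytes back to the iov_iter */
		iov_iter_revert(i, copied - written);
}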
--D
> +
> if (chunk > PAGE_SIZE)
> chunk /= 2;
> if (copied) {
> @@ -944,17 +943,17 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> goto retry;
> }
> } else {
> - pos += status;
> - written += status;
> - length -= status;
> + pos += written;
> + total_written += written;
> + length -= written;
> }
> } while (iov_iter_count(i) && length);
>
> if (status == -EAGAIN) {
> - iov_iter_revert(i, written);
> + iov_iter_revert(i, total_written);
> return -EAGAIN;
> }
> - return written ? written : status;
> + return total_written ? total_written : status;
> }
>
> ssize_t
> --
> 2.39.2
>
>
On Mon, Mar 11, 2024 at 09:07:39AM -0700, Darrick J. Wong wrote:
> If at some point iomap_write_end actually starts returning partial write
> completions (e.g. you wrote 250 bytes, but for some reason the pagecache
> only acknowledges 100 bytes were written) then this code no longer
> reverts the iter or truncates posteof pagecache correctly...
I don't think it makes sense to return a partial write from
iomap_write_end. But to make that clear it really should not return
a byte count but a boolean. I've been wanting to make that cleanup
for a while, but it would reach all the way into buffer.c.
On Tue, Mar 12, 2024 at 05:24:00AM -0700, Christoph Hellwig wrote:
> On Mon, Mar 11, 2024 at 09:07:39AM -0700, Darrick J. Wong wrote:
> > If at some point iomap_write_end actually starts returning partial write
> > completions (e.g. you wrote 250 bytes, but for some reason the pagecache
> > only acknowledges 100 bytes were written) then this code no longer
> > reverts the iter or truncates posteof pagecache correctly...
>
> I don't think it makes sense to return a partial write from
> iomap_write_end. But to make that clear it really should not return
> a byte count but a boolean. I've been wanting to make that cleanup
> for a while, but it would reach all the way into buffer.c.
For now, can we change the return types of iomap_write_end_inline and
__iomap_write_end? Then iomap can WARN_ON if the block_write_end return
value isn't 0 or copied:
bool ret;
if (srcmap->type == IOMAP_INLINE) {
ret = iomap_write_end_inline(iter, folio, pos, copied);
} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
size_t bh_written;
bh_written = block_write_end(NULL, iter->inode->i_mapping,
pos, len, copied, &folio->page, NULL);
WARN_ON(bh_written != copied && bh_written != 0);
ret = bh_written == copied;
} else {
ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
}
...
return ret;
Some day later we can circle back to bufferheads, or maybe they'll die
before we get to it. ;)
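And on the iomap_write_iter() side the caller could consume the boolean
with something like (untested sketch, not what's in this patch):

	bool ret;
	size_t written;

	copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
	ret = iomap_write_end(iter, pos, bytes, copied, folio);
	/* iomap_write_end() either takes the whole copy or none of it */
	written = ret ? copied : 0;

so the written == 0 handling later in the loop stays as it is in this
patch.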
--D
On 2024/3/13 0:27, Darrick J. Wong wrote:
> On Tue, Mar 12, 2024 at 05:24:00AM -0700, Christoph Hellwig wrote:
>> On Mon, Mar 11, 2024 at 09:07:39AM -0700, Darrick J. Wong wrote:
>>> If at some point iomap_write_end actually starts returning partial write
>>> completions (e.g. you wrote 250 bytes, but for some reason the pagecache
>>> only acknowledges 100 bytes were written) then this code no longer
>>> reverts the iter or truncates posteof pagecache correctly...
>>
>> I don't think it makes sense to return a partial write from
>> iomap_write_end. But to make that clear it really should not return
>> a byte count but a boolean. I've been wanting to make that cleanup
>> for a while, but it would reach all the way into buffer.c.
>
> For now, can we change the return types of iomap_write_end_inline and
> __iomap_write_end? Then iomap can WARN_ON if the block_write_end return
> value isn't 0 or copied:
>
> bool ret;
>
> if (srcmap->type == IOMAP_INLINE) {
> ret = iomap_write_end_inline(iter, folio, pos, copied);
> } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> size_t bh_written;
>
> bh_written = block_write_end(NULL, iter->inode->i_mapping,
> pos, len, copied, &folio->page, NULL);
>
> WARN_ON(bh_written != copied && bh_written != 0);
> ret = bh_written == copied;
> } else {
> ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> }
>
> ...
>
> return ret;
>
> Some day later we can circle back to bufferheads, or maybe they'll die
> before we get to it. ;)
>
That looks great to me; we can revise iomap first for now.
Thanks,
Yi.