2024-03-19 01:18:55

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 0/9] xfs/iomap: fix non-atomic clone operation and don't update size when zeroing range post eof

From: Zhang Yi <[email protected]>

Changes since v2:
- Merge the patch for dropping of xfs_convert_blocks() and the patch
for modifying xfs_bmapi_convert_delalloc().
- Reword the commit message of the second patch.

Changes since v1:
- Make xfs_bmapi_convert_delalloc() to allocate the target offset and
drop the writeback helper xfs_convert_blocks().
- Don't use xfs_iomap_write_direct() to convert delalloc blocks for
zeroing posteof case, use xfs_bmapi_convert_delalloc() instead.
- Fix two off-by-one issues when converting delalloc blocks.
- Add a separate patch to drop the buffered write failure handle in
zeroing and unsharing.
- Add a comments do emphasize updating i_size should under folio lock.
- Make iomap_write_end() to return a boolean, and do some cleanups in
buffered write begin path.

This patch series fix a problem of exposing zeroed data on xfs since the
non-atomic clone operation. This problem was found while I was
developing ext4 buffered IO iomap conversion (ext4 is relying on this
fix [1]), the root cause of this problem and the discussion about the
solution please see [2]. After fix the problem, iomap_zero_range()
doesn't need to update i_size so that ext4 can use it to zero partial
block, e.g. truncate eof block [3].

[1] https://lore.kernel.org/linux-ext4/[email protected]/
[2] https://lore.kernel.org/linux-ext4/[email protected]/
[3] https://lore.kernel.org/linux-ext4/[email protected]/

Thanks,
Yi.

Zhang Yi (9):
xfs: match lock mode in xfs_buffered_write_iomap_begin()
xfs: make the seq argument to xfs_bmapi_convert_delalloc() optional
xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset
xfs: convert delayed extents to unwritten when zeroing post eof blocks
iomap: drop the write failure handles when unsharing and zeroing
iomap: don't increase i_size if it's not a write operation
iomap: use a new variable to handle the written bytes in
iomap_write_iter()
iomap: make iomap_write_end() return a boolean
iomap: do some small logical cleanup in buffered write

fs/iomap/buffered-io.c | 101 ++++++++++++++++++++++-----------------
fs/xfs/libxfs/xfs_bmap.c | 40 ++++++++++++++--
fs/xfs/xfs_aops.c | 54 ++++++---------------
fs/xfs/xfs_iomap.c | 39 +++++++++++++--
4 files changed, 142 insertions(+), 92 deletions(-)

--
2.39.2



2024-03-19 01:19:15

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 7/9] iomap: use a new variable to handle the written bytes in iomap_write_iter()

From: Zhang Yi <[email protected]>

In iomap_write_iter(), the status variable used to receive the return
value from iomap_write_end() is confusing, replace it with a new written
variable to represent the written bytes in each cycle, no logic changes.

Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/iomap/buffered-io.c | 33 +++++++++++++++++----------------
1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e9112dc78d15..291648c61a32 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -851,7 +851,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
loff_t length = iomap_length(iter);
size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
loff_t pos = iter->pos;
- ssize_t written = 0;
+ ssize_t total_written = 0;
long status = 0;
struct address_space *mapping = iter->inode->i_mapping;
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -862,6 +862,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
+ size_t written; /* Bytes have been written */

bytes = iov_iter_count(i);
retry:
@@ -906,7 +907,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
flush_dcache_folio(folio);

copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
- status = iomap_write_end(iter, pos, bytes, copied, folio);
+ written = iomap_write_end(iter, pos, bytes, copied, folio);

/*
* Update the in-memory inode size after copying the data into
@@ -916,22 +917,22 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
* unlock and release the folio.
*/
old_size = iter->inode->i_size;
- if (pos + status > old_size) {
- i_size_write(iter->inode, pos + status);
+ if (pos + written > old_size) {
+ i_size_write(iter->inode, pos + written);
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
- __iomap_put_folio(iter, pos, status, folio);
+ __iomap_put_folio(iter, pos, written, folio);

if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
- if (status < bytes)
- iomap_write_failed(iter->inode, pos + status,
- bytes - status);
- if (unlikely(copied != status))
- iov_iter_revert(i, copied - status);
+ if (written < bytes)
+ iomap_write_failed(iter->inode, pos + written,
+ bytes - written);
+ if (unlikely(copied != written))
+ iov_iter_revert(i, copied - written);

cond_resched();
- if (unlikely(status == 0)) {
+ if (unlikely(written == 0)) {
/*
* A short copy made iomap_write_end() reject the
* thing entirely. Might be memory poisoning
@@ -945,17 +946,17 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
goto retry;
}
} else {
- pos += status;
- written += status;
- length -= status;
+ pos += written;
+ total_written += written;
+ length -= written;
}
} while (iov_iter_count(i) && length);

if (status == -EAGAIN) {
- iov_iter_revert(i, written);
+ iov_iter_revert(i, total_written);
return -EAGAIN;
}
- return written ? written : status;
+ return total_written ? total_written : status;
}

ssize_t
--
2.39.2


2024-03-19 01:19:24

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 8/9] iomap: make iomap_write_end() return a boolean

From: Zhang Yi <[email protected]>

For now, we can make sure iomap_write_end() always return 0 or copied
bytes, so instead of return written bytes, convert to return a boolean
to indicate the copied bytes have been written to the pagecache.

Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/iomap/buffered-io.c | 50 +++++++++++++++++++++++++++---------------
1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 291648c61a32..004673ea8bc1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -790,7 +790,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
return status;
}

-static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
flush_dcache_folio(folio);
@@ -807,14 +807,14 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
* redo the whole thing.
*/
if (unlikely(copied < len && !folio_test_uptodate(folio)))
- return 0;
+ return false;
iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
filemap_dirty_folio(inode->i_mapping, folio);
- return copied;
+ return true;
}

-static size_t iomap_write_end_inline(const struct iomap_iter *iter,
+static void iomap_write_end_inline(const struct iomap_iter *iter,
struct folio *folio, loff_t pos, size_t copied)
{
const struct iomap *iomap = &iter->iomap;
@@ -829,21 +829,32 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter,
kunmap_local(addr);

mark_inode_dirty(iter->inode);
- return copied;
}

-/* Returns the number of bytes copied. May be 0. Cannot be an errno. */
-static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
+/*
+ * Returns true if all copied bytes have been written to the pagecache,
+ * otherwise return false.
+ */
+static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ bool ret = true;

- if (srcmap->type == IOMAP_INLINE)
- return iomap_write_end_inline(iter, folio, pos, copied);
- if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
- return block_write_end(NULL, iter->inode->i_mapping, pos, len,
- copied, &folio->page, NULL);
- return __iomap_write_end(iter->inode, pos, len, copied, folio);
+ if (srcmap->type == IOMAP_INLINE) {
+ iomap_write_end_inline(iter, folio, pos, copied);
+ } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
+ size_t bh_written;
+
+ bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
+ len, copied, &folio->page, NULL);
+ WARN_ON_ONCE(bh_written != copied && bh_written != 0);
+ ret = bh_written == copied;
+ } else {
+ ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
+ }
+
+ return ret;
}

static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
@@ -907,7 +918,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
flush_dcache_folio(folio);

copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
- written = iomap_write_end(iter, pos, bytes, copied, folio);
+ written = iomap_write_end(iter, pos, bytes, copied, folio) ?
+ copied : 0;

/*
* Update the in-memory inode size after copying the data into
@@ -1285,6 +1297,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
int status;
size_t offset;
size_t bytes = min_t(u64, SIZE_MAX, length);
+ bool ret;

status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
@@ -1296,9 +1309,9 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
if (bytes > folio_size(folio) - offset)
bytes = folio_size(folio) - offset;

- bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+ ret = iomap_write_end(iter, pos, bytes, bytes, folio);
__iomap_put_folio(iter, pos, bytes, folio);
- if (WARN_ON_ONCE(bytes == 0))
+ if (WARN_ON_ONCE(!ret))
return -EIO;

cond_resched();
@@ -1347,6 +1360,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
int status;
size_t offset;
size_t bytes = min_t(u64, SIZE_MAX, length);
+ bool ret;

status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
@@ -1361,9 +1375,9 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
folio_zero_range(folio, offset, bytes);
folio_mark_accessed(folio);

- bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+ ret = iomap_write_end(iter, pos, bytes, bytes, folio);
__iomap_put_folio(iter, pos, bytes, folio);
- if (WARN_ON_ONCE(bytes == 0))
+ if (WARN_ON_ONCE(!ret))
return -EIO;

pos += bytes;
--
2.39.2


2024-03-19 01:19:47

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 9/9] iomap: do some small logical cleanup in buffered write

From: Zhang Yi <[email protected]>

Since iomap_write_end() can never return a partial write length, the
comperation between written, copied and bytes becomes useless, just
merge them with the unwritten branch.

Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/iomap/buffered-io.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 004673ea8bc1..f2fb89056259 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -937,11 +937,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)

if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
- if (written < bytes)
- iomap_write_failed(iter->inode, pos + written,
- bytes - written);
- if (unlikely(copied != written))
- iov_iter_revert(i, copied - written);

cond_resched();
if (unlikely(written == 0)) {
@@ -951,6 +946,9 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
* halfway through, might be a race with munmap,
* might be severe memory pressure.
*/
+ iomap_write_failed(iter->inode, pos, bytes);
+ iov_iter_revert(i, copied);
+
if (chunk > PAGE_SIZE)
chunk /= 2;
if (copied) {
--
2.39.2


2024-03-19 01:36:52

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 2/9] xfs: make the seq argument to xfs_bmapi_convert_delalloc() optional

From: Zhang Yi <[email protected]>

Allow callers to pass a NULLL seq argument if they don't care about
the fork sequence number.

Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/xfs/libxfs/xfs_bmap.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f362345467fa..07dc35de8ce5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4574,7 +4574,8 @@ xfs_bmapi_convert_delalloc(
if (!isnullstartblock(bma.got.br_startblock)) {
xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
xfs_iomap_inode_sequence(ip, flags));
- *seq = READ_ONCE(ifp->if_seq);
+ if (seq)
+ *seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}

@@ -4623,7 +4624,8 @@ xfs_bmapi_convert_delalloc(
ASSERT(!isnullstartblock(bma.got.br_startblock));
xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
xfs_iomap_inode_sequence(ip, flags));
- *seq = READ_ONCE(ifp->if_seq);
+ if (seq)
+ *seq = READ_ONCE(ifp->if_seq);

if (whichfork == XFS_COW_FORK)
xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
--
2.39.2


2024-03-19 01:37:30

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 6/9] iomap: don't increase i_size if it's not a write operation

From: Zhang Yi <[email protected]>

Increase i_size in iomap_zero_range() and iomap_unshare_iter() is not
needed, the caller should handle it. Especially, when truncate partial
block, we should not increase i_size beyond the new EOF here. It doesn't
affect xfs and gfs2 now because they set the new file size after zero
out, it doesn't matter that a transient increase in i_size, but it will
affect ext4 because it set file size before truncate. So move the i_size
updating logic to iomap_write_iter().

Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/iomap/buffered-io.c | 50 +++++++++++++++++++++---------------------
1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 7e32a204650b..e9112dc78d15 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -837,32 +837,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t old_size = iter->inode->i_size;
- size_t ret;
-
- if (srcmap->type == IOMAP_INLINE) {
- ret = iomap_write_end_inline(iter, folio, pos, copied);
- } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
- ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
- copied, &folio->page, NULL);
- } else {
- ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
- }
-
- /*
- * Update the in-memory inode size after copying the data into the page
- * cache. It's up to the file system to write the updated size to disk,
- * preferably after I/O completion so that no stale data is exposed.
- */
- if (pos + ret > old_size) {
- i_size_write(iter->inode, pos + ret);
- iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
- }
- __iomap_put_folio(iter, pos, ret, folio);

- if (old_size < pos)
- pagecache_isize_extended(iter->inode, old_size, pos);
- return ret;
+ if (srcmap->type == IOMAP_INLINE)
+ return iomap_write_end_inline(iter, folio, pos, copied);
+ if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
+ return block_write_end(NULL, iter->inode->i_mapping, pos, len,
+ copied, &folio->page, NULL);
+ return __iomap_write_end(iter->inode, pos, len, copied, folio);
}

static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
@@ -877,6 +858,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)

do {
struct folio *folio;
+ loff_t old_size;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
@@ -926,6 +908,22 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
status = iomap_write_end(iter, pos, bytes, copied, folio);

+ /*
+ * Update the in-memory inode size after copying the data into
+ * the page cache. It's up to the file system to write the
+ * updated size to disk, preferably after I/O completion so that
+ * no stale data is exposed. Only once that's done can we
+ * unlock and release the folio.
+ */
+ old_size = iter->inode->i_size;
+ if (pos + status > old_size) {
+ i_size_write(iter->inode, pos + status);
+ iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
+ }
+ __iomap_put_folio(iter, pos, status, folio);
+
+ if (old_size < pos)
+ pagecache_isize_extended(iter->inode, old_size, pos);
if (status < bytes)
iomap_write_failed(iter->inode, pos + status,
bytes - status);
@@ -1298,6 +1296,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
bytes = folio_size(folio) - offset;

bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+ __iomap_put_folio(iter, pos, bytes, folio);
if (WARN_ON_ONCE(bytes == 0))
return -EIO;

@@ -1362,6 +1361,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
folio_mark_accessed(folio);

bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+ __iomap_put_folio(iter, pos, bytes, folio);
if (WARN_ON_ONCE(bytes == 0))
return -EIO;

--
2.39.2


2024-03-19 01:37:40

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 3/9] xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset

From: Zhang Yi <[email protected]>

Since xfs_bmapi_convert_delalloc() only attempts to allocate the entire
delalloc extent and require multiple invocations to allocate the target
offset. So xfs_convert_blocks() add a loop to do this job and we call it
in the write back path, but xfs_convert_blocks() isn't a common helper.
Let's do it in xfs_bmapi_convert_delalloc() and drop
xfs_convert_blocks(), preparing for the post EOF delalloc blocks
converting in the buffered write begin path.

Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/xfs/libxfs/xfs_bmap.c | 34 +++++++++++++++++++++++--
fs/xfs/xfs_aops.c | 54 +++++++++++-----------------------------
2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 07dc35de8ce5..042e8d3ab0ba 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4516,8 +4516,8 @@ xfs_bmapi_write(
* invocations to allocate the target offset if a large enough physical extent
* is not available.
*/
-int
-xfs_bmapi_convert_delalloc(
+static int
+__xfs_bmapi_convert_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_off_t offset,
@@ -4648,6 +4648,36 @@ xfs_bmapi_convert_delalloc(
return error;
}

+/*
+ * Pass in a dellalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in iomap.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ loff_t offset,
+ struct iomap *iomap,
+ unsigned int *seq)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs offset
+ * and put the result into iomap. Allocate in a loop because it may
+ * take several attempts to allocate real blocks for a contiguous
+ * delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = __xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ iomap, seq);
+ if (error)
+ return error;
+ } while (iomap->offset + iomap->length <= offset);
+
+ return 0;
+}
+
int
xfs_bmapi_remap(
struct xfs_trans *tp,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 813f85156b0c..6479e0dac69d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -233,45 +233,6 @@ xfs_imap_valid(
return true;
}

-/*
- * Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->iomap.
- *
- * The current page is held locked so nothing could have removed the block
- * backing offset_fsb, although it could have moved from the COW to the data
- * fork by another thread.
- */
-static int
-xfs_convert_blocks(
- struct iomap_writepage_ctx *wpc,
- struct xfs_inode *ip,
- int whichfork,
- loff_t offset)
-{
- int error;
- unsigned *seq;
-
- if (whichfork == XFS_COW_FORK)
- seq = &XFS_WPC(wpc)->cow_seq;
- else
- seq = &XFS_WPC(wpc)->data_seq;
-
- /*
- * Attempt to allocate whatever delalloc extent currently backs offset
- * and put the result into wpc->iomap. Allocate in a loop because it
- * may take several attempts to allocate real blocks for a contiguous
- * delalloc extent if free space is sufficiently fragmented.
- */
- do {
- error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
- &wpc->iomap, seq);
- if (error)
- return error;
- } while (wpc->iomap.offset + wpc->iomap.length <= offset);
-
- return 0;
-}
-
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
@@ -289,6 +250,7 @@ xfs_map_blocks(
struct xfs_iext_cursor icur;
int retries = 0;
int error = 0;
+ unsigned int *seq;

if (xfs_is_shutdown(mp))
return -EIO;
@@ -386,7 +348,19 @@ xfs_map_blocks(
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
- error = xfs_convert_blocks(wpc, ip, whichfork, offset);
+ /*
+ * Convert a dellalloc extent to a real one. The current page is held
+ * locked so nothing could have removed the block backing offset_fsb,
+ * although it could have moved from the COW to the data fork by another
+ * thread.
+ */
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
+
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
--
2.39.2


2024-03-19 01:38:08

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 5/9] iomap: drop the write failure handles when unsharing and zeroing

From: Zhang Yi <[email protected]>

Unsharing and zeroing can only happen within EOF, so there is never a
need to perform posteof pagecache truncation if write begin fails, also
partial write could never theoretically happened from iomap_write_end(),
so remove both of them.

Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/iomap/buffered-io.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 093c4515b22a..7e32a204650b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -786,7 +786,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,

out_unlock:
__iomap_put_folio(iter, pos, 0, folio);
- iomap_write_failed(iter->inode, pos, len);

return status;
}
@@ -863,8 +862,6 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,

if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
- if (ret < len)
- iomap_write_failed(iter->inode, pos + ret, len - ret);
return ret;
}

@@ -912,8 +909,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
}

status = iomap_write_begin(iter, pos, bytes, &folio);
- if (unlikely(status))
+ if (unlikely(status)) {
+ iomap_write_failed(iter->inode, pos, bytes);
break;
+ }
if (iter->iomap.flags & IOMAP_F_STALE)
break;

@@ -927,6 +926,9 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
status = iomap_write_end(iter, pos, bytes, copied, folio);

+ if (status < bytes)
+ iomap_write_failed(iter->inode, pos + status,
+ bytes - status);
if (unlikely(copied != status))
iov_iter_revert(i, copied - status);

--
2.39.2


2024-03-19 01:39:13

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 4/9] xfs: convert delayed extents to unwritten when zeroing post eof blocks

From: Zhang Yi <[email protected]>

Current clone operation could be non-atomic if the destination of a file
is beyond EOF, user could get a file with corrupted (zeroed) data on
crash.

The problem is about to pre-alloctions. If you write some data into a
file [A, B) (the position letters are increased one by one), and xfs
could pre-allocate some blocks, then we get a delayed extent [A, D).
Then the writeback path allocate blocks and convert this delayed extent
[A, C) since lack of enough contiguous physical blocks, so the extent
[C, D) is still delayed. After that, both the in-memory and the on-disk
file size are B. If we clone file range into [E, F) from another file,
xfs_reflink_zero_posteof() would call iomap_zero_range() to zero out the
range [B, E) beyond EOF and flush range. Since [C, D) is still a delayed
extent, it will be zeroed and the file's in-memory && on-disk size will
be updated to D after flushing and before doing the clone operation.
This is wrong, because user can user can see the size change and read
zeros in the middle of the clone operation.

We need to keep the in-memory and on-disk size before the clone
operation starts, so instead of writing zeroes through the page cache
for delayed ranges beyond EOF, we convert these ranges to unwritten and
invalidating any cached data over that range beyond EOF.

Suggested-by: Dave Chinner <[email protected]>
Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_iomap.c | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ccf83e72d8ca..1a6d05830433 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1035,6 +1035,24 @@ xfs_buffered_write_iomap_begin(
}

if (imap.br_startoff <= offset_fsb) {
+ /*
+ * For zeroing out delayed allocation extent, we trim it if
+ * it's partial beyonds EOF block, or convert it to unwritten
+ * extent if it's all beyonds EOF block.
+ */
+ if ((flags & IOMAP_ZERO) &&
+ isnullstartblock(imap.br_startblock)) {
+ xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+ if (offset_fsb >= eof_fsb)
+ goto convert_delay;
+ if (end_fsb > eof_fsb) {
+ end_fsb = eof_fsb;
+ xfs_trim_extent(&imap, offset_fsb,
+ end_fsb - offset_fsb);
+ }
+ }
+
/*
* For reflink files we may need a delalloc reservation when
* overwriting shared extents. This includes zeroing of
@@ -1158,6 +1176,17 @@ xfs_buffered_write_iomap_begin(
xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);

+convert_delay:
+ xfs_iunlock(ip, lockmode);
+ truncate_pagecache(inode, offset);
+ error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
+ iomap, NULL);
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap);
+ return 0;
+
found_cow:
seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) {
--
2.39.2


2024-03-19 20:46:35

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v3 3/9] xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset

On Tue, Mar 19, 2024 at 09:10:56AM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> Since xfs_bmapi_convert_delalloc() only attempts to allocate the entire
> delalloc extent and require multiple invocations to allocate the target
> offset. So xfs_convert_blocks() add a loop to do this job and we call it
> in the write back path, but xfs_convert_blocks() isn't a common helper.
> Let's do it in xfs_bmapi_convert_delalloc() and drop
> xfs_convert_blocks(), preparing for the post EOF delalloc blocks
> converting in the buffered write begin path.
>
> Signed-off-by: Zhang Yi <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> ---
> fs/xfs/libxfs/xfs_bmap.c | 34 +++++++++++++++++++++++--
> fs/xfs/xfs_aops.c | 54 +++++++++++-----------------------------
> 2 files changed, 46 insertions(+), 42 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 07dc35de8ce5..042e8d3ab0ba 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -4516,8 +4516,8 @@ xfs_bmapi_write(
> * invocations to allocate the target offset if a large enough physical extent
> * is not available.
> */
> -int
> -xfs_bmapi_convert_delalloc(
> +static int

static inline?

> +__xfs_bmapi_convert_delalloc(

Double underscore prefixes read to me like "do this without grabbing
a lock or a resource", not just one step in a loop.

Would you mind changing it to xfs_bmapi_convert_one_delalloc() ?
Then the callsite looks like:

xfs_bmapi_convert_delalloc(...)
{
...
do {
error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset,
iomap, seq);
if (error)
return error;
} while (iomap->offset + iomap->length <= offset);
}

With that renamed,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> struct xfs_inode *ip,
> int whichfork,
> xfs_off_t offset,
> @@ -4648,6 +4648,36 @@ xfs_bmapi_convert_delalloc(
> return error;
> }
>
> +/*
> + * Pass in a dellalloc extent and convert it to real extents, return the real
> + * extent that maps offset_fsb in iomap.
> + */
> +int
> +xfs_bmapi_convert_delalloc(
> + struct xfs_inode *ip,
> + int whichfork,
> + loff_t offset,
> + struct iomap *iomap,
> + unsigned int *seq)
> +{
> + int error;
> +
> + /*
> + * Attempt to allocate whatever delalloc extent currently backs offset
> + * and put the result into iomap. Allocate in a loop because it may
> + * take several attempts to allocate real blocks for a contiguous
> + * delalloc extent if free space is sufficiently fragmented.
> + */
> + do {
> + error = __xfs_bmapi_convert_delalloc(ip, whichfork, offset,
> + iomap, seq);
> + if (error)
> + return error;
> + } while (iomap->offset + iomap->length <= offset);
> +
> + return 0;
> +}
> +
> int
> xfs_bmapi_remap(
> struct xfs_trans *tp,
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 813f85156b0c..6479e0dac69d 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -233,45 +233,6 @@ xfs_imap_valid(
> return true;
> }
>
> -/*
> - * Pass in a dellalloc extent and convert it to real extents, return the real
> - * extent that maps offset_fsb in wpc->iomap.
> - *
> - * The current page is held locked so nothing could have removed the block
> - * backing offset_fsb, although it could have moved from the COW to the data
> - * fork by another thread.
> - */
> -static int
> -xfs_convert_blocks(
> - struct iomap_writepage_ctx *wpc,
> - struct xfs_inode *ip,
> - int whichfork,
> - loff_t offset)
> -{
> - int error;
> - unsigned *seq;
> -
> - if (whichfork == XFS_COW_FORK)
> - seq = &XFS_WPC(wpc)->cow_seq;
> - else
> - seq = &XFS_WPC(wpc)->data_seq;
> -
> - /*
> - * Attempt to allocate whatever delalloc extent currently backs offset
> - * and put the result into wpc->iomap. Allocate in a loop because it
> - * may take several attempts to allocate real blocks for a contiguous
> - * delalloc extent if free space is sufficiently fragmented.
> - */
> - do {
> - error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
> - &wpc->iomap, seq);
> - if (error)
> - return error;
> - } while (wpc->iomap.offset + wpc->iomap.length <= offset);
> -
> - return 0;
> -}
> -
> static int
> xfs_map_blocks(
> struct iomap_writepage_ctx *wpc,
> @@ -289,6 +250,7 @@ xfs_map_blocks(
> struct xfs_iext_cursor icur;
> int retries = 0;
> int error = 0;
> + unsigned int *seq;
>
> if (xfs_is_shutdown(mp))
> return -EIO;
> @@ -386,7 +348,19 @@ xfs_map_blocks(
> trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
> return 0;
> allocate_blocks:
> - error = xfs_convert_blocks(wpc, ip, whichfork, offset);
> + /*
> + * Convert a dellalloc extent to a real one. The current page is held
> + * locked so nothing could have removed the block backing offset_fsb,
> + * although it could have moved from the COW to the data fork by another
> + * thread.
> + */
> + if (whichfork == XFS_COW_FORK)
> + seq = &XFS_WPC(wpc)->cow_seq;
> + else
> + seq = &XFS_WPC(wpc)->data_seq;
> +
> + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
> + &wpc->iomap, seq);
> if (error) {
> /*
> * If we failed to find the extent in the COW fork we might have
> --
> 2.39.2
>
>

2024-03-19 21:00:31

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v3 4/9] xfs: convert delayed extents to unwritten when zeroing post eof blocks

On Tue, Mar 19, 2024 at 09:10:57AM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> Current clone operation could be non-atomic if the destination of a file
> is beyond EOF, user could get a file with corrupted (zeroed) data on
> crash.
>
> The problem is about to pre-alloctions. If you write some data into a

"...is about preallocations."

> file [A, B) (the position letters are increased one by one), and xfs

I think it would help with understandability if you'd pasted the ascii
art from the previous thread(s) into this commit message:

"The problem is about preallocations. If you write some data into a
file:

[A...B)

and XFS decides to preallocate some post-eof blocks, then it can create
a delayed allocation reservation:

[A.........D)

The writeback path tries to convert delayed extents to real ones by
allocating blocks. If there aren't enough contiguous free space, we can
end up with two extents, the first real and the second still delalloc:

[A....C)[C.D)

After that, both the in-memory and the on-disk file sizes are still B.
If we clone into the range [E...F) from another file:

[A....C)[C.D) [E...F)

then xfs_reflink_zero_posteof calls iomap_zero_range to zero out the
range [B, E) beyond EOF and flush it. Since [C, D) is still a delalloc
extent, its pagecache will be zeroed and both the in-memory and on-disk
size will be updated to D after flushing but before cloning. This is
wrong, because the user can see the size change and read the zeroes
while the clone operation is ongoing."

> could pre-allocate some blocks, then we get a delayed extent [A, D).
> Then the writeback path allocate blocks and convert this delayed extent
> [A, C) since lack of enough contiguous physical blocks, so the extent
> [C, D) is still delayed. After that, both the in-memory and the on-disk
> file size are B. If we clone file range into [E, F) from another file,
> xfs_reflink_zero_posteof() would call iomap_zero_range() to zero out the
> range [B, E) beyond EOF and flush range. Since [C, D) is still a delayed
> extent, it will be zeroed and the file's in-memory && on-disk size will
> be updated to D after flushing and before doing the clone operation.
> This is wrong, because user can user can see the size change and read
> zeros in the middle of the clone operation.
>
> We need to keep the in-memory and on-disk size before the clone
> operation starts, so instead of writing zeroes through the page cache
> for delayed ranges beyond EOF, we convert these ranges to unwritten and
> invalidating any cached data over that range beyond EOF.

"...and invalidate any cached data..."

> Suggested-by: Dave Chinner <[email protected]>
> Signed-off-by: Zhang Yi <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> ---
> fs/xfs/xfs_iomap.c | 29 +++++++++++++++++++++++++++++
> 1 file changed, 29 insertions(+)
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index ccf83e72d8ca..1a6d05830433 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1035,6 +1035,24 @@ xfs_buffered_write_iomap_begin(
> }
>
> if (imap.br_startoff <= offset_fsb) {
> + /*
> + * For zeroing out delayed allocation extent, we trim it if
> + * it's partial beyonds EOF block, or convert it to unwritten
> + * extent if it's all beyonds EOF block.

"Trim a delalloc extent that extends beyond the EOF block. If it starts
beyond the EOF block, convert it to an unwritten extent."

> + */
> + if ((flags & IOMAP_ZERO) &&
> + isnullstartblock(imap.br_startblock)) {
> + xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
> +
> + if (offset_fsb >= eof_fsb)
> + goto convert_delay;
> + if (end_fsb > eof_fsb) {
> + end_fsb = eof_fsb;
> + xfs_trim_extent(&imap, offset_fsb,
> + end_fsb - offset_fsb);
> + }
> + }
> +
> /*
> * For reflink files we may need a delalloc reservation when
> * overwriting shared extents. This includes zeroing of
> @@ -1158,6 +1176,17 @@ xfs_buffered_write_iomap_begin(
> xfs_iunlock(ip, lockmode);
> return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
>
> +convert_delay:
> + xfs_iunlock(ip, lockmode);
> + truncate_pagecache(inode, offset);
> + error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
> + iomap, NULL);

Either indent this two tabs (XFS style), or make the second line of
arguments line up with the start of the arguments on the first line
(kernel style).

With those improvements applied,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> + if (error)
> + return error;
> +
> + trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap);
> + return 0;
> +
> found_cow:
> seq = xfs_iomap_inode_sequence(ip, 0);
> if (imap.br_startoff <= offset_fsb) {
> --
> 2.39.2
>
>

2024-03-19 21:01:56

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v3 2/9] xfs: make the seq argument to xfs_bmapi_convert_delalloc() optional

On Tue, Mar 19, 2024 at 09:10:55AM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> Allow callers to pass a NULLL seq argument if they don't care about
> the fork sequence number.
>
> Signed-off-by: Zhang Yi <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>

Aha, you want this because xfs_bmbt_to_iomap will set the iomap validity
cookie for us, whereas writeback wants to track the per-fork cookie in
the xfs writeback structure. Ok.

Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/xfs/libxfs/xfs_bmap.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index f362345467fa..07dc35de8ce5 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -4574,7 +4574,8 @@ xfs_bmapi_convert_delalloc(
> if (!isnullstartblock(bma.got.br_startblock)) {
> xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
> xfs_iomap_inode_sequence(ip, flags));
> - *seq = READ_ONCE(ifp->if_seq);
> + if (seq)
> + *seq = READ_ONCE(ifp->if_seq);
> goto out_trans_cancel;
> }
>
> @@ -4623,7 +4624,8 @@ xfs_bmapi_convert_delalloc(
> ASSERT(!isnullstartblock(bma.got.br_startblock));
> xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
> xfs_iomap_inode_sequence(ip, flags));
> - *seq = READ_ONCE(ifp->if_seq);
> + if (seq)
> + *seq = READ_ONCE(ifp->if_seq);
>
> if (whichfork == XFS_COW_FORK)
> xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
> --
> 2.39.2
>
>

2024-03-19 21:03:51

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v3 5/9] iomap: drop the write failure handles when unsharing and zeroing

On Tue, Mar 19, 2024 at 09:10:58AM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> Unsharing and zeroing can only happen within EOF, so there is never a
> need to perform posteof pagecache truncation if write begin fails, also
> partial write could never theoretically happened from iomap_write_end(),
> so remove both of them.
>
> Signed-off-by: Zhang Yi <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>

Makes sense,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/iomap/buffered-io.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 093c4515b22a..7e32a204650b 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -786,7 +786,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
>
> out_unlock:
> __iomap_put_folio(iter, pos, 0, folio);
> - iomap_write_failed(iter->inode, pos, len);
>
> return status;
> }
> @@ -863,8 +862,6 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
>
> if (old_size < pos)
> pagecache_isize_extended(iter->inode, old_size, pos);
> - if (ret < len)
> - iomap_write_failed(iter->inode, pos + ret, len - ret);
> return ret;
> }
>
> @@ -912,8 +909,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> }
>
> status = iomap_write_begin(iter, pos, bytes, &folio);
> - if (unlikely(status))
> + if (unlikely(status)) {
> + iomap_write_failed(iter->inode, pos, bytes);
> break;
> + }
> if (iter->iomap.flags & IOMAP_F_STALE)
> break;
>
> @@ -927,6 +926,9 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> status = iomap_write_end(iter, pos, bytes, copied, folio);
>
> + if (status < bytes)
> + iomap_write_failed(iter->inode, pos + status,
> + bytes - status);
> if (unlikely(copied != status))
> iov_iter_revert(i, copied - status);
>
> --
> 2.39.2
>
>

2024-03-19 21:04:22

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v3 6/9] iomap: don't increase i_size if it's not a write operation

On Tue, Mar 19, 2024 at 09:10:59AM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> Increase i_size in iomap_zero_range() and iomap_unshare_iter() is not
> needed, the caller should handle it. Especially, when truncate partial
> block, we should not increase i_size beyond the new EOF here. It doesn't
> affect xfs and gfs2 now because they set the new file size after zero
> out, it doesn't matter that a transient increase in i_size, but it will
> affect ext4 because it set file size before truncate. So move the i_size
> updating logic to iomap_write_iter().
>
> Signed-off-by: Zhang Yi <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>

Thanks for applying the comment update,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/iomap/buffered-io.c | 50 +++++++++++++++++++++---------------------
> 1 file changed, 25 insertions(+), 25 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 7e32a204650b..e9112dc78d15 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -837,32 +837,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
> size_t copied, struct folio *folio)
> {
> const struct iomap *srcmap = iomap_iter_srcmap(iter);
> - loff_t old_size = iter->inode->i_size;
> - size_t ret;
> -
> - if (srcmap->type == IOMAP_INLINE) {
> - ret = iomap_write_end_inline(iter, folio, pos, copied);
> - } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> - ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
> - copied, &folio->page, NULL);
> - } else {
> - ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> - }
> -
> - /*
> - * Update the in-memory inode size after copying the data into the page
> - * cache. It's up to the file system to write the updated size to disk,
> - * preferably after I/O completion so that no stale data is exposed.
> - */
> - if (pos + ret > old_size) {
> - i_size_write(iter->inode, pos + ret);
> - iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> - }
> - __iomap_put_folio(iter, pos, ret, folio);
>
> - if (old_size < pos)
> - pagecache_isize_extended(iter->inode, old_size, pos);
> - return ret;
> + if (srcmap->type == IOMAP_INLINE)
> + return iomap_write_end_inline(iter, folio, pos, copied);
> + if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
> + return block_write_end(NULL, iter->inode->i_mapping, pos, len,
> + copied, &folio->page, NULL);
> + return __iomap_write_end(iter->inode, pos, len, copied, folio);
> }
>
> static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> @@ -877,6 +858,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>
> do {
> struct folio *folio;
> + loff_t old_size;
> size_t offset; /* Offset into folio */
> size_t bytes; /* Bytes to write to folio */
> size_t copied; /* Bytes copied from user */
> @@ -926,6 +908,22 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> status = iomap_write_end(iter, pos, bytes, copied, folio);
>
> + /*
> + * Update the in-memory inode size after copying the data into
> + * the page cache. It's up to the file system to write the
> + * updated size to disk, preferably after I/O completion so that
> + * no stale data is exposed. Only once that's done can we
> + * unlock and release the folio.
> + */
> + old_size = iter->inode->i_size;
> + if (pos + status > old_size) {
> + i_size_write(iter->inode, pos + status);
> + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> + }
> + __iomap_put_folio(iter, pos, status, folio);
> +
> + if (old_size < pos)
> + pagecache_isize_extended(iter->inode, old_size, pos);
> if (status < bytes)
> iomap_write_failed(iter->inode, pos + status,
> bytes - status);
> @@ -1298,6 +1296,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
> bytes = folio_size(folio) - offset;
>
> bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
> + __iomap_put_folio(iter, pos, bytes, folio);
> if (WARN_ON_ONCE(bytes == 0))
> return -EIO;
>
> @@ -1362,6 +1361,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
> folio_mark_accessed(folio);
>
> bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
> + __iomap_put_folio(iter, pos, bytes, folio);
> if (WARN_ON_ONCE(bytes == 0))
> return -EIO;
>
> --
> 2.39.2
>
>

2024-03-19 21:05:13

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v3 7/9] iomap: use a new variable to handle the written bytes in iomap_write_iter()

On Tue, Mar 19, 2024 at 09:11:00AM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> In iomap_write_iter(), the status variable used to receive the return
> value from iomap_write_end() is confusing, replace it with a new written
> variable to represent the written bytes in each cycle, no logic changes.
>
> Signed-off-by: Zhang Yi <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>

Looks good,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/iomap/buffered-io.c | 33 +++++++++++++++++----------------
> 1 file changed, 17 insertions(+), 16 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index e9112dc78d15..291648c61a32 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -851,7 +851,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> loff_t length = iomap_length(iter);
> size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
> loff_t pos = iter->pos;
> - ssize_t written = 0;
> + ssize_t total_written = 0;
> long status = 0;
> struct address_space *mapping = iter->inode->i_mapping;
> unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
> @@ -862,6 +862,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> size_t offset; /* Offset into folio */
> size_t bytes; /* Bytes to write to folio */
> size_t copied; /* Bytes copied from user */
> + size_t written; /* Bytes have been written */
>
> bytes = iov_iter_count(i);
> retry:
> @@ -906,7 +907,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> flush_dcache_folio(folio);
>
> copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> - status = iomap_write_end(iter, pos, bytes, copied, folio);
> + written = iomap_write_end(iter, pos, bytes, copied, folio);
>
> /*
> * Update the in-memory inode size after copying the data into
> @@ -916,22 +917,22 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> * unlock and release the folio.
> */
> old_size = iter->inode->i_size;
> - if (pos + status > old_size) {
> - i_size_write(iter->inode, pos + status);
> + if (pos + written > old_size) {
> + i_size_write(iter->inode, pos + written);
> iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> }
> - __iomap_put_folio(iter, pos, status, folio);
> + __iomap_put_folio(iter, pos, written, folio);
>
> if (old_size < pos)
> pagecache_isize_extended(iter->inode, old_size, pos);
> - if (status < bytes)
> - iomap_write_failed(iter->inode, pos + status,
> - bytes - status);
> - if (unlikely(copied != status))
> - iov_iter_revert(i, copied - status);
> + if (written < bytes)
> + iomap_write_failed(iter->inode, pos + written,
> + bytes - written);
> + if (unlikely(copied != written))
> + iov_iter_revert(i, copied - written);
>
> cond_resched();
> - if (unlikely(status == 0)) {
> + if (unlikely(written == 0)) {
> /*
> * A short copy made iomap_write_end() reject the
> * thing entirely. Might be memory poisoning
> @@ -945,17 +946,17 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> goto retry;
> }
> } else {
> - pos += status;
> - written += status;
> - length -= status;
> + pos += written;
> + total_written += written;
> + length -= written;
> }
> } while (iov_iter_count(i) && length);
>
> if (status == -EAGAIN) {
> - iov_iter_revert(i, written);
> + iov_iter_revert(i, total_written);
> return -EAGAIN;
> }
> - return written ? written : status;
> + return total_written ? total_written : status;
> }
>
> ssize_t
> --
> 2.39.2
>
>

2024-03-19 21:08:23

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v3 8/9] iomap: make iomap_write_end() return a boolean

On Tue, Mar 19, 2024 at 09:11:01AM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> For now, we can make sure iomap_write_end() always return 0 or copied
> bytes, so instead of return written bytes, convert to return a boolean
> to indicate the copied bytes have been written to the pagecache.
>
> Signed-off-by: Zhang Yi <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> ---
> fs/iomap/buffered-io.c | 50 +++++++++++++++++++++++++++---------------
> 1 file changed, 32 insertions(+), 18 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 291648c61a32..004673ea8bc1 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -790,7 +790,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
> return status;
> }
>
> -static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
> +static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
> size_t copied, struct folio *folio)
> {
> flush_dcache_folio(folio);
> @@ -807,14 +807,14 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
> * redo the whole thing.
> */
> if (unlikely(copied < len && !folio_test_uptodate(folio)))
> - return 0;
> + return false;
> iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
> iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
> filemap_dirty_folio(inode->i_mapping, folio);
> - return copied;
> + return true;
> }
>
> -static size_t iomap_write_end_inline(const struct iomap_iter *iter,
> +static void iomap_write_end_inline(const struct iomap_iter *iter,
> struct folio *folio, loff_t pos, size_t copied)
> {
> const struct iomap *iomap = &iter->iomap;
> @@ -829,21 +829,32 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter,
> kunmap_local(addr);
>
> mark_inode_dirty(iter->inode);
> - return copied;
> }
>
> -/* Returns the number of bytes copied. May be 0. Cannot be an errno. */
> -static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
> +/*
> + * Returns true if all copied bytes have been written to the pagecache,
> + * otherwise return false.
> + */
> +static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
> size_t copied, struct folio *folio)
> {
> const struct iomap *srcmap = iomap_iter_srcmap(iter);
> + bool ret = true;
>
> - if (srcmap->type == IOMAP_INLINE)
> - return iomap_write_end_inline(iter, folio, pos, copied);
> - if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
> - return block_write_end(NULL, iter->inode->i_mapping, pos, len,
> - copied, &folio->page, NULL);
> - return __iomap_write_end(iter->inode, pos, len, copied, folio);
> + if (srcmap->type == IOMAP_INLINE) {
> + iomap_write_end_inline(iter, folio, pos, copied);
> + } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> + size_t bh_written;
> +
> + bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
> + len, copied, &folio->page, NULL);
> + WARN_ON_ONCE(bh_written != copied && bh_written != 0);
> + ret = bh_written == copied;
> + } else {
> + ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> + }
> +
> + return ret;

This could be cleaned up even further:

if (srcmap->type == IOMAP_INLINE) {
iomap_write_end_inline(iter, folio, pos, copied);
return true;
}

if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
size_t bh_written;

bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
len, copied, &folio->page, NULL);
WARN_ON_ONCE(bh_written != copied && bh_written != 0);
return bh_written == copied;
}

return __iomap_write_end(iter->inode, pos, len, copied, folio);

> }
>
> static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> @@ -907,7 +918,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> flush_dcache_folio(folio);
>
> copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> - written = iomap_write_end(iter, pos, bytes, copied, folio);
> + written = iomap_write_end(iter, pos, bytes, copied, folio) ?
> + copied : 0;
>
> /*
> * Update the in-memory inode size after copying the data into
> @@ -1285,6 +1297,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
> int status;
> size_t offset;
> size_t bytes = min_t(u64, SIZE_MAX, length);
> + bool ret;
>
> status = iomap_write_begin(iter, pos, bytes, &folio);
> if (unlikely(status))
> @@ -1296,9 +1309,9 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
> if (bytes > folio_size(folio) - offset)
> bytes = folio_size(folio) - offset;
>
> - bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
> + ret = iomap_write_end(iter, pos, bytes, bytes, folio);
> __iomap_put_folio(iter, pos, bytes, folio);
> - if (WARN_ON_ONCE(bytes == 0))
> + if (WARN_ON_ONCE(!ret))

If you named this variable "write_end_ok" then the diagnostic output
from the WARN_ONs would say that. That said, it also encodes the line
number so it's not a big deal to leave this as it is.

With at least the first cleanup applied,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> return -EIO;
>
> cond_resched();
> @@ -1347,6 +1360,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
> int status;
> size_t offset;
> size_t bytes = min_t(u64, SIZE_MAX, length);
> + bool ret;
>
> status = iomap_write_begin(iter, pos, bytes, &folio);
> if (status)
> @@ -1361,9 +1375,9 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
> folio_zero_range(folio, offset, bytes);
> folio_mark_accessed(folio);
>
> - bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
> + ret = iomap_write_end(iter, pos, bytes, bytes, folio);
> __iomap_put_folio(iter, pos, bytes, folio);
> - if (WARN_ON_ONCE(bytes == 0))
> + if (WARN_ON_ONCE(!ret))
> return -EIO;
>
> pos += bytes;
> --
> 2.39.2
>
>

2024-03-19 21:09:01

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH v3 9/9] iomap: do some small logical cleanup in buffered write

On Tue, Mar 19, 2024 at 09:11:02AM +0800, Zhang Yi wrote:
> From: Zhang Yi <[email protected]>
>
> Since iomap_write_end() can never return a partial write length, the
> comperation between written, copied and bytes becomes useless, just

comparison

> merge them with the unwritten branch.
>
> Signed-off-by: Zhang Yi <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>

With the spelling error fixed,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/iomap/buffered-io.c | 8 +++-----
> 1 file changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 004673ea8bc1..f2fb89056259 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -937,11 +937,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>
> if (old_size < pos)
> pagecache_isize_extended(iter->inode, old_size, pos);
> - if (written < bytes)
> - iomap_write_failed(iter->inode, pos + written,
> - bytes - written);
> - if (unlikely(copied != written))
> - iov_iter_revert(i, copied - written);
>
> cond_resched();
> if (unlikely(written == 0)) {
> @@ -951,6 +946,9 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> * halfway through, might be a race with munmap,
> * might be severe memory pressure.
> */
> + iomap_write_failed(iter->inode, pos, bytes);
> + iov_iter_revert(i, copied);
> +
> if (chunk > PAGE_SIZE)
> chunk /= 2;
> if (copied) {
> --
> 2.39.2
>
>

2024-03-20 00:11:29

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3 3/9] xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset

> > -xfs_bmapi_convert_delalloc(
> > +static int
>
> static inline?

I'd just leave that to the compiler, no need to second guess all the
decisions.

> Double underscore prefixes read to me like "do this without grabbing
> a lock or a resource", not just one step in a loop.
>
> Would you mind changing it to xfs_bmapi_convert_one_delalloc() ?
> Then the callsite looks like:

Fine with me.


2024-03-20 01:51:53

by Zhang Yi

[permalink] [raw]
Subject: Re: [PATCH v3 3/9] xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset

On 2024/3/20 4:45, Darrick J. Wong wrote:
> On Tue, Mar 19, 2024 at 09:10:56AM +0800, Zhang Yi wrote:
>> From: Zhang Yi <[email protected]>
>>
>> Since xfs_bmapi_convert_delalloc() only attempts to allocate the entire
>> delalloc extent and require multiple invocations to allocate the target
>> offset. So xfs_convert_blocks() add a loop to do this job and we call it
>> in the write back path, but xfs_convert_blocks() isn't a common helper.
>> Let's do it in xfs_bmapi_convert_delalloc() and drop
>> xfs_convert_blocks(), preparing for the post EOF delalloc blocks
>> converting in the buffered write begin path.
>>
>> Signed-off-by: Zhang Yi <[email protected]>
>> Reviewed-by: Christoph Hellwig <[email protected]>
>> ---
>> fs/xfs/libxfs/xfs_bmap.c | 34 +++++++++++++++++++++++--
>> fs/xfs/xfs_aops.c | 54 +++++++++++-----------------------------
>> 2 files changed, 46 insertions(+), 42 deletions(-)
>>
>> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
>> index 07dc35de8ce5..042e8d3ab0ba 100644
>> --- a/fs/xfs/libxfs/xfs_bmap.c
>> +++ b/fs/xfs/libxfs/xfs_bmap.c
>> @@ -4516,8 +4516,8 @@ xfs_bmapi_write(
>> * invocations to allocate the target offset if a large enough physical extent
>> * is not available.
>> */
>> -int
>> -xfs_bmapi_convert_delalloc(
>> +static int
>
> static inline?
>

I'd suggest to leave that to the compiler too.

>> +__xfs_bmapi_convert_delalloc(
>
> Double underscore prefixes read to me like "do this without grabbing
> a lock or a resource", not just one step in a loop.
>
> Would you mind changing it to xfs_bmapi_convert_one_delalloc() ?
> Then the callsite looks like:
>
> xfs_bmapi_convert_delalloc(...)
> {
> ...
> do {
> error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset,
> iomap, seq);
> if (error)
> return error;
> } while (iomap->offset + iomap->length <= offset);
> }
>

Thanks for your suggestions, all subsequent improvements in this series are
also looks fine by me, I will revise them in my next iteration. Christoph,
I will keep your review tag, please let me know if you have different
opinion.

Thanks,
Yi.


2024-03-20 04:22:45

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3 3/9] xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset

On Wed, Mar 20, 2024 at 09:51:26AM +0800, Zhang Yi wrote:
> Thanks for your suggestions, all subsequent improvements in this series are
> also looks fine by me, I will revise them in my next iteration. Christoph,
> I will keep your review tag, please let me know if you have different
> opinion.

Yes, please keep them. All changes so far remain pretty trivial.


2024-03-19 01:37:09

by Zhang Yi

[permalink] [raw]
Subject: [PATCH v3 1/9] xfs: match lock mode in xfs_buffered_write_iomap_begin()

From: Zhang Yi <[email protected]>

Commit 1aa91d9c9933 ("xfs: Add async buffered write support") replace
xfs_ilock(XFS_ILOCK_EXCL) with xfs_ilock_for_iomap() when locking the
writing inode, and a new variable lockmode is used to indicate the lock
mode. Although the lockmode should always be XFS_ILOCK_EXCL, it's still
better to use this variable instead of useing XFS_ILOCK_EXCL directly
when unlocking the inode.

Fixes: 1aa91d9c9933 ("xfs: Add async buffered write support")
Signed-off-by: Zhang Yi <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_iomap.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18c8f168b153..ccf83e72d8ca 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1149,13 +1149,13 @@ xfs_buffered_write_iomap_begin(
* them out if the write happens to fail.
*/
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);

found_imap:
seq = xfs_iomap_inode_sequence(ip, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);

found_cow:
@@ -1165,17 +1165,17 @@ xfs_buffered_write_iomap_begin(
if (error)
goto out_unlock;
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
IOMAP_F_SHARED, seq);
}

xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);

out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
}

--
2.39.2