Hello,
Please find v2 of the left over folio conversion changes.
It would be nice if you could also review Patch-2 and Patch-4.
v1 -> v2
=========
1. Added Patch-2 which removes PAGE_SIZE assumption from mpage_submit_folio.
(IMO, this was a missed left over from previous conversion).
2. Addressed a small review comment from Matthew in the tracepoint patch to take
(inode, folio).
3. Added Reviewed-by from Matthew.
Testing
========
I haven't found any new failures in my fstests testing with "-g auto" with
default config.
Ritesh Harjani (IBM) (5):
ext4: kill unused function ext4_journalled_write_inline_data
ext4: Remove PAGE_SIZE assumption of folio from mpage_submit_folio
ext4: Change remaining tracepoints to use folio
ext4: Make mpage_journal_page_buffers use folio
ext4: Make ext4_write_inline_data_end() use folio
fs/ext4/ext4.h | 10 ++-----
fs/ext4/inline.c | 27 +----------------
fs/ext4/inode.c | 60 ++++++++++++++++++++-----------------
include/trace/events/ext4.h | 26 ++++++++--------
4 files changed, 48 insertions(+), 75 deletions(-)
--
2.40.1
Commit 3f079114bf522 ("ext4: Convert data=journal writeback to use ext4_writepages()")
Added support for writeback of journalled data into ext4_writepages()
and killed function __ext4_journalled_writepage() which used to call
ext4_journalled_write_inline_data() for inline data.
This function got left over by mistake. Hence kill it's definition as
no one uses it.
Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
---
fs/ext4/ext4.h | 4 ----
fs/ext4/inline.c | 24 ------------------------
2 files changed, 28 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6948d673bba2..f9a58251ccea 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3481,10 +3481,6 @@ extern int ext4_write_inline_data_end(struct inode *inode,
loff_t pos, unsigned len,
unsigned copied,
struct page *page);
-extern struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 5854bd5a3352..c0b2dc6514b2 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -823,30 +823,6 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
return ret ? ret : copied;
}
-struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page)
-{
- int ret, no_expand;
- void *kaddr;
- struct ext4_iloc iloc;
-
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret) {
- ext4_std_error(inode->i_sb, ret);
- return NULL;
- }
-
- ext4_write_lock_xattr(inode, &no_expand);
- kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
- kunmap_atomic(kaddr);
- ext4_write_unlock_xattr(inode, &no_expand);
-
- return iloc.bh;
-}
-
/*
* Try to make the page cache and handle ready for the inline data case.
* We can call this function in 2 cases:
--
2.40.1
mpage_submit_folio() was converted to take folio. Even though
folio_size() in ext4 as of now is PAGE_SIZE, but it's better to
remove that assumption which I am assuming is a missed left over from
patch[1].
[1]: https://lore.kernel.org/linux-ext4/[email protected]/
Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
---
fs/ext4/inode.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ce5f21b6c2b3..b9fa7c30a9a5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1885,7 +1885,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
len = folio_size(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(mpd->inode))
- len = size & ~PAGE_MASK;
+ len = size - folio_pos(folio);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
mpd->wbc->nr_to_write--;
--
2.40.1
This patch converts mpage_journal_page_buffers() to use folio and also
removes the PAGE_SIZE assumption.
Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
---
fs/ext4/inode.c | 28 +++++++++++++---------------
1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 43b54aac1c65..bd827af19b55 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2321,11 +2321,11 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
-static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
- int len)
+static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
+ size_t len)
{
- struct buffer_head *page_bufs = page_buffers(page);
- struct inode *inode = page->mapping->host;
+ struct buffer_head *page_bufs = folio_buffers(folio);
+ struct inode *inode = folio->mapping->host;
int ret, err;
ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
@@ -2334,7 +2334,7 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
NULL, write_end_fn);
if (ret == 0)
ret = err;
- err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+ err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -2344,22 +2344,20 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
static int mpage_journal_page_buffers(handle_t *handle,
struct mpage_da_data *mpd,
- struct page *page)
+ struct folio *folio)
{
struct inode *inode = mpd->inode;
loff_t size = i_size_read(inode);
- int len;
+ size_t len = folio_size(folio);
- ClearPageChecked(page);
+ folio_clear_checked(folio);
mpd->wbc->nr_to_write--;
- if (page->index == size >> PAGE_SHIFT &&
+ if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
- len = size & ~PAGE_MASK;
- else
- len = PAGE_SIZE;
+ len = size - folio_pos(folio);
- return ext4_journal_page_buffers(handle, page, len);
+ return ext4_journal_folio_buffers(handle, folio, len);
}
/*
@@ -2499,7 +2497,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
/* Pending dirtying of journalled data? */
if (folio_test_checked(folio)) {
err = mpage_journal_page_buffers(handle,
- mpd, &folio->page);
+ mpd, folio);
if (err < 0)
goto out;
mpd->journalled_more_data = 1;
@@ -6133,7 +6131,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
- if (ext4_journal_page_buffers(handle, &folio->page, len))
+ if (ext4_journal_folio_buffers(handle, folio, len))
goto out_error;
} else {
folio_unlock(folio);
--
2.40.1
ext4_write_inline_data_end() is completely converted to work with folio.
Also all callers of ext4_write_inline_data_end() already works on folio
except ext4_da_write_end(). Mostly for consistency and saving few
instructions maybe, this patch just converts ext4_da_write_end() to work
with folio which makes the last caller of ext4_write_inline_data_end()
also converted to work with folio.
We then make ext4_write_inline_data_end() take folio instead of page.
Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
---
fs/ext4/ext4.h | 6 ++----
fs/ext4/inline.c | 3 +--
fs/ext4/inode.c | 23 ++++++++++++++---------
3 files changed, 17 insertions(+), 15 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f9a58251ccea..19e3a880ea16 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3477,10 +3477,8 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
struct page **pagep);
-extern int ext4_write_inline_data_end(struct inode *inode,
- loff_t pos, unsigned len,
- unsigned copied,
- struct page *page);
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct folio *folio);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c0b2dc6514b2..f48183f91c87 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -741,9 +741,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page)
+ unsigned copied, struct folio *folio)
{
- struct folio *folio = page_folio(page);
handle_t *handle = ext4_journal_current_handle();
int no_expand;
void *kaddr;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bd827af19b55..3988e0be1270 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1287,7 +1287,8 @@ static int ext4_write_end(struct file *file,
if (ext4_has_inline_data(inode) &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
/*
@@ -1395,7 +1396,8 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
if (ext4_has_inline_data(inode))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
copied = 0;
@@ -2942,15 +2944,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
* Check if we should update i_disksize
* when write to the end of file but not require block allocation
*/
-static int ext4_da_should_update_i_disksize(struct page *page,
+static int ext4_da_should_update_i_disksize(struct folio *folio,
unsigned long offset)
{
struct buffer_head *bh;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned int idx;
int i;
- bh = page_buffers(page);
+ bh = folio_buffers(folio);
idx = offset >> inode->i_blkbits;
for (i = 0; i < idx; i++)
@@ -2970,17 +2972,19 @@ static int ext4_da_write_end(struct file *file,
loff_t new_i_size;
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
+ struct folio *folio = page_folio(page);
if (write_mode == FALL_BACK_TO_NONDELALLOC)
return ext4_write_end(file, mapping, pos,
- len, copied, page, fsdata);
+ len, copied, &folio->page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
if (write_mode != CONVERT_INLINE_DATA &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
ext4_has_inline_data(inode))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
if (unlikely(copied < len) && !PageUptodate(page))
copied = 0;
@@ -3004,10 +3008,11 @@ static int ext4_da_write_end(struct file *file,
*/
new_i_size = pos + copied;
if (copied && new_i_size > inode->i_size &&
- ext4_da_should_update_i_disksize(page, end))
+ ext4_da_should_update_i_disksize(folio, end))
ext4_update_i_disksize(inode, new_i_size);
- return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ return generic_write_end(file, mapping, pos, len, copied, &folio->page,
+ fsdata);
}
/*
--
2.40.1
ext4_readpage() is converted to ext4_read_folio() hence change the
related tracepoint from trace_ext4_readpage(page) to
trace_ext4_read_folio(folio). Do the same for
trace_ext4_releasepage(page) to trace_ext4_release_folio(folio)
As a minor bit of optimization to avoid an extra dereferencing,
since both of the above functions already were dereferencing
folio->mapping->host, hence change the tracepoint argument to take
(inode, folio).
Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
---
fs/ext4/inode.c | 7 ++++---
include/trace/events/ext4.h | 26 +++++++++++++-------------
2 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b9fa7c30a9a5..43b54aac1c65 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3105,7 +3105,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio)
int ret = -EAGAIN;
struct inode *inode = folio->mapping->host;
- trace_ext4_readpage(&folio->page);
+ trace_ext4_read_folio(inode, folio);
if (ext4_has_inline_data(inode))
ret = ext4_readpage_inline(inode, folio);
@@ -3164,9 +3164,10 @@ static void ext4_journalled_invalidate_folio(struct folio *folio,
static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
- journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
+ struct inode *inode = folio->mapping->host;
+ journal_t *journal = EXT4_JOURNAL(inode);
- trace_ext4_releasepage(&folio->page);
+ trace_ext4_release_folio(inode, folio);
/* Page has dirty journalled data -> cannot release */
if (folio_test_checked(folio))
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index ebccf6a6aa1b..54cd509ced0f 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -560,10 +560,10 @@ TRACE_EVENT(ext4_writepages_result,
(unsigned long) __entry->writeback_index)
);
-DECLARE_EVENT_CLASS(ext4__page_op,
- TP_PROTO(struct page *page),
+DECLARE_EVENT_CLASS(ext4__folio_op,
+ TP_PROTO(struct inode *inode, struct folio *folio),
- TP_ARGS(page),
+ TP_ARGS(inode, folio),
TP_STRUCT__entry(
__field( dev_t, dev )
@@ -573,29 +573,29 @@ DECLARE_EVENT_CLASS(ext4__page_op,
),
TP_fast_assign(
- __entry->dev = page->mapping->host->i_sb->s_dev;
- __entry->ino = page->mapping->host->i_ino;
- __entry->index = page->index;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->index = folio->index;
),
- TP_printk("dev %d,%d ino %lu page_index %lu",
+ TP_printk("dev %d,%d ino %lu folio_index %lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
(unsigned long) __entry->index)
);
-DEFINE_EVENT(ext4__page_op, ext4_readpage,
+DEFINE_EVENT(ext4__folio_op, ext4_read_folio,
- TP_PROTO(struct page *page),
+ TP_PROTO(struct inode *inode, struct folio *folio),
- TP_ARGS(page)
+ TP_ARGS(inode, folio)
);
-DEFINE_EVENT(ext4__page_op, ext4_releasepage,
+DEFINE_EVENT(ext4__folio_op, ext4_release_folio,
- TP_PROTO(struct page *page),
+ TP_PROTO(struct inode *inode, struct folio *folio),
- TP_ARGS(page)
+ TP_ARGS(inode, folio)
);
DECLARE_EVENT_CLASS(ext4_invalidate_folio_op,
--
2.40.1
On Mon, May 15, 2023 at 04:10:41PM +0530, Ritesh Harjani (IBM) wrote:
> mpage_submit_folio() was converted to take folio. Even though
> folio_size() in ext4 as of now is PAGE_SIZE, but it's better to
> remove that assumption which I am assuming is a missed left over from
> patch[1].
>
> [1]: https://lore.kernel.org/linux-ext4/[email protected]/
>
> Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
On Mon, May 15, 2023 at 04:10:43PM +0530, Ritesh Harjani (IBM) wrote:
> This patch converts mpage_journal_page_buffers() to use folio and also
> removes the PAGE_SIZE assumption.
>
> Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
Now that fsverity supports working on entire folios, call
fsverity_verify_folio() instead of fsverity_verify_page()
Reported-by: Eric Biggers <[email protected]>
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/ext4/readpage.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 6f46823fba61..3e7d160f543f 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -334,7 +334,7 @@ int ext4_mpage_readpages(struct inode *inode,
folio_size(folio));
if (first_hole == 0) {
if (ext4_need_verity(inode, folio->index) &&
- !fsverity_verify_page(&folio->page))
+ !fsverity_verify_folio(folio))
goto set_error_page;
folio_mark_uptodate(folio);
folio_unlock(folio);
--
2.39.2
"Matthew Wilcox (Oracle)" <[email protected]> writes:
> Now that fsverity supports working on entire folios, call
> fsverity_verify_folio() instead of fsverity_verify_page()
Thanks for catching it.
>
> Reported-by: Eric Biggers <[email protected]>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> ---
> fs/ext4/readpage.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
I agree that we could use fsverity_verify_folio() instead of
fsverity_verify_page() here.
Reviewed-by: Ritesh Harjani (IBM) <[email protected]>
>
> diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
> index 6f46823fba61..3e7d160f543f 100644
> --- a/fs/ext4/readpage.c
> +++ b/fs/ext4/readpage.c
> @@ -334,7 +334,7 @@ int ext4_mpage_readpages(struct inode *inode,
> folio_size(folio));
> if (first_hole == 0) {
> if (ext4_need_verity(inode, folio->index) &&
> - !fsverity_verify_page(&folio->page))
> + !fsverity_verify_folio(folio))
> goto set_error_page;
> folio_mark_uptodate(folio);
> folio_unlock(folio);
> --
> 2.39.2
On Tue, May 16, 2023 at 08:27:13PM +0100, Matthew Wilcox (Oracle) wrote:
> Now that fsverity supports working on entire folios, call
> fsverity_verify_folio() instead of fsverity_verify_page()
>
> Reported-by: Eric Biggers <[email protected]>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> ---
> fs/ext4/readpage.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
> index 6f46823fba61..3e7d160f543f 100644
> --- a/fs/ext4/readpage.c
> +++ b/fs/ext4/readpage.c
> @@ -334,7 +334,7 @@ int ext4_mpage_readpages(struct inode *inode,
> folio_size(folio));
> if (first_hole == 0) {
> if (ext4_need_verity(inode, folio->index) &&
> - !fsverity_verify_page(&folio->page))
> + !fsverity_verify_folio(folio))
> goto set_error_page;
> folio_mark_uptodate(folio);
> folio_unlock(folio);
> --
Reviewed-by: Eric Biggers <[email protected]>
(Though I must mention that doing weird things like "PATCH 6/5" makes life hard
for scripts that operate on patch series...)
- Eric
On Mon, 15 May 2023 16:10:39 +0530, Ritesh Harjani (IBM) wrote:
> Please find v2 of the left over folio conversion changes.
> It would be nice if you could also review Patch-2 and Patch-4.
>
> v1 -> v2
> =========
> 1. Added Patch-2 which removes PAGE_SIZE assumption from mpage_submit_folio.
> (IMO, this was a missed left over from previous conversion).
> 2. Addressed a small review comment from Matthew in the tracepoint patch to take
> (inode, folio).
> 3. Added Reviewed-by from Matthew.
>
> [...]
Applied, thanks!
[1/5] ext4: kill unused function ext4_journalled_write_inline_data
commit: 40fa8be3852ff7a7b3708c6a69c9eacf98da6612
[2/5] ext4: Remove PAGE_SIZE assumption of folio from mpage_submit_folio
commit: 732572e95427fe9f68197c60074b750242b84944
[3/5] ext4: Change remaining tracepoints to use folio
commit: 263619455efcd0deb1f8c24f2895f48751a1d679
[4/5] ext4: Make mpage_journal_page_buffers use folio
commit: 13004fbc85f58b05cfeb72372b418b6aa1caddc5
[5/5] ext4: Make ext4_write_inline_data_end() use folio
commit: 04a8b77b66c5bea74b60cc2f5230a851b4ade096
[6/6] ext4: Call fsverity_verify_folio()
commit: 60469ab9340529a607574a7b6ab6483217607a75
Best regards,
--
Theodore Ts'o <[email protected]>
On Mon, May 15, 2023 at 04:10:41PM +0530, Ritesh Harjani (IBM) wrote:
> mpage_submit_folio() was converted to take folio. Even though
> folio_size() in ext4 as of now is PAGE_SIZE, but it's better to
> remove that assumption which I am assuming is a missed left over from
> patch[1].
>
> [1]: https://lore.kernel.org/linux-ext4/[email protected]/
>
> Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
I didn't notice this right away, because the failure is not 100%
reliable, but this commit will sometimes cause "kvm-xfstests -c
ext4/encrypt generic/068" to crash. Reverting the patch fixes the
problem, so I plan to drop this patch from my tree.
- Ted
generic/068 42s ... [01:56:09][ 7.014363] run fstests generic/068 at 2023-06-11 01:56:09
[ 7.538841] EXT4-fs (vdc): Test dummy encryption mode enabled
[ 11.407307] traps: PANIC: double fault, error_code: 0x0
[ 11.407313] double fault: 0000 [#1] PREEMPT SMP NOPTI
[ 11.407315] CPU: 1 PID: 3358 Comm: fsstress Not tainted 6.4.0-rc5-xfstests-lockdep-00069-gfc362247e79f #169
[ 11.407316] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 11.407317] RIP: 0010:__switch_to_asm+0x33/0x80
[ 11.407322] Code: 55 41 56 41 57 48 89 a7 d8 17 00 00 48 8b a6 d8 17 00 00 48 8b 9e 40 04 00 00 65 48 89 1c 25 28 00 00 00 49 c7 c4 10 00 00 00 <e8> 01 00 00 00 cc e8 01 00 00 00 cc 48 83 c4 10 49 ff cc 75 eb 0f
[ 11.407323] RSP: 0018:ffffc90003ec7e18 EFLAGS: 00010046
[ 11.407324] RAX: 0000000000000001 RBX: 961d22f2e2e05800 RCX: 00000002afbf75a9
[ 11.407325] RDX: 0000000000000003 RSI: ffff88800d174080 RDI: ffff88800d0ae200
[ 11.407325] RBP: ffffc90003fd7af0 R08: 0000000000000001 R09: 0000000000000001
[ 11.407326] R10: 00000000000003cc R11: 0000000000000001 R12: 0000000000000010
[ 11.407326] R13: ffffe8ffffc29c50 R14: ffff88807ddee998 R15: ffff88800d174080
[ 11.407327] FS: 00007f144aee4740(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000
[ 11.407329] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 11.407330] CR2: ffffc90003ec7e08 CR3: 000000000cb3e004 CR4: 0000000000770ee0
[ 11.407330] PKRU: 55555554
[ 11.407330] Call Trace:
[ 11.407331] <#DF>
[ 11.407332] ? die+0x36/0x80
[ 11.407334] ? exc_double_fault+0xf1/0x1b0
[ 11.407336] ? asm_exc_double_fault+0x23/0x30
[ 11.407338] ? __switch_to_asm+0x33/0x80
[ 11.407339] </#DF>
[ 11.413852] ---[ end trace 0000000000000000 ]---
[ 11.413853] RIP: 0010:__switch_to_asm+0x33/0x80
[ 11.413856] Code: 55 41 56 41 57 48 89 a7 d8 17 00 00 48 8b a6 d8 17 00 00 48 8b 9e 40 04 00 00 65 48 89 1c 25 28 00 00 00 49 c7 c4 10 00 00 00 <e8> 01 00 00 00 cc e8 01 00 00 00 cc 48 83 c4 10 49 ff cc 75 eb 0f
[ 11.413857] RSP: 0018:ffffc90003ec7e18 EFLAGS: 00010046
[ 11.413857] RAX: 0000000000000001 RBX: 961d22f2e2e05800 RCX: 00000002afbf75a9
[ 11.413858] RDX: 0000000000000003 RSI: ffff88800d174080 RDI: ffff88800d0ae200
[ 11.413858] RBP: ffffc90003fd7af0 R08: 0000000000000001 R09: 0000000000000001
[ 11.413859] R10: 00000000000003cc R11: 0000000000000001 R12: 0000000000000010
[ 11.413859] R13: ffffe8ffffc29c50 R14: ffff88807ddee998 R15: ffff88800d174080
[ 11.413860] FS: 00007f144aee4740(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000
[ 11.413861] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 11.413862] CR2: ffffc90003ec7e08 CR3: 000000000cb3e004 CR4: 0000000000770ee0
[ 11.413863] PKRU: 55555554
[ 11.413863] Kernel panic - not syncing: Fatal exception in interrupt
[ 11.413889] BUG: unable to handle page fault for address: ffffc90003ebfe88
[ 11.414112] #PF: supervisor read access in kernel mode
[ 11.414320] #PF: error_code(0x0009) - reserved bit violation
[ 11.415151] PGD 5000067 P4D 5000067 PUD 5219067 PMD d278067 PTE 1e914974aa550b07
[ 11.417015] Oops: 0009 [#2] PREEMPT SMP NOPTI
[ 11.417375] CPU: 0 PID: 29 Comm: kworker/u4:2 Tainted: G D 6.4.0-rc5-xfstests-lockdep-00069-gfc362247e79f #169
[ 11.417641] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 11.417962] Workqueue: writeback wb_workfn (flush-254:32)
[ 11.418683] RIP: 0010:timerqueue_add+0x28/0xb0
[ 11.418916] Code: 90 90 66 0f 1f 00 55 53 48 3b 36 48 89 f3 0f 85 96 00 00 00 48 8b 07 48 85 c0 74 55 48 8b 73 18 bd 01 00 00 00 eb 03 48 89 d0 <48> 3b 70 18 48 8d 48 10 7c 06 48 8d 48 08 31 ed 48 8b 11 48 85 d2
[ 11.419173] RSP: 0018:ffffc90000003f00 EFLAGS: 00010082
[ 11.419710] RAX: ffffc90003ebfe70 RBX: ffff88807dbe0210 RCX: ffff88800d07a3e8
[ 11.420219] RDX: ffffc90003ebfe70 RSI: 00000002a849a0e0 RDI: ffff88807dbdfb58
[ 11.420634] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000
[ 11.420877] R10: 0000000000000000 R11: 0000000000000659 R12: ffff88807dbdfa40
[ 11.421082] R13: 0000000000000002 R14: ffff888005d3c180 R15: ffff88807dbdfb00
[ 11.421924] FS: 0000000000000000(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
[ 11.422165] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 11.422487] CR2: ffffc90003ebfe88 CR3: 000000000c81c001 CR4: 0000000000770ef0
[ 11.422810] PKRU: 55555554
[ 11.423133] Call Trace:
[ 11.423451] <IRQ>
[ 11.423778] ? __die+0x23/0x60
[ 11.424139] ? page_fault_oops+0xa4/0x170
[ 11.424399] ? exc_page_fault+0xfa/0x1e0
[ 11.424741] ? asm_exc_page_fault+0x26/0x30
[ 11.424884] ? timerqueue_add+0x28/0xb0
[ 11.425001] enqueue_hrtimer+0x42/0xa0
[ 11.425097] __hrtimer_run_queues+0x304/0x380
[ 11.425241] hrtimer_interrupt+0xf8/0x230
[ 11.425426] __sysvec_apic_timer_interrupt+0x75/0x190
[ 11.425605] sysvec_apic_timer_interrupt+0x65/0x80
[ 11.425794] </IRQ>
[ 11.425966] <TASK>
[ 11.426139] asm_sysvec_apic_timer_interrupt+0x1a/0x20
[ 11.426344] RIP: 0010:aesni_xts_encrypt+0x2d/0x1d0
[ 11.426529] Code: 00 66 0f 6f 3d 24 ff 93 01 41 0f 10 18 44 8b 8f e0 01 00 00 48 83 e9 40 0f 8c f3 00 00 00 66 0f 6f c3 f3 0f 6f 0a 66 0f ef c1 <f3> 0f 7f 1e 66 0f 70 d3 13 66 0f d4 db 66 0f 72 e2 1f 66 0f db d7
[ 11.426757] RSP: 0018:ffffc9000052f558 EFLAGS: 00010206
[ 11.427074] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000fc0
[ 11.427172] RDX: ffff88801281a000 RSI: ffff88800d7c3000 RDI: ffff88800d180220
[ 11.427406] RBP: ffffc9000052f720 R08: ffffc9000052f780 R09: 0000000000000020
[ 11.427624] R10: ffff88800d1800a0 R11: 0000000000000018 R12: ffffc9000052f580
[ 11.428468] R13: ffff88800d180220 R14: 0000000000001000 R15: 0000000000000001
[ 11.428705] ? aesni_enc+0x13/0x20
[ 11.429027] xts_crypt+0x10f/0x340
[ 11.429349] ? lock_release+0x65/0x100
[ 11.429667] ? do_raw_spin_unlock+0x4e/0xa0
[ 11.429987] ? _raw_spin_unlock+0x23/0x40
[ 11.430312] ? lock_is_held_type+0x9d/0x110
[ 11.430471] fscrypt_crypt_block+0x268/0x320
[ 11.430627] ? mempool_alloc+0x94/0x1e0
[ 11.430803] fscrypt_encrypt_pagecache_blocks+0xde/0x150
[ 11.430991] ext4_bio_write_folio+0x371/0x500
[ 11.431172] mpage_submit_folio+0x6f/0x90
[ 11.431363] mpage_map_and_submit_buffers+0xc5/0x180
[ 11.431558] mpage_map_and_submit_extent+0x55/0x300
[ 11.431739] ext4_do_writepages+0x70d/0x810
[ 11.431981] ext4_writepages+0xf1/0x290
[ 11.432182] do_writepages+0xd2/0x1e0
[ 11.432366] ? __lock_release.isra.0+0x15e/0x2a0
[ 11.432595] __writeback_single_inode+0x54/0x300
[ 11.432817] ? do_raw_spin_unlock+0x4e/0xa0
[ 11.433006] writeback_sb_inodes+0x1fc/0x500
[ 11.433183] wb_writeback+0xf2/0x370
[ 11.433352] wb_do_writeback+0x9e/0x2e0
[ 11.433560] ? set_worker_desc+0xc7/0xd0
[ 11.433772] wb_workfn+0x6a/0x2b0
[ 11.433964] ? __lock_release.isra.0+0x15e/0x2a0
[ 11.434157] ? process_one_work+0x21b/0x540
[ 11.434322] process_one_work+0x286/0x540
[ 11.434500] worker_thread+0x53/0x3c0
[ 11.434678] ? __pfx_worker_thread+0x10/0x10
[ 11.434831] kthread+0xf2/0x130
[ 11.435042] ? __pfx_kthread+0x10/0x10
[ 11.435233] ret_from_fork+0x29/0x50
[ 11.435417] </TASK>
[ 11.435584] CR2: ffffc90003ebfe88
[ 11.435931] ---[ end trace 0000000000000000 ]---
[ 11.436101] RIP: 0010:__switch_to_asm+0x33/0x80
[ 11.436265] Code: 55 41 56 41 57 48 89 a7 d8 17 00 00 48 8b a6 d8 17 00 00 48 8b 9e 40 04 00 00 65 48 89 1c 25 28 00 00 00 49 c7 c4 10 00 00 00 <e8> 01 00 00 00 cc e8 01 00 00 00 cc 48 83 c4 10 49 ff cc 75 eb 0f
[ 11.436367] RSP: 0018:ffffc90003ec7e18 EFLAGS: 00010046
[ 11.436727] RAX: 0000000000000001 RBX: 961d22f2e2e05800 RCX: 00000002afbf75a9
[ 11.436938] RDX: 0000000000000003 RSI: ffff88800d174080 RDI: ffff88800d0ae200
[ 11.437766] RBP: ffffc90003fd7af0 R08: 0000000000000001 R09: 0000000000000001
[ 11.438000] R10: 00000000000003cc R11: 0000000000000001 R12: 0000000000000010
[ 11.438322] R13: ffffe8ffffc29c50 R14: ffff88807ddee998 R15: ffff88800d174080
[ 11.438641] FS: 0000000000000000(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
[ 11.438967] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 11.439285] CR2: ffffc90003ebfe88 CR3: 000000000c81c001 CR4: 0000000000770ef0
[ 11.439604] PKRU: 55555554
[ 12.433529] Shutting down cpus with NMI
[ 12.433728] Kernel Offset: disabled
QEMU: Terminated
On Sun, Jun 11, 2023 at 01:58:31AM -0400, Theodore Ts'o wrote:
> On Mon, May 15, 2023 at 04:10:41PM +0530, Ritesh Harjani (IBM) wrote:
> > mpage_submit_folio() was converted to take folio. Even though
> > folio_size() in ext4 as of now is PAGE_SIZE, but it's better to
> > remove that assumption which I am assuming is a missed left over from
> > patch[1].
> >
> > [1]: https://lore.kernel.org/linux-ext4/[email protected]/
> >
> > Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
>
> I didn't notice this right away, because the failure is not 100%
> reliable, but this commit will sometimes cause "kvm-xfstests -c
> ext4/encrypt generic/068" to crash. Reverting the patch fixes the
> problem, so I plan to drop this patch from my tree.
Hrm. Well, let's think about how this can go wrong:
@@ -1885,7 +1885,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd,
+struct folio *folio)
len = folio_size(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(mpd->inode))
- len = size & ~PAGE_MASK;
+ len = size - folio_pos(folio);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
mpd->wbc->nr_to_write--;
Just off-camera is:
size = i_size_read(mpd->inode);
Now, nothing is preventing i_size to be truncated to far below this
folio, right? So if that happened before this patch, we'd write some
randomly sized fragment of the page. Now we'll get a negative result
... which is assigned to size_t, so is exabytes in size.
So do we care if we write a random fragment of a page after a truncate?
If so, we should add:
if (folio_pos(folio) >= size)
return 0; /* Do we need to account nr_to_write? */
If we simply don't care that we're doing a spurious write, then we can
do something like:
- len = size & ~PAGE_MASK;
+ len = size & (len - 1);
Please ignore the previous email.
"Theodore Ts'o" <[email protected]> writes:
> On Mon, May 15, 2023 at 04:10:41PM +0530, Ritesh Harjani (IBM) wrote:
>> mpage_submit_folio() was converted to take folio. Even though
>> folio_size() in ext4 as of now is PAGE_SIZE, but it's better to
>> remove that assumption which I am assuming is a missed left over from
>> patch[1].
>>
>> [1]: https://lore.kernel.org/linux-ext4/[email protected]/
>>
>> Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
>
> I didn't notice this right away, because the failure is not 100%
> reliable, but this commit will sometimes cause "kvm-xfstests -c
> ext4/encrypt generic/068" to crash. Reverting the patch fixes the
> problem, so I plan to drop this patch from my tree.
>
Sorry about the crash. I am now able to reproduce the problem on my
setup as well. I will debug this and will update once I have some more info.
From the initial look, it looks like the problem might be occurring when
folio_pos(folio) itself is > i_size_read(inode).
If that is indeed the case, then I think even doing this with folio
conversion (below code after folio conversion) looks incorrect for case
when size is not PAGE_SIZE aligned.
However, I will spend some more time debugging this.
static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
{
size_t len;
loff_t size;
int err;
BUG_ON(folio->index != mpd->first_page);
folio_clear_dirty_for_io(folio);
/*
* We have to be very careful here! Nothing protects writeback path
* against i_size changes and the page can be writeably mapped into
* page tables. So an application can be growing i_size and writing
* data through mmap while writeback runs. folio_clear_dirty_for_io()
* write-protects our page in page tables and the page cannot get
* written to again until we release folio lock. So only after
* folio_clear_dirty_for_io() we are safe to sample i_size for
* ext4_bio_write_page() to zero-out tail of the written page. We rely
* on the barrier provided by TestClearPageDirty in
* folio_clear_dirty_for_io() to make sure i_size is really sampled only
* after page tables are updated.
*/
size = i_size_read(mpd->inode);
len = folio_size(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(mpd->inode))
len = size & ~PAGE_MASK;
err = ext4_bio_write_page(&mpd->io_submit, &folio->page, len);
if (!err)
mpd->wbc->nr_to_write--;
return err;
}
> - Ted
>
> generic/068 42s ... [01:56:09][ 7.014363] run fstests generic/068 at 2023-06-11 01:56:09
> [ 7.538841] EXT4-fs (vdc): Test dummy encryption mode enabled
> [ 11.407307] traps: PANIC: double fault, error_code: 0x0
> [ 11.407313] double fault: 0000 [#1] PREEMPT SMP NOPTI
> [ 11.407315] CPU: 1 PID: 3358 Comm: fsstress Not tainted 6.4.0-rc5-xfstests-lockdep-00069-gfc362247e79f #169
> [ 11.407316] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> [ 11.407317] RIP: 0010:__switch_to_asm+0x33/0x80
> [ 11.407322] Code: 55 41 56 41 57 48 89 a7 d8 17 00 00 48 8b a6 d8 17 00 00 48 8b 9e 40 04 00 00 65 48 89 1c 25 28 00 00 00 49 c7 c4 10 00 00 00 <e8> 01 00 00 00 cc e8 01 00 00 00 cc 48 83 c4 10 49 ff cc 75 eb 0f
> [ 11.407323] RSP: 0018:ffffc90003ec7e18 EFLAGS: 00010046
> [ 11.407324] RAX: 0000000000000001 RBX: 961d22f2e2e05800 RCX: 00000002afbf75a9
> [ 11.407325] RDX: 0000000000000003 RSI: ffff88800d174080 RDI: ffff88800d0ae200
> [ 11.407325] RBP: ffffc90003fd7af0 R08: 0000000000000001 R09: 0000000000000001
> [ 11.407326] R10: 00000000000003cc R11: 0000000000000001 R12: 0000000000000010
> [ 11.407326] R13: ffffe8ffffc29c50 R14: ffff88807ddee998 R15: ffff88800d174080
> [ 11.407327] FS: 00007f144aee4740(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000
> [ 11.407329] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 11.407330] CR2: ffffc90003ec7e08 CR3: 000000000cb3e004 CR4: 0000000000770ee0
> [ 11.407330] PKRU: 55555554
> [ 11.407330] Call Trace:
> [ 11.407331] <#DF>
> [ 11.407332] ? die+0x36/0x80
> [ 11.407334] ? exc_double_fault+0xf1/0x1b0
> [ 11.407336] ? asm_exc_double_fault+0x23/0x30
> [ 11.407338] ? __switch_to_asm+0x33/0x80
> [ 11.407339] </#DF>
> [ 11.413852] ---[ end trace 0000000000000000 ]---
> [ 11.413853] RIP: 0010:__switch_to_asm+0x33/0x80
> [ 11.413856] Code: 55 41 56 41 57 48 89 a7 d8 17 00 00 48 8b a6 d8 17 00 00 48 8b 9e 40 04 00 00 65 48 89 1c 25 28 00 00 00 49 c7 c4 10 00 00 00 <e8> 01 00 00 00 cc e8 01 00 00 00 cc 48 83 c4 10 49 ff cc 75 eb 0f
> [ 11.413857] RSP: 0018:ffffc90003ec7e18 EFLAGS: 00010046
> [ 11.413857] RAX: 0000000000000001 RBX: 961d22f2e2e05800 RCX: 00000002afbf75a9
> [ 11.413858] RDX: 0000000000000003 RSI: ffff88800d174080 RDI: ffff88800d0ae200
> [ 11.413858] RBP: ffffc90003fd7af0 R08: 0000000000000001 R09: 0000000000000001
> [ 11.413859] R10: 00000000000003cc R11: 0000000000000001 R12: 0000000000000010
> [ 11.413859] R13: ffffe8ffffc29c50 R14: ffff88807ddee998 R15: ffff88800d174080
> [ 11.413860] FS: 00007f144aee4740(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000
> [ 11.413861] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 11.413862] CR2: ffffc90003ec7e08 CR3: 000000000cb3e004 CR4: 0000000000770ee0
> [ 11.413863] PKRU: 55555554
> [ 11.413863] Kernel panic - not syncing: Fatal exception in interrupt
> [ 11.413889] BUG: unable to handle page fault for address: ffffc90003ebfe88
> [ 11.414112] #PF: supervisor read access in kernel mode
> [ 11.414320] #PF: error_code(0x0009) - reserved bit violation
> [ 11.415151] PGD 5000067 P4D 5000067 PUD 5219067 PMD d278067 PTE 1e914974aa550b07
> [ 11.417015] Oops: 0009 [#2] PREEMPT SMP NOPTI
> [ 11.417375] CPU: 0 PID: 29 Comm: kworker/u4:2 Tainted: G D 6.4.0-rc5-xfstests-lockdep-00069-gfc362247e79f #169
> [ 11.417641] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> [ 11.417962] Workqueue: writeback wb_workfn (flush-254:32)
> [ 11.418683] RIP: 0010:timerqueue_add+0x28/0xb0
> [ 11.418916] Code: 90 90 66 0f 1f 00 55 53 48 3b 36 48 89 f3 0f 85 96 00 00 00 48 8b 07 48 85 c0 74 55 48 8b 73 18 bd 01 00 00 00 eb 03 48 89 d0 <48> 3b 70 18 48 8d 48 10 7c 06 48 8d 48 08 31 ed 48 8b 11 48 85 d2
> [ 11.419173] RSP: 0018:ffffc90000003f00 EFLAGS: 00010082
> [ 11.419710] RAX: ffffc90003ebfe70 RBX: ffff88807dbe0210 RCX: ffff88800d07a3e8
> [ 11.420219] RDX: ffffc90003ebfe70 RSI: 00000002a849a0e0 RDI: ffff88807dbdfb58
> [ 11.420634] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000
> [ 11.420877] R10: 0000000000000000 R11: 0000000000000659 R12: ffff88807dbdfa40
> [ 11.421082] R13: 0000000000000002 R14: ffff888005d3c180 R15: ffff88807dbdfb00
> [ 11.421924] FS: 0000000000000000(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
> [ 11.422165] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 11.422487] CR2: ffffc90003ebfe88 CR3: 000000000c81c001 CR4: 0000000000770ef0
> [ 11.422810] PKRU: 55555554
> [ 11.423133] Call Trace:
> [ 11.423451] <IRQ>
> [ 11.423778] ? __die+0x23/0x60
> [ 11.424139] ? page_fault_oops+0xa4/0x170
> [ 11.424399] ? exc_page_fault+0xfa/0x1e0
> [ 11.424741] ? asm_exc_page_fault+0x26/0x30
> [ 11.424884] ? timerqueue_add+0x28/0xb0
> [ 11.425001] enqueue_hrtimer+0x42/0xa0
> [ 11.425097] __hrtimer_run_queues+0x304/0x380
> [ 11.425241] hrtimer_interrupt+0xf8/0x230
> [ 11.425426] __sysvec_apic_timer_interrupt+0x75/0x190
> [ 11.425605] sysvec_apic_timer_interrupt+0x65/0x80
> [ 11.425794] </IRQ>
> [ 11.425966] <TASK>
> [ 11.426139] asm_sysvec_apic_timer_interrupt+0x1a/0x20
> [ 11.426344] RIP: 0010:aesni_xts_encrypt+0x2d/0x1d0
> [ 11.426529] Code: 00 66 0f 6f 3d 24 ff 93 01 41 0f 10 18 44 8b 8f e0 01 00 00 48 83 e9 40 0f 8c f3 00 00 00 66 0f 6f c3 f3 0f 6f 0a 66 0f ef c1 <f3> 0f 7f 1e 66 0f 70 d3 13 66 0f d4 db 66 0f 72 e2 1f 66 0f db d7
> [ 11.426757] RSP: 0018:ffffc9000052f558 EFLAGS: 00010206
> [ 11.427074] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000fc0
> [ 11.427172] RDX: ffff88801281a000 RSI: ffff88800d7c3000 RDI: ffff88800d180220
> [ 11.427406] RBP: ffffc9000052f720 R08: ffffc9000052f780 R09: 0000000000000020
> [ 11.427624] R10: ffff88800d1800a0 R11: 0000000000000018 R12: ffffc9000052f580
> [ 11.428468] R13: ffff88800d180220 R14: 0000000000001000 R15: 0000000000000001
> [ 11.428705] ? aesni_enc+0x13/0x20
> [ 11.429027] xts_crypt+0x10f/0x340
> [ 11.429349] ? lock_release+0x65/0x100
> [ 11.429667] ? do_raw_spin_unlock+0x4e/0xa0
> [ 11.429987] ? _raw_spin_unlock+0x23/0x40
> [ 11.430312] ? lock_is_held_type+0x9d/0x110
> [ 11.430471] fscrypt_crypt_block+0x268/0x320
> [ 11.430627] ? mempool_alloc+0x94/0x1e0
> [ 11.430803] fscrypt_encrypt_pagecache_blocks+0xde/0x150
> [ 11.430991] ext4_bio_write_folio+0x371/0x500
> [ 11.431172] mpage_submit_folio+0x6f/0x90
> [ 11.431363] mpage_map_and_submit_buffers+0xc5/0x180
> [ 11.431558] mpage_map_and_submit_extent+0x55/0x300
> [ 11.431739] ext4_do_writepages+0x70d/0x810
> [ 11.431981] ext4_writepages+0xf1/0x290
> [ 11.432182] do_writepages+0xd2/0x1e0
> [ 11.432366] ? __lock_release.isra.0+0x15e/0x2a0
> [ 11.432595] __writeback_single_inode+0x54/0x300
> [ 11.432817] ? do_raw_spin_unlock+0x4e/0xa0
> [ 11.433006] writeback_sb_inodes+0x1fc/0x500
> [ 11.433183] wb_writeback+0xf2/0x370
> [ 11.433352] wb_do_writeback+0x9e/0x2e0
> [ 11.433560] ? set_worker_desc+0xc7/0xd0
> [ 11.433772] wb_workfn+0x6a/0x2b0
> [ 11.433964] ? __lock_release.isra.0+0x15e/0x2a0
> [ 11.434157] ? process_one_work+0x21b/0x540
> [ 11.434322] process_one_work+0x286/0x540
> [ 11.434500] worker_thread+0x53/0x3c0
> [ 11.434678] ? __pfx_worker_thread+0x10/0x10
> [ 11.434831] kthread+0xf2/0x130
> [ 11.435042] ? __pfx_kthread+0x10/0x10
> [ 11.435233] ret_from_fork+0x29/0x50
> [ 11.435417] </TASK>
> [ 11.435584] CR2: ffffc90003ebfe88
> [ 11.435931] ---[ end trace 0000000000000000 ]---
> [ 11.436101] RIP: 0010:__switch_to_asm+0x33/0x80
> [ 11.436265] Code: 55 41 56 41 57 48 89 a7 d8 17 00 00 48 8b a6 d8 17 00 00 48 8b 9e 40 04 00 00 65 48 89 1c 25 28 00 00 00 49 c7 c4 10 00 00 00 <e8> 01 00 00 00 cc e8 01 00 00 00 cc 48 83 c4 10 49 ff cc 75 eb 0f
> [ 11.436367] RSP: 0018:ffffc90003ec7e18 EFLAGS: 00010046
> [ 11.436727] RAX: 0000000000000001 RBX: 961d22f2e2e05800 RCX: 00000002afbf75a9
> [ 11.436938] RDX: 0000000000000003 RSI: ffff88800d174080 RDI: ffff88800d0ae200
> [ 11.437766] RBP: ffffc90003fd7af0 R08: 0000000000000001 R09: 0000000000000001
> [ 11.438000] R10: 00000000000003cc R11: 0000000000000001 R12: 0000000000000010
> [ 11.438322] R13: ffffe8ffffc29c50 R14: ffff88807ddee998 R15: ffff88800d174080
> [ 11.438641] FS: 0000000000000000(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
> [ 11.438967] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 11.439285] CR2: ffffc90003ebfe88 CR3: 000000000c81c001 CR4: 0000000000770ef0
> [ 11.439604] PKRU: 55555554
> [ 12.433529] Shutting down cpus with NMI
> [ 12.433728] Kernel Offset: disabled
> QEMU: Terminated
Thanks for letting me know. I will look more into this.
-ritesh
Ritesh Harjani (IBM) <[email protected]> writes:
> Please ignore the previous email.
>
> "Theodore Ts'o" <[email protected]> writes:
>
>> On Mon, May 15, 2023 at 04:10:41PM +0530, Ritesh Harjani (IBM) wrote:
>>> mpage_submit_folio() was converted to take folio. Even though
>>> folio_size() in ext4 as of now is PAGE_SIZE, but it's better to
>>> remove that assumption which I am assuming is a missed left over from
>>> patch[1].
>>>
>>> [1]: https://lore.kernel.org/linux-ext4/[email protected]/
>>>
>>> Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
>>
>> I didn't notice this right away, because the failure is not 100%
>> reliable, but this commit will sometimes cause "kvm-xfstests -c
>> ext4/encrypt generic/068" to crash. Reverting the patch fixes the
>> problem, so I plan to drop this patch from my tree.
>>
>
> Sorry about the crash. I am now able to reproduce the problem on my
> setup as well. I will debug this and will update once I have some more info.
>
> From the initial look, it looks like the problem might be occurring when
> folio_pos(folio) itself is > i_size_read(inode).
>
> If that is indeed the case, then I think even doing this with folio
> conversion (below code after folio conversion) looks incorrect for case
> when size is not PAGE_SIZE aligned.
>
> However, I will spend some more time debugging this.
I am still looking into this. I would like to make sure I go through
all the paths where i_size can be modified.
- buffered-IO
- writeback
- direct-IO
- page fault
- truncate
- fallocate (punch/collapse)
- evict (not relevant though)
It is easily recreatable if we have one thread doing buffered-io +
sync and other thread trying to truncate down inode->i_size.
Kernel panic maybe is happening only with -O encrypt mkfs option +
-o test_dummy_encryption mount option, but the size - folio_pos(folio)
is definitely wrong because inode->i_size is not protected in writeback path.
More on this later...
-ritesh
On Mon, Jun 12, 2023 at 10:55:37PM +0530, Ritesh Harjani wrote:
> It is easily recreatable if we have one thread doing buffered-io +
> sync and other thread trying to truncate down inode->i_size.
> Kernel panic maybe is happening only with -O encrypt mkfs option +
> -o test_dummy_encryption mount option, but the size - folio_pos(folio)
> is definitely wrong because inode->i_size is not protected in writeback path.
Did you not see the email I sent right before you sent your previous
email?
Matthew Wilcox <[email protected]> writes:
> On Mon, Jun 12, 2023 at 10:55:37PM +0530, Ritesh Harjani wrote:
>> It is easily recreatable if we have one thread doing buffered-io +
>> sync and other thread trying to truncate down inode->i_size.
>> Kernel panic maybe is happening only with -O encrypt mkfs option +
>> -o test_dummy_encryption mount option, but the size - folio_pos(folio)
>> is definitely wrong because inode->i_size is not protected in writeback path.
>
> Did you not see the email I sent right before you sent your previous
> email?
Aah yes, Matthew. I had seen that email yesterday after I sent my email.
Sorry I forgot to acknowdledge it today and thanks for pointing things
out.
I couldn't respond to your change because I still had some confusion
around this suggestion -
> So do we care if we write a random fragment of a page after a truncate?
> If so, we should add:
>
> if (folio_pos(folio) >= size)
> return 0; /* Do we need to account nr_to_write? */
I was not sure whether if go with above case then whether it will
work with collapse_range. I initially thought that collapse_range will
truncate the pages between start and end of the file and then
it can also reduce the inode->i_size. That means writeback can find an
inode->i_size smaller than folio_pos(folio) which it is writing to.
But in this case we can't skip the write in writeback case like above
because that write is still required (a spurious write) even though
i_size is reduced as it's corresponding FS blocks are not truncated.
But just now looking at ext4_collapse_range() code it doesn't look like
it is the problem because it waits for any dirty data to be written
before truncate. So no matter which folio_pos(folio) the writeback is
writing, there should not be an issue if we simply return 0 like how
you suggested above.
static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
<...>
ioffset = round_down(offset, PAGE_SIZE);
/*
* Write tail of the last page before removed range since it will get
* removed from the page cache below.
*/
ret = filemap_write_and_wait_range(mapping, ioffset, offset);
if (ret)
goto out_mmap;
/*
* Write data that will be shifted to preserve them when discarding
* page cache below. We are also protected from pages becoming dirty
* by i_rwsem and invalidate_lock.
*/
ret = filemap_write_and_wait_range(mapping, offset + len,
LLONG_MAX);
truncate_pagecache(inode, ioffset);
<... within i_data_sem>
i_size_write(inode, new_size);
<...>
However to avoid problems like this I felt, I will do some more code
reading. And then I was mostly considering your second suggestion which
is this. This will ensure we keep the current behavior as is and not
change that.
> If we simply don't care that we're doing a spurious write, then we can
> do something like:
>
> - len = size & ~PAGE_MASK;
> + len = size & (len - 1);
-ritesh
On Mon, Jun 12, 2023 at 11:55:55PM +0530, Ritesh Harjani wrote:
> Matthew Wilcox <[email protected]> writes:
> I couldn't respond to your change because I still had some confusion
> around this suggestion -
>
> > So do we care if we write a random fragment of a page after a truncate?
> > If so, we should add:
> >
> > if (folio_pos(folio) >= size)
> > return 0; /* Do we need to account nr_to_write? */
>
> I was not sure whether if go with above case then whether it will
> work with collapse_range. I initially thought that collapse_range will
> truncate the pages between start and end of the file and then
> it can also reduce the inode->i_size. That means writeback can find an
> inode->i_size smaller than folio_pos(folio) which it is writing to.
> But in this case we can't skip the write in writeback case like above
> because that write is still required (a spurious write) even though
> i_size is reduced as it's corresponding FS blocks are not truncated.
>
> But just now looking at ext4_collapse_range() code it doesn't look like
> it is the problem because it waits for any dirty data to be written
> before truncate. So no matter which folio_pos(folio) the writeback is
> writing, there should not be an issue if we simply return 0 like how
> you suggested above.
>
> static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
>
> <...>
> ioffset = round_down(offset, PAGE_SIZE);
> /*
> * Write tail of the last page before removed range since it will get
> * removed from the page cache below.
> */
>
> ret = filemap_write_and_wait_range(mapping, ioffset, offset);
> if (ret)
> goto out_mmap;
> /*
> * Write data that will be shifted to preserve them when discarding
> * page cache below. We are also protected from pages becoming dirty
> * by i_rwsem and invalidate_lock.
> */
> ret = filemap_write_and_wait_range(mapping, offset + len,
> LLONG_MAX);
> truncate_pagecache(inode, ioffset);
>
> <... within i_data_sem>
> i_size_write(inode, new_size);
>
> <...>
>
>
> However to avoid problems like this I felt, I will do some more code
> reading. And then I was mostly considering your second suggestion which
> is this. This will ensure we keep the current behavior as is and not
> change that.
>
> > If we simply don't care that we're doing a spurious write, then we can
> > do something like:
> >
> > - len = size & ~PAGE_MASK;
> > + len = size & (len - 1);
For all I know, I've found a bug here. I don't know enough about ext4; if
we have truncated a file, and then writeback a page that is past i_size,
will the block its writing to have been freed? Is this potentially a
silent data corruptor?
I am hoping Jan and Ted could correct me if any of my understanding
is incorrect. But here is my view...
Matthew Wilcox <[email protected]> writes:
> On Mon, Jun 12, 2023 at 11:55:55PM +0530, Ritesh Harjani wrote:
>> Matthew Wilcox <[email protected]> writes:
>> I couldn't respond to your change because I still had some confusion
>> around this suggestion -
>>
>> > So do we care if we write a random fragment of a page after a truncate?
>> > If so, we should add:
>> >
>> > if (folio_pos(folio) >= size)
>> > return 0; /* Do we need to account nr_to_write? */
>>
>> I was not sure whether if go with above case then whether it will
>> work with collapse_range. I initially thought that collapse_range will
>> truncate the pages between start and end of the file and then
>> it can also reduce the inode->i_size. That means writeback can find an
>> inode->i_size smaller than folio_pos(folio) which it is writing to.
>> But in this case we can't skip the write in writeback case like above
>> because that write is still required (a spurious write) even though
>> i_size is reduced as it's corresponding FS blocks are not truncated.
>>
>> But just now looking at ext4_collapse_range() code it doesn't look like
>> it is the problem because it waits for any dirty data to be written
>> before truncate. So no matter which folio_pos(folio) the writeback is
>> writing, there should not be an issue if we simply return 0 like how
>> you suggested above.
>>
>> static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
>>
>> <...>
>> ioffset = round_down(offset, PAGE_SIZE);
>> /*
>> * Write tail of the last page before removed range since it will get
>> * removed from the page cache below.
>> */
>>
>> ret = filemap_write_and_wait_range(mapping, ioffset, offset);
>> if (ret)
>> goto out_mmap;
>> /*
>> * Write data that will be shifted to preserve them when discarding
>> * page cache below. We are also protected from pages becoming dirty
>> * by i_rwsem and invalidate_lock.
>> */
>> ret = filemap_write_and_wait_range(mapping, offset + len,
>> LLONG_MAX);
>> truncate_pagecache(inode, ioffset);
>>
>> <... within i_data_sem>
>> i_size_write(inode, new_size);
>>
>> <...>
>>
>>
>> However to avoid problems like this I felt, I will do some more code
>> reading. And then I was mostly considering your second suggestion which
>> is this. This will ensure we keep the current behavior as is and not
>> change that.
>>
>> > If we simply don't care that we're doing a spurious write, then we can
>> > do something like:
>> >
>> > - len = size & ~PAGE_MASK;
>> > + len = size & (len - 1);
>
> For all I know, I've found a bug here. I don't know enough about ext4; if
> we have truncated a file, and then writeback a page that is past i_size,
> will the block its writing to have been freed?
I don't think so. If we look at truncate code, it first reduces i_size,
then call truncate_pagecache(inode, newsize) and then we will call
ext4_truncate() which will free the corresponding blocks.
Since writeback happens with folio lock held until completion, hence I
think truncate_pagecache() should block on that folio until it's lock
has been released.
- IIUC, if truncate would have completed then the folio won't be in the
foliocache for writeback to happen. Foliocache is kept consistent
via
- first truncate the folio in the foliocache and then remove/free
the blocks on device.
- Also the reason we update i_size "before" calling truncate_pagecache()
is to synchronize with mmap/pagefault.
> Is this potentially a silent data corruptor?
- Let's consider a case when folio_pos > i_size but both still belongs
to the last block. i.e. it's a straddle write case.
In such case we require writeback to write the data of this last folio
straddling i_size. Because truncate will not remove/free this last folio
straddling i_size & neither the last block will be freed. And I think
writeback is supposed to write this last folio to the disk to keep the
cache and disk data consistent. Because truncate will only zero out
the rest of the folio in the foliocache. But I don't think it will go and
write that folio out (It's not required because i_size means that the
rest of the folio beyond i_size should remain zero).
So, IMO writeback is supposed to write this last folio to the disk. And,
if we skip this writeout, then I think it may cause silent data corruption.
But I am not sure about the rest of the write beyond the last block of
i_size. I think those could just be spurious writes which won't cause
any harm because truncate will eventually first remove this folio from
file mapping and then will release the corresponding disk blocks.
So writing those out should does no harm
-ritesh
On Tue 13-06-23 09:27:38, Ritesh Harjani wrote:
> Matthew Wilcox <[email protected]> writes:
> > On Mon, Jun 12, 2023 at 11:55:55PM +0530, Ritesh Harjani wrote:
> >> Matthew Wilcox <[email protected]> writes:
> >> I couldn't respond to your change because I still had some confusion
> >> around this suggestion -
> >>
> >> > So do we care if we write a random fragment of a page after a truncate?
> >> > If so, we should add:
> >> >
> >> > if (folio_pos(folio) >= size)
> >> > return 0; /* Do we need to account nr_to_write? */
> >>
> >> I was not sure whether if go with above case then whether it will
> >> work with collapse_range. I initially thought that collapse_range will
> >> truncate the pages between start and end of the file and then
> >> it can also reduce the inode->i_size. That means writeback can find an
> >> inode->i_size smaller than folio_pos(folio) which it is writing to.
> >> But in this case we can't skip the write in writeback case like above
> >> because that write is still required (a spurious write) even though
> >> i_size is reduced as it's corresponding FS blocks are not truncated.
> >>
> >> But just now looking at ext4_collapse_range() code it doesn't look like
> >> it is the problem because it waits for any dirty data to be written
> >> before truncate. So no matter which folio_pos(folio) the writeback is
> >> writing, there should not be an issue if we simply return 0 like how
> >> you suggested above.
> >>
> >> static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> >>
> >> <...>
> >> ioffset = round_down(offset, PAGE_SIZE);
> >> /*
> >> * Write tail of the last page before removed range since it will get
> >> * removed from the page cache below.
> >> */
> >>
> >> ret = filemap_write_and_wait_range(mapping, ioffset, offset);
> >> if (ret)
> >> goto out_mmap;
> >> /*
> >> * Write data that will be shifted to preserve them when discarding
> >> * page cache below. We are also protected from pages becoming dirty
> >> * by i_rwsem and invalidate_lock.
> >> */
> >> ret = filemap_write_and_wait_range(mapping, offset + len,
> >> LLONG_MAX);
> >> truncate_pagecache(inode, ioffset);
> >>
> >> <... within i_data_sem>
> >> i_size_write(inode, new_size);
> >>
> >> <...>
> >>
> >>
> >> However to avoid problems like this I felt, I will do some more code
> >> reading. And then I was mostly considering your second suggestion which
> >> is this. This will ensure we keep the current behavior as is and not
> >> change that.
> >>
> >> > If we simply don't care that we're doing a spurious write, then we can
> >> > do something like:
> >> >
> >> > - len = size & ~PAGE_MASK;
> >> > + len = size & (len - 1);
> >
> > For all I know, I've found a bug here. I don't know enough about ext4; if
> > we have truncated a file, and then writeback a page that is past i_size,
> > will the block its writing to have been freed?
>
> I don't think so. If we look at truncate code, it first reduces i_size,
> then call truncate_pagecache(inode, newsize) and then we will call
> ext4_truncate() which will free the corresponding blocks.
> Since writeback happens with folio lock held until completion, hence I
> think truncate_pagecache() should block on that folio until it's lock
> has been released.
>
> - IIUC, if truncate would have completed then the folio won't be in the
> foliocache for writeback to happen. Foliocache is kept consistent
> via
> - first truncate the folio in the foliocache and then remove/free
> the blocks on device.
Yes, correct.
> - Also the reason we update i_size "before" calling truncate_pagecache()
> is to synchronize with mmap/pagefault.
Yes, but these days mapping->invalidate_lock works for that instead for
ext4.
> > Is this potentially a silent data corruptor?
>
> - Let's consider a case when folio_pos > i_size but both still belongs
> to the last block. i.e. it's a straddle write case.
> In such case we require writeback to write the data of this last folio
> straddling i_size. Because truncate will not remove/free this last folio
> straddling i_size & neither the last block will be freed. And I think
> writeback is supposed to write this last folio to the disk to keep the
> cache and disk data consistent. Because truncate will only zero out
> the rest of the folio in the foliocache. But I don't think it will go and
> write that folio out (It's not required because i_size means that the
> rest of the folio beyond i_size should remain zero).
>
> So, IMO writeback is supposed to write this last folio to the disk. And,
> if we skip this writeout, then I think it may cause silent data corruption.
>
> But I am not sure about the rest of the write beyond the last block of
> i_size. I think those could just be spurious writes which won't cause
> any harm because truncate will eventually first remove this folio from
> file mapping and then will release the corresponding disk blocks.
> So writing those out should does no harm
Correct. The block straddling i_size must be written out, the blocks fully
beyond new i_size (but below old i_size) may or may not be written out. As
you say these extra racing writes to blocks that will get truncated cause
no harm.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Jan Kara <[email protected]> writes:
> On Tue 13-06-23 09:27:38, Ritesh Harjani wrote:
>> Matthew Wilcox <[email protected]> writes:
>> > On Mon, Jun 12, 2023 at 11:55:55PM +0530, Ritesh Harjani wrote:
>> >> Matthew Wilcox <[email protected]> writes:
>> >> I couldn't respond to your change because I still had some confusion
>> >> around this suggestion -
>> >>
>> >> > So do we care if we write a random fragment of a page after a truncate?
>> >> > If so, we should add:
>> >> >
>> >> > if (folio_pos(folio) >= size)
>> >> > return 0; /* Do we need to account nr_to_write? */
>> >>
>> >> I was not sure whether if go with above case then whether it will
>> >> work with collapse_range. I initially thought that collapse_range will
>> >> truncate the pages between start and end of the file and then
>> >> it can also reduce the inode->i_size. That means writeback can find an
>> >> inode->i_size smaller than folio_pos(folio) which it is writing to.
>> >> But in this case we can't skip the write in writeback case like above
>> >> because that write is still required (a spurious write) even though
>> >> i_size is reduced as it's corresponding FS blocks are not truncated.
>> >>
>> >> But just now looking at ext4_collapse_range() code it doesn't look like
>> >> it is the problem because it waits for any dirty data to be written
>> >> before truncate. So no matter which folio_pos(folio) the writeback is
>> >> writing, there should not be an issue if we simply return 0 like how
>> >> you suggested above.
>> >>
>> >> static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
>> >>
>> >> <...>
>> >> ioffset = round_down(offset, PAGE_SIZE);
>> >> /*
>> >> * Write tail of the last page before removed range since it will get
>> >> * removed from the page cache below.
>> >> */
>> >>
>> >> ret = filemap_write_and_wait_range(mapping, ioffset, offset);
>> >> if (ret)
>> >> goto out_mmap;
>> >> /*
>> >> * Write data that will be shifted to preserve them when discarding
>> >> * page cache below. We are also protected from pages becoming dirty
>> >> * by i_rwsem and invalidate_lock.
>> >> */
>> >> ret = filemap_write_and_wait_range(mapping, offset + len,
>> >> LLONG_MAX);
>> >> truncate_pagecache(inode, ioffset);
>> >>
>> >> <... within i_data_sem>
>> >> i_size_write(inode, new_size);
>> >>
>> >> <...>
>> >>
>> >>
>> >> However to avoid problems like this I felt, I will do some more code
>> >> reading. And then I was mostly considering your second suggestion which
>> >> is this. This will ensure we keep the current behavior as is and not
>> >> change that.
>> >>
>> >> > If we simply don't care that we're doing a spurious write, then we can
>> >> > do something like:
>> >> >
>> >> > - len = size & ~PAGE_MASK;
>> >> > + len = size & (len - 1);
>> >
>> > For all I know, I've found a bug here. I don't know enough about ext4; if
>> > we have truncated a file, and then writeback a page that is past i_size,
>> > will the block its writing to have been freed?
>>
>> I don't think so. If we look at truncate code, it first reduces i_size,
>> then call truncate_pagecache(inode, newsize) and then we will call
>> ext4_truncate() which will free the corresponding blocks.
>> Since writeback happens with folio lock held until completion, hence I
>> think truncate_pagecache() should block on that folio until it's lock
>> has been released.
>>
>> - IIUC, if truncate would have completed then the folio won't be in the
>> foliocache for writeback to happen. Foliocache is kept consistent
>> via
>> - first truncate the folio in the foliocache and then remove/free
>> the blocks on device.
>
> Yes, correct.
>
>> - Also the reason we update i_size "before" calling truncate_pagecache()
>> is to synchronize with mmap/pagefault.
>
> Yes, but these days mapping->invalidate_lock works for that instead for
> ext4.
>
>> > Is this potentially a silent data corruptor?
>>
>> - Let's consider a case when folio_pos > i_size but both still belongs
>> to the last block. i.e. it's a straddle write case.
>> In such case we require writeback to write the data of this last folio
>> straddling i_size. Because truncate will not remove/free this last folio
>> straddling i_size & neither the last block will be freed. And I think
>> writeback is supposed to write this last folio to the disk to keep the
>> cache and disk data consistent. Because truncate will only zero out
>> the rest of the folio in the foliocache. But I don't think it will go and
>> write that folio out (It's not required because i_size means that the
>> rest of the folio beyond i_size should remain zero).
>>
>> So, IMO writeback is supposed to write this last folio to the disk. And,
>> if we skip this writeout, then I think it may cause silent data corruption.
>>
>> But I am not sure about the rest of the write beyond the last block of
>> i_size. I think those could just be spurious writes which won't cause
>> any harm because truncate will eventually first remove this folio from
>> file mapping and then will release the corresponding disk blocks.
>> So writing those out should does no harm
>
> Correct. The block straddling i_size must be written out, the blocks fully
> beyond new i_size (but below old i_size) may or may not be written out. As
> you say these extra racing writes to blocks that will get truncated cause
> no harm.
>
Thanks Jan for confirming. So, I think we should make below change.
(note the code which was doing "size - folio_pos(folio)" in
mpage_submit_folio() is dropped by Ted in the latest tree).
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 43be684dabcb..006eba9be5e6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1859,9 +1859,9 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
*/
size = i_size_read(mpd->inode);
len = folio_size(folio);
- if (folio_pos(folio) + len > size &&
+ if ((folio_pos(folio) >= size || (folio_pos(folio) + len > size)) &&
!ext4_verity_in_progress(mpd->inode))
- len = size & ~PAGE_MASK;
+ len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
mpd->wbc->nr_to_write--;
@@ -2329,9 +2329,9 @@ static int mpage_journal_page_buffers(handle_t *handle,
folio_clear_checked(folio);
mpd->wbc->nr_to_write--;
- if (folio_pos(folio) + len > size &&
+ if ((folio_pos(folio) >= size || (folio_pos(folio) + len > size)) &&
!ext4_verity_in_progress(inode))
- len = size - folio_pos(folio);
+ len = size & (len - 1);
return ext4_journal_folio_buffers(handle, folio, len);
}
I will give it some more thoughts and testing.
-ritesh
On Wed, Jun 14, 2023 at 01:09:59AM +0530, Ritesh Harjani wrote:
> Jan Kara <[email protected]> writes:
>
> > On Tue 13-06-23 09:27:38, Ritesh Harjani wrote:
> >> Matthew Wilcox <[email protected]> writes:
> >> > On Mon, Jun 12, 2023 at 11:55:55PM +0530, Ritesh Harjani wrote:
> >> >> Matthew Wilcox <[email protected]> writes:
> >> >> I couldn't respond to your change because I still had some confusion
> >> >> around this suggestion -
> >> >>
> >> >> > So do we care if we write a random fragment of a page after a truncate?
> >> >> > If so, we should add:
> >> >> >
> >> >> > if (folio_pos(folio) >= size)
> >> >> > return 0; /* Do we need to account nr_to_write? */
> >> >>
> >> >> I was not sure whether if go with above case then whether it will
> >> >> work with collapse_range. I initially thought that collapse_range will
> >> >> truncate the pages between start and end of the file and then
> >> >> it can also reduce the inode->i_size. That means writeback can find an
> >> >> inode->i_size smaller than folio_pos(folio) which it is writing to.
> >> >> But in this case we can't skip the write in writeback case like above
> >> >> because that write is still required (a spurious write) even though
> >> >> i_size is reduced as it's corresponding FS blocks are not truncated.
> >> >>
> >> >> But just now looking at ext4_collapse_range() code it doesn't look like
> >> >> it is the problem because it waits for any dirty data to be written
> >> >> before truncate. So no matter which folio_pos(folio) the writeback is
> >> >> writing, there should not be an issue if we simply return 0 like how
> >> >> you suggested above.
> >> >>
> >> >> static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
> >> >>
> >> >> <...>
> >> >> ioffset = round_down(offset, PAGE_SIZE);
> >> >> /*
> >> >> * Write tail of the last page before removed range since it will get
> >> >> * removed from the page cache below.
> >> >> */
> >> >>
> >> >> ret = filemap_write_and_wait_range(mapping, ioffset, offset);
> >> >> if (ret)
> >> >> goto out_mmap;
> >> >> /*
> >> >> * Write data that will be shifted to preserve them when discarding
> >> >> * page cache below. We are also protected from pages becoming dirty
> >> >> * by i_rwsem and invalidate_lock.
> >> >> */
> >> >> ret = filemap_write_and_wait_range(mapping, offset + len,
> >> >> LLONG_MAX);
> >> >> truncate_pagecache(inode, ioffset);
> >> >>
> >> >> <... within i_data_sem>
> >> >> i_size_write(inode, new_size);
> >> >>
> >> >> <...>
> >> >>
> >> >>
> >> >> However to avoid problems like this I felt, I will do some more code
> >> >> reading. And then I was mostly considering your second suggestion which
> >> >> is this. This will ensure we keep the current behavior as is and not
> >> >> change that.
> >> >>
> >> >> > If we simply don't care that we're doing a spurious write, then we can
> >> >> > do something like:
> >> >> >
> >> >> > - len = size & ~PAGE_MASK;
> >> >> > + len = size & (len - 1);
> >> >
> >> > For all I know, I've found a bug here. I don't know enough about ext4; if
> >> > we have truncated a file, and then writeback a page that is past i_size,
> >> > will the block its writing to have been freed?
> >>
> >> I don't think so. If we look at truncate code, it first reduces i_size,
> >> then call truncate_pagecache(inode, newsize) and then we will call
> >> ext4_truncate() which will free the corresponding blocks.
> >> Since writeback happens with folio lock held until completion, hence I
> >> think truncate_pagecache() should block on that folio until it's lock
> >> has been released.
> >>
> >> - IIUC, if truncate would have completed then the folio won't be in the
> >> foliocache for writeback to happen. Foliocache is kept consistent
> >> via
> >> - first truncate the folio in the foliocache and then remove/free
> >> the blocks on device.
> >
> > Yes, correct.
> >
> >> - Also the reason we update i_size "before" calling truncate_pagecache()
> >> is to synchronize with mmap/pagefault.
> >
> > Yes, but these days mapping->invalidate_lock works for that instead for
> > ext4.
> >
> >> > Is this potentially a silent data corruptor?
> >>
> >> - Let's consider a case when folio_pos > i_size but both still belongs
> >> to the last block. i.e. it's a straddle write case.
> >> In such case we require writeback to write the data of this last folio
> >> straddling i_size. Because truncate will not remove/free this last folio
> >> straddling i_size & neither the last block will be freed. And I think
> >> writeback is supposed to write this last folio to the disk to keep the
> >> cache and disk data consistent. Because truncate will only zero out
> >> the rest of the folio in the foliocache. But I don't think it will go and
> >> write that folio out (It's not required because i_size means that the
> >> rest of the folio beyond i_size should remain zero).
> >>
> >> So, IMO writeback is supposed to write this last folio to the disk. And,
> >> if we skip this writeout, then I think it may cause silent data corruption.
> >>
> >> But I am not sure about the rest of the write beyond the last block of
> >> i_size. I think those could just be spurious writes which won't cause
> >> any harm because truncate will eventually first remove this folio from
> >> file mapping and then will release the corresponding disk blocks.
> >> So writing those out should does no harm
> >
> > Correct. The block straddling i_size must be written out, the blocks fully
> > beyond new i_size (but below old i_size) may or may not be written out. As
> > you say these extra racing writes to blocks that will get truncated cause
> > no harm.
> >
>
> Thanks Jan for confirming. So, I think we should make below change.
> (note the code which was doing "size - folio_pos(folio)" in
> mpage_submit_folio() is dropped by Ted in the latest tree).
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 43be684dabcb..006eba9be5e6 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1859,9 +1859,9 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
> */
> size = i_size_read(mpd->inode);
> len = folio_size(folio);
> - if (folio_pos(folio) + len > size &&
> + if ((folio_pos(folio) >= size || (folio_pos(folio) + len > size)) &&
> !ext4_verity_in_progress(mpd->inode))
> - len = size & ~PAGE_MASK;
> + len = size & (len - 1);
> err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
> if (!err)
> mpd->wbc->nr_to_write--;
> @@ -2329,9 +2329,9 @@ static int mpage_journal_page_buffers(handle_t *handle,
> folio_clear_checked(folio);
> mpd->wbc->nr_to_write--;
>
> - if (folio_pos(folio) + len > size &&
> + if ((folio_pos(folio) >= size || (folio_pos(folio) + len > size)) &&
> !ext4_verity_in_progress(inode))
> - len = size - folio_pos(folio);
> + len = size & (len - 1);
>
> return ext4_journal_folio_buffers(handle, folio, len);
> }
>
>
> I will give it some more thoughts and testing.
Why should ext4 be different from other filesystems which simply do:
if (folio_pos(folio) >= size)
return 0;
Matthew Wilcox <[email protected]> writes:
> On Wed, Jun 14, 2023 at 01:09:59AM +0530, Ritesh Harjani wrote:
>> Jan Kara <[email protected]> writes:
>>
>> > On Tue 13-06-23 09:27:38, Ritesh Harjani wrote:
>> >> Matthew Wilcox <[email protected]> writes:
>> >> > On Mon, Jun 12, 2023 at 11:55:55PM +0530, Ritesh Harjani wrote:
>> >> >> Matthew Wilcox <[email protected]> writes:
>> >> >> I couldn't respond to your change because I still had some confusion
>> >> >> around this suggestion -
>> >> >>
>> >> >> > So do we care if we write a random fragment of a page after a truncate?
>> >> >> > If so, we should add:
>> >> >> >
>> >> >> > if (folio_pos(folio) >= size)
>> >> >> > return 0; /* Do we need to account nr_to_write? */
>> >> >>
>> >> >> I was not sure whether if go with above case then whether it will
>> >> >> work with collapse_range. I initially thought that collapse_range will
>> >> >> truncate the pages between start and end of the file and then
>> >> >> it can also reduce the inode->i_size. That means writeback can find an
>> >> >> inode->i_size smaller than folio_pos(folio) which it is writing to.
>> >> >> But in this case we can't skip the write in writeback case like above
>> >> >> because that write is still required (a spurious write) even though
>> >> >> i_size is reduced as it's corresponding FS blocks are not truncated.
>> >> >>
>> >> >> But just now looking at ext4_collapse_range() code it doesn't look like
>> >> >> it is the problem because it waits for any dirty data to be written
>> >> >> before truncate. So no matter which folio_pos(folio) the writeback is
>> >> >> writing, there should not be an issue if we simply return 0 like how
>> >> >> you suggested above.
>> >> >>
>> >> >> static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
>> >> >>
>> >> >> <...>
>> >> >> ioffset = round_down(offset, PAGE_SIZE);
>> >> >> /*
>> >> >> * Write tail of the last page before removed range since it will get
>> >> >> * removed from the page cache below.
>> >> >> */
>> >> >>
>> >> >> ret = filemap_write_and_wait_range(mapping, ioffset, offset);
>> >> >> if (ret)
>> >> >> goto out_mmap;
>> >> >> /*
>> >> >> * Write data that will be shifted to preserve them when discarding
>> >> >> * page cache below. We are also protected from pages becoming dirty
>> >> >> * by i_rwsem and invalidate_lock.
>> >> >> */
>> >> >> ret = filemap_write_and_wait_range(mapping, offset + len,
>> >> >> LLONG_MAX);
>> >> >> truncate_pagecache(inode, ioffset);
>> >> >>
>> >> >> <... within i_data_sem>
>> >> >> i_size_write(inode, new_size);
>> >> >>
>> >> >> <...>
>> >> >>
>> >> >>
>> >> >> However to avoid problems like this I felt, I will do some more code
>> >> >> reading. And then I was mostly considering your second suggestion which
>> >> >> is this. This will ensure we keep the current behavior as is and not
>> >> >> change that.
>> >> >>
>> >> >> > If we simply don't care that we're doing a spurious write, then we can
>> >> >> > do something like:
>> >> >> >
>> >> >> > - len = size & ~PAGE_MASK;
>> >> >> > + len = size & (len - 1);
>> >> >
>> >> > For all I know, I've found a bug here. I don't know enough about ext4; if
>> >> > we have truncated a file, and then writeback a page that is past i_size,
>> >> > will the block its writing to have been freed?
>> >>
>> >> I don't think so. If we look at truncate code, it first reduces i_size,
>> >> then call truncate_pagecache(inode, newsize) and then we will call
>> >> ext4_truncate() which will free the corresponding blocks.
>> >> Since writeback happens with folio lock held until completion, hence I
>> >> think truncate_pagecache() should block on that folio until it's lock
>> >> has been released.
>> >>
>> >> - IIUC, if truncate would have completed then the folio won't be in the
>> >> foliocache for writeback to happen. Foliocache is kept consistent
>> >> via
>> >> - first truncate the folio in the foliocache and then remove/free
>> >> the blocks on device.
>> >
>> > Yes, correct.
>> >
>> >> - Also the reason we update i_size "before" calling truncate_pagecache()
>> >> is to synchronize with mmap/pagefault.
>> >
>> > Yes, but these days mapping->invalidate_lock works for that instead for
>> > ext4.
>> >
>> >> > Is this potentially a silent data corruptor?
>> >>
>> >> - Let's consider a case when folio_pos > i_size but both still belongs
>> >> to the last block. i.e. it's a straddle write case.
>> >> In such case we require writeback to write the data of this last folio
>> >> straddling i_size. Because truncate will not remove/free this last folio
>> >> straddling i_size & neither the last block will be freed. And I think
>> >> writeback is supposed to write this last folio to the disk to keep the
>> >> cache and disk data consistent. Because truncate will only zero out
>> >> the rest of the folio in the foliocache. But I don't think it will go and
>> >> write that folio out (It's not required because i_size means that the
>> >> rest of the folio beyond i_size should remain zero).
>> >>
>> >> So, IMO writeback is supposed to write this last folio to the disk. And,
>> >> if we skip this writeout, then I think it may cause silent data corruption.
>> >>
>> >> But I am not sure about the rest of the write beyond the last block of
>> >> i_size. I think those could just be spurious writes which won't cause
>> >> any harm because truncate will eventually first remove this folio from
>> >> file mapping and then will release the corresponding disk blocks.
>> >> So writing those out should does no harm
>> >
>> > Correct. The block straddling i_size must be written out, the blocks fully
>> > beyond new i_size (but below old i_size) may or may not be written out. As
>> > you say these extra racing writes to blocks that will get truncated cause
>> > no harm.
>> >
>>
>> Thanks Jan for confirming. So, I think we should make below change.
>> (note the code which was doing "size - folio_pos(folio)" in
>> mpage_submit_folio() is dropped by Ted in the latest tree).
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 43be684dabcb..006eba9be5e6 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -1859,9 +1859,9 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
>> */
>> size = i_size_read(mpd->inode);
>> len = folio_size(folio);
>> - if (folio_pos(folio) + len > size &&
>> + if ((folio_pos(folio) >= size || (folio_pos(folio) + len > size)) &&
>> !ext4_verity_in_progress(mpd->inode))
>> - len = size & ~PAGE_MASK;
>> + len = size & (len - 1);
>> err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
>> if (!err)
>> mpd->wbc->nr_to_write--;
>> @@ -2329,9 +2329,9 @@ static int mpage_journal_page_buffers(handle_t *handle,
>> folio_clear_checked(folio);
>> mpd->wbc->nr_to_write--;
>>
>> - if (folio_pos(folio) + len > size &&
>> + if ((folio_pos(folio) >= size || (folio_pos(folio) + len > size)) &&
>> !ext4_verity_in_progress(inode))
>> - len = size - folio_pos(folio);
>> + len = size & (len - 1);
>>
>> return ext4_journal_folio_buffers(handle, folio, len);
>> }
>>
>>
>> I will give it some more thoughts and testing.
>
> Why should ext4 be different from other filesystems which simply do:
>
> if (folio_pos(folio) >= size)
> return 0;
Yes, this case was bothering me and I was just thinking of this case. So,
since folio_pos(folio) starts at some pagesize boundary, then anyways
truncate will remove the entire page. So we need not bother about
writing this out.
Also should we just reduce nr_to_write because we could have written
that page-out but we know truncate is anyway going to remove it?
So the code should look like this then?
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 43be684dabcb..976e84507236 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1840,7 +1840,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
{
size_t len;
loff_t size;
- int err;
+ int err = 0;
BUG_ON(folio->index != mpd->first_page);
folio_clear_dirty_for_io(folio);
@@ -1859,10 +1859,19 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
*/
size = i_size_read(mpd->inode);
len = folio_size(folio);
+
+ /*
+ * Truncate should take care of truncating the entire folio anyways.
+ * So don't bother writing it out.
+ */
+ if (folio_pos(folio) >= size)
+ goto out;
+
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(mpd->inode))
- len = size & ~PAGE_MASK;
+ len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
+out:
if (!err)
mpd->wbc->nr_to_write--;
@@ -2329,9 +2338,16 @@ static int mpage_journal_page_buffers(handle_t *handle,
folio_clear_checked(folio);
mpd->wbc->nr_to_write--;
+ /*
+ * Truncate should take care of truncating the entire folio anyways.
+ * So don't bother writing it out.
+ */
+ if (folio_pos(folio) >= size)
+ return 0;
+
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
- len = size - folio_pos(folio);
+ len = size & (len - 1);
return ext4_journal_folio_buffers(handle, folio, len);
}
I will have to read more on returning 0 from
mpage_journal_page_buffers() function to make sure we don't need any
special handling for folio in data=journal mode.
-ritesh