From: "Matthew Wilcox (Oracle)" <[email protected]>
This series adds a readahead address_space operation to eventually
replace the readpages operation. The key difference is that
pages are added to the page cache as they are allocated (and
then looked up by the filesystem) instead of passing them on a
list to the readpages operation and having the filesystem add
them to the page cache. It's a net reduction in code for each
implementation, more efficient than walking a list, and solves
the direct-write vs buffered-read problem reported by Yu Kuai at
https://lore.kernel.org/linux-fsdevel/[email protected]/
The only unconverted filesystems are those which use fscache.
Their conversion is pending Dave Howells' rewrite which will make the
conversion substantially easier.
v6:
- Name the private members of readahead_control with a leading underscore
(suggested by Christoph Hellwig)
- Fix whitespace in rst file
- Remove misleading comment in btrfs patch
- Add readahead_next() API and use it in iomap
- Add iomap_readahead kerneldoc.
- Fix the mpage_readahead kerneldoc
- Make various readahead functions return void
- Keep readahead_index() and readahead_offset() pointing to the start of
this batch through the body. No current user requires this, but it's
less surprising.
- Add kerneldoc for page_cache_readahead_limit
- Make page_idx an unsigned long, and rename it to just 'i'
- Get rid of page_offset local variable
- Add patch to call memalloc_nofs_save() before allocating pages (suggested
by Michal Hocko)
- Resplit a lot of patches for more logical progression and easier review
(suggested by John Hubbard)
- Added sign-offs where received, and I deemed still relevant
v5 switched to passing a readahead_control struct (mirroring the
writeback_control struct passed to writepages). This has a number of
advantages:
- It fixes a number of bugs in various implementations, eg forgetting to
increment 'start', an off-by-one error in 'nr_pages' or treating 'start'
as a byte offset instead of a page offset.
- It allows us to change the arguments without changing all the
implementations of ->readahead which just call mpage_readahead() or
iomap_readahead()
- Figuring out which pages haven't been attempted by the implementation
is more natural this way.
- There's less code in each implementation.
Matthew Wilcox (Oracle) (19):
mm: Return void from various readahead functions
mm: Ignore return value of ->readpages
mm: Use readahead_control to pass arguments
mm: Rearrange readahead loop
mm: Remove 'page_offset' from readahead loop
mm: rename readahead loop variable to 'i'
mm: Put readahead pages in cache earlier
mm: Add readahead address space operation
mm: Add page_cache_readahead_limit
fs: Convert mpage_readpages to mpage_readahead
btrfs: Convert from readpages to readahead
erofs: Convert uncompressed files from readpages to readahead
erofs: Convert compressed files from readpages to readahead
ext4: Convert from readpages to readahead
f2fs: Convert from readpages to readahead
fuse: Convert from readpages to readahead
iomap: Restructure iomap_readpages_actor
iomap: Convert from readpages to readahead
mm: Use memalloc_nofs_save in readahead path
Documentation/filesystems/locking.rst | 6 +-
Documentation/filesystems/vfs.rst | 13 ++
drivers/staging/exfat/exfat_super.c | 7 +-
fs/block_dev.c | 7 +-
fs/btrfs/extent_io.c | 46 ++-----
fs/btrfs/extent_io.h | 3 +-
fs/btrfs/inode.c | 16 +--
fs/erofs/data.c | 39 ++----
fs/erofs/zdata.c | 29 ++--
fs/ext2/inode.c | 10 +-
fs/ext4/ext4.h | 3 +-
fs/ext4/inode.c | 23 ++--
fs/ext4/readpage.c | 22 ++-
fs/ext4/verity.c | 35 +----
fs/f2fs/data.c | 50 +++----
fs/f2fs/f2fs.h | 5 +-
fs/f2fs/verity.c | 35 +----
fs/fat/inode.c | 7 +-
fs/fuse/file.c | 46 +++----
fs/gfs2/aops.c | 23 ++--
fs/hpfs/file.c | 7 +-
fs/iomap/buffered-io.c | 118 +++++++----------
fs/iomap/trace.h | 2 +-
fs/isofs/inode.c | 7 +-
fs/jfs/inode.c | 7 +-
fs/mpage.c | 38 ++----
fs/nilfs2/inode.c | 15 +--
fs/ocfs2/aops.c | 34 ++---
fs/omfs/file.c | 7 +-
fs/qnx6/inode.c | 7 +-
fs/reiserfs/inode.c | 8 +-
fs/udf/inode.c | 7 +-
fs/xfs/xfs_aops.c | 13 +-
fs/zonefs/super.c | 7 +-
include/linux/fs.h | 2 +
include/linux/iomap.h | 3 +-
include/linux/mpage.h | 4 +-
include/linux/pagemap.h | 90 +++++++++++++
include/trace/events/erofs.h | 6 +-
include/trace/events/f2fs.h | 6 +-
mm/internal.h | 8 +-
mm/migrate.c | 2 +-
mm/readahead.c | 184 +++++++++++++++++---------
43 files changed, 474 insertions(+), 533 deletions(-)
base-commit: 11a48a5a18c63fd7621bb050228cebf13566e4d8
--
2.25.0
From: "Matthew Wilcox (Oracle)" <[email protected]>
Use the new readahead operation in fuse. Switching away from the
read_cache_pages() helper gets rid of an implicit call to put_page(),
so we can get rid of the get_page() call in fuse_readpages_fill().
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/fuse/file.c | 46 +++++++++++++++++++---------------------------
1 file changed, 19 insertions(+), 27 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9d67b830fb7a..f64f98708b5e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -923,9 +923,8 @@ struct fuse_fill_data {
unsigned int max_pages;
};
-static int fuse_readpages_fill(void *_data, struct page *page)
+static int fuse_readpages_fill(struct fuse_fill_data *data, struct page *page)
{
- struct fuse_fill_data *data = _data;
struct fuse_io_args *ia = data->ia;
struct fuse_args_pages *ap = &ia->ap;
struct inode *inode = data->inode;
@@ -941,10 +940,8 @@ static int fuse_readpages_fill(void *_data, struct page *page)
fc->max_pages);
fuse_send_readpages(ia, data->file);
data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
- if (!ia) {
- unlock_page(page);
+ if (!ia)
return -ENOMEM;
- }
ap = &ia->ap;
}
@@ -954,7 +951,6 @@ static int fuse_readpages_fill(void *_data, struct page *page)
return -EIO;
}
- get_page(page);
ap->pages[ap->num_pages] = page;
ap->descs[ap->num_pages].length = PAGE_SIZE;
ap->num_pages++;
@@ -962,37 +958,33 @@ static int fuse_readpages_fill(void *_data, struct page *page)
return 0;
}
-static int fuse_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void fuse_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_data data;
- int err;
+ struct page *page;
- err = -EIO;
if (is_bad_inode(inode))
- goto out;
+ return;
- data.file = file;
+ data.file = rac->file;
data.inode = inode;
- data.nr_pages = nr_pages;
- data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
-;
+ data.nr_pages = readahead_count(rac);
+ data.max_pages = min_t(unsigned int, data.nr_pages, fc->max_pages);
data.ia = fuse_io_alloc(NULL, data.max_pages);
- err = -ENOMEM;
if (!data.ia)
- goto out;
+ return;
- err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
- if (!err) {
- if (data.ia->ap.num_pages)
- fuse_send_readpages(data.ia, file);
- else
- fuse_io_free(data.ia);
+ readahead_for_each(rac, page) {
+ if (fuse_readpages_fill(&data, page) != 0)
+ return;
}
-out:
- return err;
+
+ if (data.ia->ap.num_pages)
+ fuse_send_readpages(data.ia, rac->file);
+ else
+ fuse_io_free(data.ia);
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -3373,10 +3365,10 @@ static const struct file_operations fuse_file_operations = {
static const struct address_space_operations fuse_file_aops = {
.readpage = fuse_readpage,
+ .readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
.launder_page = fuse_launder_page,
- .readpages = fuse_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
--
2.25.0
From: "Matthew Wilcox (Oracle)" <[email protected]>
Use the new readahead operation in fuse. Switching away from the
read_cache_pages() helper gets rid of an implicit call to put_page(),
so we can get rid of the get_page() call in fuse_readpages_fill().
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/fuse/file.c | 46 +++++++++++++++++++---------------------------
1 file changed, 19 insertions(+), 27 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9d67b830fb7a..f64f98708b5e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -923,9 +923,8 @@ struct fuse_fill_data {
unsigned int max_pages;
};
-static int fuse_readpages_fill(void *_data, struct page *page)
+static int fuse_readpages_fill(struct fuse_fill_data *data, struct page *page)
{
- struct fuse_fill_data *data = _data;
struct fuse_io_args *ia = data->ia;
struct fuse_args_pages *ap = &ia->ap;
struct inode *inode = data->inode;
@@ -941,10 +940,8 @@ static int fuse_readpages_fill(void *_data, struct page *page)
fc->max_pages);
fuse_send_readpages(ia, data->file);
data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
- if (!ia) {
- unlock_page(page);
+ if (!ia)
return -ENOMEM;
- }
ap = &ia->ap;
}
@@ -954,7 +951,6 @@ static int fuse_readpages_fill(void *_data, struct page *page)
return -EIO;
}
- get_page(page);
ap->pages[ap->num_pages] = page;
ap->descs[ap->num_pages].length = PAGE_SIZE;
ap->num_pages++;
@@ -962,37 +958,33 @@ static int fuse_readpages_fill(void *_data, struct page *page)
return 0;
}
-static int fuse_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void fuse_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_data data;
- int err;
+ struct page *page;
- err = -EIO;
if (is_bad_inode(inode))
- goto out;
+ return;
- data.file = file;
+ data.file = rac->file;
data.inode = inode;
- data.nr_pages = nr_pages;
- data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
-;
+ data.nr_pages = readahead_count(rac);
+ data.max_pages = min_t(unsigned int, data.nr_pages, fc->max_pages);
data.ia = fuse_io_alloc(NULL, data.max_pages);
- err = -ENOMEM;
if (!data.ia)
- goto out;
+ return;
- err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
- if (!err) {
- if (data.ia->ap.num_pages)
- fuse_send_readpages(data.ia, file);
- else
- fuse_io_free(data.ia);
+ readahead_for_each(rac, page) {
+ if (fuse_readpages_fill(&data, page) != 0)
+ return;
}
-out:
- return err;
+
+ if (data.ia->ap.num_pages)
+ fuse_send_readpages(data.ia, rac->file);
+ else
+ fuse_io_free(data.ia);
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -3373,10 +3365,10 @@ static const struct file_operations fuse_file_operations = {
static const struct address_space_operations fuse_file_aops = {
.readpage = fuse_readpage,
+ .readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
.launder_page = fuse_launder_page,
- .readpages = fuse_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
--
2.25.0
From: "Matthew Wilcox (Oracle)" <[email protected]>
By putting the 'have we reached the end of the page' condition at the end
of the loop instead of the beginning, we can remove the 'submit the last
page' code from iomap_readpages(). Also check that iomap_readpage_actor()
didn't return 0, which would lead to an endless loop.
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/iomap/buffered-io.c | 25 ++++++++++++-------------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index cb3511eb152a..44303f370b2d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -400,15 +400,9 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
- loff_t done, ret;
+ loff_t ret, done = 0;
- for (done = 0; done < length; done += ret) {
- if (ctx->cur_page && offset_in_page(pos + done) == 0) {
- if (!ctx->cur_page_in_bio)
- unlock_page(ctx->cur_page);
- put_page(ctx->cur_page);
- ctx->cur_page = NULL;
- }
+ while (done < length) {
if (!ctx->cur_page) {
ctx->cur_page = iomap_next_page(inode, ctx->pages,
pos, length, &done);
@@ -418,6 +412,15 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
}
ret = iomap_readpage_actor(inode, pos + done, length - done,
ctx, iomap, srcmap);
+ if (WARN_ON(ret == 0))
+ break;
+ done += ret;
+ if (offset_in_page(pos + done) == 0) {
+ if (!ctx->cur_page_in_bio)
+ unlock_page(ctx->cur_page);
+ put_page(ctx->cur_page);
+ ctx->cur_page = NULL;
+ }
}
return done;
@@ -451,11 +454,7 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
done:
if (ctx.bio)
submit_bio(ctx.bio);
- if (ctx.cur_page) {
- if (!ctx.cur_page_in_bio)
- unlock_page(ctx.cur_page);
- put_page(ctx.cur_page);
- }
+ BUG_ON(ctx.cur_page);
/*
 * Check that we didn't lose a page due to the arcane calling
--
2.25.0
From: "Matthew Wilcox (Oracle)" <[email protected]>
Use the new readahead operation in f2fs
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/f2fs/data.c | 50 +++++++++++++++----------------------
fs/f2fs/f2fs.h | 5 ++--
include/trace/events/f2fs.h | 6 ++---
3 files changed, 25 insertions(+), 36 deletions(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b27b72107911..87964e4cb6b8 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2159,13 +2159,11 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
* use ->readpage() or do the necessary surgery to decouple ->readpages()
* from read-ahead.
*/
-int f2fs_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead)
+int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac,
+ struct page *page)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
- struct inode *inode = mapping->host;
struct f2fs_map_blocks map;
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct compress_ctx cc = {
@@ -2179,6 +2177,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
.nr_cpages = 0,
};
#endif
+ unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
int ret = 0;
@@ -2192,15 +2191,9 @@ int f2fs_mpage_readpages(struct address_space *mapping,
map.m_may_create = false;
for (; nr_pages; nr_pages--) {
- if (pages) {
- page = list_last_entry(pages, struct page, lru);
-
+ if (rac) {
+ page = readahead_page(rac);
prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping,
- page_index(page),
- readahead_gfp_mask(mapping)))
- goto next_page;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -2210,7 +2203,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead);
+ rac);
f2fs_destroy_compress_ctx(&cc);
if (ret)
goto set_error_page;
@@ -2233,7 +2226,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
#endif
ret = f2fs_read_single_page(inode, page, max_nr_pages, &map,
- &bio, &last_block_in_bio, is_readahead);
+ &bio, &last_block_in_bio, rac);
if (ret) {
#ifdef CONFIG_F2FS_FS_COMPRESSION
set_error_page:
@@ -2242,8 +2235,10 @@ int f2fs_mpage_readpages(struct address_space *mapping,
zero_user_segment(page, 0, PAGE_SIZE);
unlock_page(page);
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
next_page:
- if (pages)
+#endif
+ if (rac)
put_page(page);
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -2253,16 +2248,15 @@ int f2fs_mpage_readpages(struct address_space *mapping,
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead);
+ rac);
f2fs_destroy_compress_ctx(&cc);
}
}
#endif
}
- BUG_ON(pages && !list_empty(pages));
if (bio)
__submit_bio(F2FS_I_SB(inode), bio, DATA);
- return pages ? 0 : ret;
+ return ret;
}
static int f2fs_read_data_page(struct file *file, struct page *page)
@@ -2281,28 +2275,24 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
if (ret == -EAGAIN)
- ret = f2fs_mpage_readpages(page_file_mapping(page),
- NULL, page, 1, false);
+ ret = f2fs_mpage_readpages(inode, NULL, page);
return ret;
}
-static int f2fs_read_data_pages(struct file *file,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void f2fs_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
- struct page *page = list_last_entry(pages, struct page, lru);
+ struct inode *inode = rac->mapping->host;
- trace_f2fs_readpages(inode, page, nr_pages);
+ trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac));
if (!f2fs_is_compress_backend_ready(inode))
- return 0;
+ return;
/* If the file has inline data, skip readpages */
if (f2fs_has_inline_data(inode))
- return 0;
+ return;
- return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+ f2fs_mpage_readpages(inode, rac, NULL);
}
int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
@@ -3784,7 +3774,7 @@ static void f2fs_swap_deactivate(struct file *file)
const struct address_space_operations f2fs_dblock_aops = {
.readpage = f2fs_read_data_page,
- .readpages = f2fs_read_data_pages,
+ .readahead = f2fs_readahead,
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5355be6b6755..b5e72dee8826 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3344,9 +3344,8 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
-int f2fs_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead);
+int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac,
+ struct page *page);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
int op_flags, bool for_write);
struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 67a97838c2a0..d72da4a33883 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -1375,9 +1375,9 @@ TRACE_EVENT(f2fs_writepages,
TRACE_EVENT(f2fs_readpages,
- TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage),
+ TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage),
- TP_ARGS(inode, page, nrpage),
+ TP_ARGS(inode, start, nrpage),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1389,7 +1389,7 @@ TRACE_EVENT(f2fs_readpages,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->start = page->index;
+ __entry->start = start;
__entry->nrpage = nrpage;
),
--
2.25.0
From: "Matthew Wilcox (Oracle)" <[email protected]>
Use the new readahead operation in ext4
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/ext4/ext4.h | 3 +--
fs/ext4/inode.c | 23 ++++++++++-------------
fs/ext4/readpage.c | 22 ++++++++--------------
3 files changed, 19 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9a2ee2428ecc..3af755da101d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3276,8 +3276,7 @@ static inline void ext4_set_de_type(struct super_block *sb,
/* readpages.c */
extern int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead);
+ struct readahead_control *rac, struct page *page);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1305b810c44a..7770e38e17e7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3218,7 +3218,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
static int ext4_readpage(struct file *file, struct page *page)
{
int ret = -EAGAIN;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = file_inode(file);
trace_ext4_readpage(page);
@@ -3226,23 +3226,20 @@ static int ext4_readpage(struct file *file, struct page *page)
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return ext4_mpage_readpages(page->mapping, NULL, page, 1,
- false);
+ return ext4_mpage_readpages(page->mapping, NULL, page);
return ret;
}
-static int
-ext4_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void ext4_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = rac->mapping->host;
- /* If the file has inline data, no need to do readpages. */
+ /* If the file has inline data, no need to do readahead. */
if (ext4_has_inline_data(inode))
- return 0;
+ return;
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+ ext4_mpage_readpages(rac->mapping, rac, NULL);
}
static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -3587,7 +3584,7 @@ static int ext4_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
@@ -3604,7 +3601,7 @@ static const struct address_space_operations ext4_aops = {
static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
@@ -3620,7 +3617,7 @@ static const struct address_space_operations ext4_journalled_aops = {
static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index c1769afbf799..e14841ade612 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -7,8 +7,8 @@
*
* This was originally taken from fs/mpage.c
*
- * The intent is the ext4_mpage_readpages() function here is intended
- * to replace mpage_readpages() in the general case, not just for
+ * The ext4_mpage_readahead() function here is intended to
+ * replace mpage_readahead() in the general case, not just for
* encrypted files. It has some limitations (see below), where it
* will fall back to read_block_full_page(), but these limitations
* should only be hit when page_size != block_size.
@@ -222,8 +222,7 @@ static inline loff_t ext4_readpage_limit(struct inode *inode)
}
int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead)
+ struct readahead_control *rac, struct page *page)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
@@ -241,6 +240,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
+ unsigned int nr_pages = rac ? readahead_count(rac) : 1;
map.m_pblk = 0;
map.m_lblk = 0;
@@ -251,14 +251,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
- if (pages) {
- page = lru_to_page(pages);
-
+ if (rac) {
+ page = readahead_page(rac);
prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping)))
- goto next_page;
}
if (page_has_buffers(page))
@@ -381,7 +376,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio_set_op_attrs(bio, REQ_OP_READ,
- is_readahead ? REQ_RAHEAD : 0);
+ rac ? REQ_RAHEAD : 0);
}
length = first_hole << blkbits;
@@ -406,10 +401,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
else
unlock_page(page);
next_page:
- if (pages)
+ if (rac)
put_page(page);
}
- BUG_ON(pages && !list_empty(pages));
if (bio)
submit_bio(bio);
return 0;
--
2.25.0
On Mon, Feb 17, 2020 at 10:45:41AM -0800, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <[email protected]>
>
> This series adds a readahead address_space operation to eventually
> replace the readpages operation. The key difference is that
> pages are added to the page cache as they are allocated (and
> then looked up by the filesystem) instead of passing them on a
> list to the readpages operation and having the filesystem add
> them to the page cache. It's a net reduction in code for each
> implementation, more efficient than walking a list, and solves
> the direct-write vs buffered-read problem reported by yu kuai at
> https://lore.kernel.org/linux-fsdevel/[email protected]/
>
> The only unconverted filesystems are those which use fscache.
> Their conversion is pending Dave Howells' rewrite which will make the
> conversion substantially easier.
Latest version in your git tree:
$ ▶ glo -n 5 willy/readahead
4be497096c04 mm: Use memalloc_nofs_save in readahead path
ff63497fcb98 iomap: Convert from readpages to readahead
26aee60e89b5 iomap: Restructure iomap_readpages_actor
8115bcca7312 fuse: Convert from readpages to readahead
3db3d10d9ea1 f2fs: Convert from readpages to readahead
$
merged into a 5.6-rc2 tree fails at boot on my test vm:
[ 2.423116] ------------[ cut here ]------------
[ 2.424957] list_add double add: new=ffffea000efff4c8, prev=ffff8883bfffee60, next=ffffea000efff4c8.
[ 2.428259] WARNING: CPU: 4 PID: 1 at lib/list_debug.c:29 __list_add_valid+0x67/0x70
[ 2.430617] CPU: 4 PID: 1 Comm: sh Not tainted 5.6.0-rc2-dgc+ #1800
[ 2.432405] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[ 2.434744] RIP: 0010:__list_add_valid+0x67/0x70
[ 2.436107] Code: c6 4c 89 ca 48 c7 c7 10 41 58 82 e8 55 29 89 ff 0f 0b 31 c0 c3 48 89 f2 4c 89 c1 48 89 fe 48 c7 c7 60 41 58 82 e8 3b 29 89 ff <0f> 0b 31 c7
[ 2.441161] RSP: 0000:ffffc900018a3bb0 EFLAGS: 00010082
[ 2.442548] RAX: 0000000000000000 RBX: ffffea000efff4c0 RCX: 0000000000000256
[ 2.444432] RDX: 0000000000000001 RSI: 0000000000000086 RDI: ffffffff8288a8b0
[ 2.446315] RBP: ffffea000efff4c8 R08: ffffc900018a3a65 R09: 0000000000000256
[ 2.448199] R10: 0000000000000008 R11: ffffc900018a3a65 R12: ffffea000efff4c8
[ 2.450072] R13: ffff8883bfffee60 R14: 0000000000000010 R15: 0000000000000001
[ 2.451959] FS: 0000000000000000(0000) GS:ffff8883b9c00000(0000) knlGS:0000000000000000
[ 2.454083] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2.455604] CR2: 00000000ffffffff CR3: 00000003b9a37002 CR4: 0000000000060ee0
[ 2.457484] Call Trace:
[ 2.458171] __pagevec_lru_add_fn+0x15f/0x2c0
[ 2.459376] pagevec_lru_move_fn+0x87/0xd0
[ 2.460500] ? pagevec_move_tail_fn+0x2d0/0x2d0
[ 2.461712] lru_add_drain_cpu+0x8d/0x160
[ 2.462787] lru_add_drain+0x18/0x20
[ 2.463757] shift_arg_pages+0xb8/0x180
[ 2.464789] ? vprintk_emit+0x101/0x1c0
[ 2.465813] ? printk+0x58/0x6f
[ 2.466659] setup_arg_pages+0x205/0x240
[ 2.467716] load_elf_binary+0x34a/0x1560
[ 2.468789] ? get_user_pages_remote+0x159/0x280
[ 2.470024] ? selinux_inode_permission+0x10d/0x1e0
[ 2.471323] ? _raw_read_unlock+0xa/0x20
[ 2.472375] ? load_misc_binary+0x2b2/0x410
[ 2.473492] search_binary_handler+0x60/0xe0
[ 2.474634] __do_execve_file.isra.0+0x512/0x850
[ 2.475888] ? rest_init+0xc6/0xc6
[ 2.476801] do_execve+0x21/0x30
[ 2.477671] try_to_run_init_process+0x10/0x34
[ 2.478855] kernel_init+0xe2/0xfa
[ 2.479776] ret_from_fork+0x1f/0x30
[ 2.480737] ---[ end trace e77079de9b22dc6a ]---
I just dropped the ext4 conversion from my local tree so I can boot
the machine and test XFS. Might have some more info when that
crashes and burns...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Feb 18, 2020 at 03:56:33PM +1100, Dave Chinner wrote:
> Latest version in your git tree:
>
> $ ▶ glo -n 5 willy/readahead
> 4be497096c04 mm: Use memalloc_nofs_save in readahead path
> ff63497fcb98 iomap: Convert from readpages to readahead
> 26aee60e89b5 iomap: Restructure iomap_readpages_actor
> 8115bcca7312 fuse: Convert from readpages to readahead
> 3db3d10d9ea1 f2fs: Convert from readpages to readahead
> $
>
> merged into a 5.6-rc2 tree fails at boot on my test vm:
>
> [ 2.423116] ------------[ cut here ]------------
> [ 2.424957] list_add double add: new=ffffea000efff4c8, prev=ffff8883bfffee60, next=ffffea000efff4c8.
> [ 2.428259] WARNING: CPU: 4 PID: 1 at lib/list_debug.c:29 __list_add_valid+0x67/0x70
> [ 2.457484] Call Trace:
> [ 2.458171] __pagevec_lru_add_fn+0x15f/0x2c0
> [ 2.459376] pagevec_lru_move_fn+0x87/0xd0
> [ 2.460500] ? pagevec_move_tail_fn+0x2d0/0x2d0
> [ 2.461712] lru_add_drain_cpu+0x8d/0x160
> [ 2.462787] lru_add_drain+0x18/0x20
Are you sure that was 4be497096c04 ? I ask because there was a
version pushed to that git tree that did contain a list double-add
(due to a mismerge when shuffling patches). I noticed it and fixed
it, and 4be497096c04 doesn't have that problem. I also test with
CONFIG_DEBUG_LIST turned on, but this problem you hit is going to be
probabilistic because it'll depend on the timing between whatever other
list is being used and the page actually being added to the LRU.
On 2/17/20 10:45 AM, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <[email protected]>
>
> This series adds a readahead address_space operation to eventually
> replace the readpages operation. The key difference is that
> pages are added to the page cache as they are allocated (and
> then looked up by the filesystem) instead of passing them on a
> list to the readpages operation and having the filesystem add
> them to the page cache. It's a net reduction in code for each
> implementation, more efficient than walking a list, and solves
> the direct-write vs buffered-read problem reported by yu kuai at
> https://lore.kernel.org/linux-fsdevel/[email protected]/
>
> The only unconverted filesystems are those which use fscache.
> Their conversion is pending Dave Howells' rewrite which will make the
> conversion substantially easier.
Hi Matthew,
I see that Dave Chinner is reviewing this series, but I'm trying out his recent
advice about code reviews [1], and so I'm not going to read his comments first.
So you may see some duplication or contradictions this time around.
[1] Somewhere in this thread, "[LSF/MM/BPF TOPIC] FS Maintainers Don't Scale":
https://lore.kernel.org/r/20200131052520.GC6869@magnolia
thanks,
--
John Hubbard
NVIDIA
>
> v6:
> - Name the private members of readahead_control with a leading underscore
> (suggested by Christoph Hellwig)
> - Fix whitespace in rst file
> - Remove misleading comment in btrfs patch
> - Add readahead_next() API and use it in iomap
> - Add iomap_readahead kerneldoc.
> - Fix the mpage_readahead kerneldoc
> - Make various readahead functions return void
> - Keep readahead_index() and readahead_offset() pointing to the start of
> this batch through the body. No current user requires this, but it's
> less surprising.
> - Add kerneldoc for page_cache_readahead_limit
> - Make page_idx an unsigned long, and rename it to just 'i'
> - Get rid of page_offset local variable
> - Add patch to call memalloc_nofs_save() before allocating pages (suggested
> by Michal Hocko)
> - Resplit a lot of patches for more logical progression and easier review
> (suggested by John Hubbard)
> - Added sign-offs where received, and I deemed still relevant
>
> v5 switched to passing a readahead_control struct (mirroring the
> writeback_control struct passed to writepages). This has a number of
> advantages:
> - It fixes a number of bugs in various implementations, eg forgetting to
> increment 'start', an off-by-one error in 'nr_pages' or treating 'start'
> as a byte offset instead of a page offset.
> - It allows us to change the arguments without changing all the
> implementations of ->readahead which just call mpage_readahead() or
> iomap_readahead()
> - Figuring out which pages haven't been attempted by the implementation
> is more natural this way.
> - There's less code in each implementation.
>
> Matthew Wilcox (Oracle) (19):
> mm: Return void from various readahead functions
> mm: Ignore return value of ->readpages
> mm: Use readahead_control to pass arguments
> mm: Rearrange readahead loop
> mm: Remove 'page_offset' from readahead loop
> mm: rename readahead loop variable to 'i'
> mm: Put readahead pages in cache earlier
> mm: Add readahead address space operation
> mm: Add page_cache_readahead_limit
> fs: Convert mpage_readpages to mpage_readahead
> btrfs: Convert from readpages to readahead
> erofs: Convert uncompressed files from readpages to readahead
> erofs: Convert compressed files from readpages to readahead
> ext4: Convert from readpages to readahead
> f2fs: Convert from readpages to readahead
> fuse: Convert from readpages to readahead
> iomap: Restructure iomap_readpages_actor
> iomap: Convert from readpages to readahead
> mm: Use memalloc_nofs_save in readahead path
>
> Documentation/filesystems/locking.rst | 6 +-
> Documentation/filesystems/vfs.rst | 13 ++
> drivers/staging/exfat/exfat_super.c | 7 +-
> fs/block_dev.c | 7 +-
> fs/btrfs/extent_io.c | 46 ++-----
> fs/btrfs/extent_io.h | 3 +-
> fs/btrfs/inode.c | 16 +--
> fs/erofs/data.c | 39 ++----
> fs/erofs/zdata.c | 29 ++--
> fs/ext2/inode.c | 10 +-
> fs/ext4/ext4.h | 3 +-
> fs/ext4/inode.c | 23 ++--
> fs/ext4/readpage.c | 22 ++-
> fs/ext4/verity.c | 35 +----
> fs/f2fs/data.c | 50 +++----
> fs/f2fs/f2fs.h | 5 +-
> fs/f2fs/verity.c | 35 +----
> fs/fat/inode.c | 7 +-
> fs/fuse/file.c | 46 +++----
> fs/gfs2/aops.c | 23 ++--
> fs/hpfs/file.c | 7 +-
> fs/iomap/buffered-io.c | 118 +++++++----------
> fs/iomap/trace.h | 2 +-
> fs/isofs/inode.c | 7 +-
> fs/jfs/inode.c | 7 +-
> fs/mpage.c | 38 ++----
> fs/nilfs2/inode.c | 15 +--
> fs/ocfs2/aops.c | 34 ++---
> fs/omfs/file.c | 7 +-
> fs/qnx6/inode.c | 7 +-
> fs/reiserfs/inode.c | 8 +-
> fs/udf/inode.c | 7 +-
> fs/xfs/xfs_aops.c | 13 +-
> fs/zonefs/super.c | 7 +-
> include/linux/fs.h | 2 +
> include/linux/iomap.h | 3 +-
> include/linux/mpage.h | 4 +-
> include/linux/pagemap.h | 90 +++++++++++++
> include/trace/events/erofs.h | 6 +-
> include/trace/events/f2fs.h | 6 +-
> mm/internal.h | 8 +-
> mm/migrate.c | 2 +-
> mm/readahead.c | 184 +++++++++++++++++---------
> 43 files changed, 474 insertions(+), 533 deletions(-)
>
>
> base-commit: 11a48a5a18c63fd7621bb050228cebf13566e4d8
>
On Tue, Feb 18, 2020 at 05:42:30AM -0800, Matthew Wilcox wrote:
> On Tue, Feb 18, 2020 at 03:56:33PM +1100, Dave Chinner wrote:
> > Latest version in your git tree:
> >
> > $ ▶ glo -n 5 willy/readahead
> > 4be497096c04 mm: Use memalloc_nofs_save in readahead path
> > ff63497fcb98 iomap: Convert from readpages to readahead
> > 26aee60e89b5 iomap: Restructure iomap_readpages_actor
> > 8115bcca7312 fuse: Convert from readpages to readahead
> > 3db3d10d9ea1 f2fs: Convert from readpages to readahead
> > $
> >
> > merged into a 5.6-rc2 tree fails at boot on my test vm:
> >
> > [ 2.423116] ------------[ cut here ]------------
> > [ 2.424957] list_add double add: new=ffffea000efff4c8, prev=ffff8883bfffee60, next=ffffea000efff4c8.
> > [ 2.428259] WARNING: CPU: 4 PID: 1 at lib/list_debug.c:29 __list_add_valid+0x67/0x70
> > [ 2.457484] Call Trace:
> > [ 2.458171] __pagevec_lru_add_fn+0x15f/0x2c0
> > [ 2.459376] pagevec_lru_move_fn+0x87/0xd0
> > [ 2.460500] ? pagevec_move_tail_fn+0x2d0/0x2d0
> > [ 2.461712] lru_add_drain_cpu+0x8d/0x160
> > [ 2.462787] lru_add_drain+0x18/0x20
>
> Are you sure that was 4be497096c04 ? I ask because there was a
Yes, because it's the only version I've actually merged into my
working tree, compiled and tried to run. :P
> version pushed to that git tree that did contain a list double-add
> (due to a mismerge when shuffling patches). I noticed it and fixed
> it, and 4be497096c04 doesn't have that problem. I also test with
> CONFIG_DEBUG_LIST turned on, but this problem you hit is going to be
> probabilistic because it'll depend on the timing between whatever other
> list is being used and the page actually being added to the LRU.
I'll see if I can reproduce it.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On 2/17/20 10:46 AM, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <[email protected]>
>
> By putting the 'have we reached the end of the page' condition at the end
> of the loop instead of the beginning, we can remove the 'submit the last
> page' code from iomap_readpages(). Also check that iomap_readpage_actor()
> didn't return 0, which would lead to an endless loop.
Also added a new WARN_ON() and BUG(), although I'm wondering about the BUG
below...
>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> ---
> fs/iomap/buffered-io.c | 25 ++++++++++++-------------
> 1 file changed, 12 insertions(+), 13 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index cb3511eb152a..44303f370b2d 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -400,15 +400,9 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
> void *data, struct iomap *iomap, struct iomap *srcmap)
> {
> struct iomap_readpage_ctx *ctx = data;
> - loff_t done, ret;
> + loff_t ret, done = 0;
>
> - for (done = 0; done < length; done += ret) {
nit: this "for" loop was perfect just the way it was. :) I'd vote here for reverting
the change to a "while" loop. Because with this change, now the code has to
separately initialize "done", separately increment "done", and the beauty of a
for loop is that the loop init and control is all clearly in one place. For things
that follow that model (as in this case!), that's a Good Thing.
And I don't see any technical reason (even in the following patch) that requires
this change.
> - if (ctx->cur_page && offset_in_page(pos + done) == 0) {
> - if (!ctx->cur_page_in_bio)
> - unlock_page(ctx->cur_page);
> - put_page(ctx->cur_page);
> - ctx->cur_page = NULL;
> - }
> + while (done < length) {
> if (!ctx->cur_page) {
> ctx->cur_page = iomap_next_page(inode, ctx->pages,
> pos, length, &done);
> @@ -418,6 +412,15 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
> }
> ret = iomap_readpage_actor(inode, pos + done, length - done,
> ctx, iomap, srcmap);
> + if (WARN_ON(ret == 0))
> + break;
> + done += ret;
> + if (offset_in_page(pos + done) == 0) {
> + if (!ctx->cur_page_in_bio)
> + unlock_page(ctx->cur_page);
> + put_page(ctx->cur_page);
> + ctx->cur_page = NULL;
> + }
> }
>
> return done;
> @@ -451,11 +454,7 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
> done:
> if (ctx.bio)
> submit_bio(ctx.bio);
> - if (ctx.cur_page) {
> - if (!ctx.cur_page_in_bio)
> - unlock_page(ctx.cur_page);
> - put_page(ctx.cur_page);
> - }
> + BUG_ON(ctx.cur_page);
Is a full BUG_ON() definitely called for here? Seems like a WARN might suffice...
>
> /*
>  	 * Check that we didn't lose a page due to the arcane calling
>
thanks,
--
John Hubbard
NVIDIA
On Wed, Feb 19, 2020 at 08:26:52AM +1100, Dave Chinner wrote:
> On Tue, Feb 18, 2020 at 05:42:30AM -0800, Matthew Wilcox wrote:
> > On Tue, Feb 18, 2020 at 03:56:33PM +1100, Dave Chinner wrote:
> > > Latest version in your git tree:
> > >
> > > $ ▶ glo -n 5 willy/readahead
> > > 4be497096c04 mm: Use memalloc_nofs_save in readahead path
> > > ff63497fcb98 iomap: Convert from readpages to readahead
> > > 26aee60e89b5 iomap: Restructure iomap_readpages_actor
> > > 8115bcca7312 fuse: Convert from readpages to readahead
> > > 3db3d10d9ea1 f2fs: Convert from readpages to readahead
> > > $
> > >
> > > merged into a 5.6-rc2 tree fails at boot on my test vm:
> > >
> > > [ 2.423116] ------------[ cut here ]------------
> > > [ 2.424957] list_add double add: new=ffffea000efff4c8, prev=ffff8883bfffee60, next=ffffea000efff4c8.
> > > [ 2.428259] WARNING: CPU: 4 PID: 1 at lib/list_debug.c:29 __list_add_valid+0x67/0x70
> > > [ 2.457484] Call Trace:
> > > [ 2.458171] __pagevec_lru_add_fn+0x15f/0x2c0
> > > [ 2.459376] pagevec_lru_move_fn+0x87/0xd0
> > > [ 2.460500] ? pagevec_move_tail_fn+0x2d0/0x2d0
> > > [ 2.461712] lru_add_drain_cpu+0x8d/0x160
> > > [ 2.462787] lru_add_drain+0x18/0x20
> >
> > Are you sure that was 4be497096c04 ? I ask because there was a
>
> Yes, because it's the only version I've actually merged into my
> working tree, compiled and tried to run. :P
>
> > version pushed to that git tree that did contain a list double-add
> > (due to a mismerge when shuffling patches). I noticed it and fixed
> > it, and 4be497096c04 doesn't have that problem. I also test with
> > CONFIG_DEBUG_LIST turned on, but this problem you hit is going to be
> > probabilistic because it'll depend on the timing between whatever other
> > list is being used and the page actually being added to the LRU.
>
> I'll see if I can reproduce it.
Just updated to a current TOT Linus kernel and your latest branch,
and so far this is 100% reproducable.
Not sure how I'm going to debug it yet, because it's init that is
triggering it....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Feb 18, 2020 at 07:48:32PM -0800, Matthew Wilcox wrote:
> On Wed, Feb 19, 2020 at 02:45:25PM +1100, Dave Chinner wrote:
> > On Wed, Feb 19, 2020 at 08:26:52AM +1100, Dave Chinner wrote:
> > > On Tue, Feb 18, 2020 at 05:42:30AM -0800, Matthew Wilcox wrote:
> > > > On Tue, Feb 18, 2020 at 03:56:33PM +1100, Dave Chinner wrote:
> > > > > Latest version in your git tree:
> > > > >
> > > > > $ ▶ glo -n 5 willy/readahead
> > > > > 4be497096c04 mm: Use memalloc_nofs_save in readahead path
> > > > > ff63497fcb98 iomap: Convert from readpages to readahead
> > > > > 26aee60e89b5 iomap: Restructure iomap_readpages_actor
> > > > > 8115bcca7312 fuse: Convert from readpages to readahead
> > > > > 3db3d10d9ea1 f2fs: Convert from readpages to readahead
> > > > > $
> > > > >
> > > > > merged into a 5.6-rc2 tree fails at boot on my test vm:
> > > > >
> > > > > [ 2.423116] ------------[ cut here ]------------
> > > > > [ 2.424957] list_add double add: new=ffffea000efff4c8, prev=ffff8883bfffee60, next=ffffea000efff4c8.
> > > > > [ 2.428259] WARNING: CPU: 4 PID: 1 at lib/list_debug.c:29 __list_add_valid+0x67/0x70
> > > > > [ 2.457484] Call Trace:
> > > > > [ 2.458171] __pagevec_lru_add_fn+0x15f/0x2c0
> > > > > [ 2.459376] pagevec_lru_move_fn+0x87/0xd0
> > > > > [ 2.460500] ? pagevec_move_tail_fn+0x2d0/0x2d0
> > > > > [ 2.461712] lru_add_drain_cpu+0x8d/0x160
> > > > > [ 2.462787] lru_add_drain+0x18/0x20
> > > >
> > > > Are you sure that was 4be497096c04 ? I ask because there was a
> > >
> > > Yes, because it's the only version I've actually merged into my
> > > working tree, compiled and tried to run. :P
> > >
> > > > version pushed to that git tree that did contain a list double-add
> > > > (due to a mismerge when shuffling patches). I noticed it and fixed
> > > > it, and 4be497096c04 doesn't have that problem. I also test with
> > > > CONFIG_DEBUG_LIST turned on, but this problem you hit is going to be
> > > > probabilistic because it'll depend on the timing between whatever other
> > > > list is being used and the page actually being added to the LRU.
> > >
> > > I'll see if I can reproduce it.
> >
> > Just updated to a current TOT Linus kernel and your latest branch,
> > and so far this is 100% reproducable.
> >
> > Not sure how I'm going to debug it yet, because it's init that is
> > triggering it....
>
> Eric found it ...
Yeah, just saw that and am applying his patch to test it...
> still not sure why I don't see it.
No readahead configured on your device?
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Feb 18, 2020 at 07:17:18PM -0800, John Hubbard wrote:
> > - for (done = 0; done < length; done += ret) {
>
> nit: this "for" loop was perfect just the way it was. :) I'd vote here for reverting
> the change to a "while" loop. Because with this change, now the code has to
> separately initialize "done", separately increment "done", and the beauty of a
> for loop is that the loop init and control is all clearly in one place. For things
> that follow that model (as in this case!), that's a Good Thing.
>
> And I don't see any technical reason (even in the following patch) that requires
> this change.
It's doing the increment in the wrong place. We want the increment done in
the middle of the loop, before we check whether we've got to the end of
the page. Not at the end of the loop.
> > + BUG_ON(ctx.cur_page);
>
> Is a full BUG_ON() definitely called for here? Seems like a WARN might suffice...
Dave made a similar comment; I'll pick this up there.
On Wed, Feb 19, 2020 at 02:29:00PM +1100, Dave Chinner wrote:
> On Mon, Feb 17, 2020 at 10:46:11AM -0800, Matthew Wilcox wrote:
> > @@ -418,6 +412,15 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
> > }
> > ret = iomap_readpage_actor(inode, pos + done, length - done,
> > ctx, iomap, srcmap);
> > + if (WARN_ON(ret == 0))
> > + break;
>
> This error case now leaks ctx->cur_page....
Yes ... and I see the consequence. I mean, this is a "shouldn't happen",
so do we want to put effort into cleanup here ...
> > @@ -451,11 +454,7 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
> > done:
> > if (ctx.bio)
> > submit_bio(ctx.bio);
> > - if (ctx.cur_page) {
> > - if (!ctx.cur_page_in_bio)
> > - unlock_page(ctx.cur_page);
> > - put_page(ctx.cur_page);
> > - }
> > + BUG_ON(ctx.cur_page);
>
> And so will now trigger both a warn and a bug....
... or do we just want to run slap bang into this bug?
Option 1: Remove the check for 'ret == 0' altogether, as we had it before.
That puts us into endless loop territory for a failure mode, and it's not
parallel with iomap_readpage().
Option 2: Remove the WARN_ON from the check. Then we just hit the BUG_ON,
but we don't know why we did it.
Option 3: Set cur_page to NULL. We'll hit the WARN_ON, avoid the BUG_ON,
might end up with a page in the page cache which is never unlocked.
Option 4: Do the unlock/put page dance before setting the cur_page to NULL.
We might double-unlock the page.
There are probably other options here too.
On Tue, Feb 18, 2020 at 10:04:15PM -0800, Matthew Wilcox wrote:
> On Wed, Feb 19, 2020 at 02:29:00PM +1100, Dave Chinner wrote:
> > On Mon, Feb 17, 2020 at 10:46:11AM -0800, Matthew Wilcox wrote:
> > > @@ -418,6 +412,15 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
> > > }
> > > ret = iomap_readpage_actor(inode, pos + done, length - done,
> > > ctx, iomap, srcmap);
> > > + if (WARN_ON(ret == 0))
> > > + break;
> >
> > This error case now leaks ctx->cur_page....
>
> Yes ... and I see the consequence. I mean, this is a "shouldn't happen",
> so do we want to put effort into cleanup here ...
Well, the normal thing for XFS is that a production kernel cleans up
and handles the error gracefully with a WARN_ON_ONCE, while a debug
kernel build will chuck a tanty and burn the house down so to make
the developers aware that there is a "should not happen" situation
occurring....
> > > @@ -451,11 +454,7 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
> > > done:
> > > if (ctx.bio)
> > > submit_bio(ctx.bio);
> > > - if (ctx.cur_page) {
> > > - if (!ctx.cur_page_in_bio)
> > > - unlock_page(ctx.cur_page);
> > > - put_page(ctx.cur_page);
> > > - }
> > > + BUG_ON(ctx.cur_page);
> >
> > And so will now trigger both a warn and a bug....
>
> ... or do we just want to run slap bang into this bug?
>
> Option 1: Remove the check for 'ret == 0' altogether, as we had it before.
> That puts us into endless loop territory for a failure mode, and it's not
> parallel with iomap_readpage().
>
> Option 2: Remove the WARN_ON from the check. Then we just hit the BUG_ON,
> but we don't know why we did it.
>
> Option 3: Set cur_page to NULL. We'll hit the WARN_ON, avoid the BUG_ON,
> might end up with a page in the page cache which is never unlocked.
None of these are appealing.
> Option 4: Do the unlock/put page dance before setting the cur_page to NULL.
> We might double-unlock the page.
Why would we double-unlock the page?
Oh, the readahead cursor doesn't handle the case of partial page
submission, which would result in IO completion unlocking the page.
Ok, that's what the ctx.cur_page_in_bio check is used to detect, i.e.
if we've got a page that the readahead cursor points at, and we
haven't actually added it to a bio, then we can leave it to the
read_pages() to unlock and clean up. If it's in a bio, then IO
completion will unlock it and so we only have to drop the submission
reference and move the readahead cursor forwards so read_pages()
doesn't try to unlock this page. i.e:
/* clean up partial page submission failures */
if (ctx.cur_page && ctx.cur_page_in_bio) {
put_page(ctx.cur_page);
readahead_next(rac);
}
looks to me like it will handle the case of "ret == 0" in the actor
function just fine.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Feb 19, 2020 at 05:40:05PM +1100, Dave Chinner wrote:
> Ok, that's what the ctx.cur_page_in_bio check is used to detect i.e.
> if we've got a page that the readahead cursor points at, and we
> haven't actually added it to a bio, then we can leave it to the
> read_pages() to unlock and clean up. If it's in a bio, then IO
> completion will unlock it and so we only have to drop the submission
> reference and move the readahead cursor forwards so read_pages()
> doesn't try to unlock this page. i.e:
>
> /* clean up partial page submission failures */
> if (ctx.cur_page && ctx.cur_page_in_bio) {
> put_page(ctx.cur_page);
> readahead_next(rac);
> }
>
> looks to me like it will handle the case of "ret == 0" in the actor
> function just fine.
Here's what I ended up with:
@@ -400,15 +400,9 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
- loff_t done, ret;
-
- for (done = 0; done < length; done += ret) {
- if (ctx->cur_page && offset_in_page(pos + done) == 0) {
- if (!ctx->cur_page_in_bio)
- unlock_page(ctx->cur_page);
- put_page(ctx->cur_page);
- ctx->cur_page = NULL;
- }
+ loff_t ret, done = 0;
+
+ while (done < length) {
if (!ctx->cur_page) {
ctx->cur_page = iomap_next_page(inode, ctx->pages,
pos, length, &done);
@@ -418,6 +412,20 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
}
ret = iomap_readpage_actor(inode, pos + done, length - done,
ctx, iomap, srcmap);
+ done += ret;
+
+ /* Keep working on a partial page */
+ if (ret && offset_in_page(pos + done))
+ continue;
+
+ if (!ctx->cur_page_in_bio)
+ unlock_page(ctx->cur_page);
+ put_page(ctx->cur_page);
+ ctx->cur_page = NULL;
+
+ /* Don't loop forever if we made no progress */
+ if (WARN_ON(!ret))
+ break;
}
return done;
@@ -451,11 +459,7 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
done:
if (ctx.bio)
submit_bio(ctx.bio);
- if (ctx.cur_page) {
- if (!ctx.cur_page_in_bio)
- unlock_page(ctx.cur_page);
- put_page(ctx.cur_page);
- }
+ BUG_ON(ctx.cur_page);
/*
 	 * Check that we didn't lose a page due to the arcane calling
so we'll WARN if we get a ret == 0 (matching ->readpage), and we'll
BUG if we ever see a page being leaked out of readpages_actor, which
is a thing that should never happen and we definitely want to be noisy
about if it does.