From: "Matthew Wilcox (Oracle)" <[email protected]>
This replaces ->readpages with a saner interface:
- Return void instead of an ignored error code.
- Pages are already in the page cache when ->readahead is called.
- Implementation looks up the pages in the page cache instead of
having them passed in a linked list.
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
Documentation/filesystems/locking.rst | 6 +++++-
Documentation/filesystems/vfs.rst | 13 +++++++++++++
include/linux/fs.h | 2 ++
include/linux/pagemap.h | 18 ++++++++++++++++++
mm/readahead.c | 8 +++++++-
5 files changed, 45 insertions(+), 2 deletions(-)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 5057e4d9dcd1..0ebc4491025a 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -239,6 +239,7 @@ prototypes::
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ void (*readahead)(struct readahead_control *);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -271,7 +272,8 @@ writepage: yes, unlocks (see below)
readpage: yes, unlocks
writepages:
set_page_dirty no
-readpages:
+readahead: yes, unlocks
+readpages: no
write_begin: locks the page exclusive
write_end: yes, unlocks exclusive
bmap:
@@ -295,6 +297,8 @@ the request handler (/dev/loop).
->readpage() unlocks the page, either synchronously or via I/O
completion.
+->readahead() unlocks the pages like ->readpage().
+
->readpages() populates the pagecache with the passed pages and starts
I/O against them. They come unlocked upon I/O completion.
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 7d4d09dd5e6d..81ab30fbe45c 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -706,6 +706,7 @@ cache in your filesystem. The following members are defined:
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ void (*readahead)(struct readahead_control *);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -781,12 +782,24 @@ cache in your filesystem. The following members are defined:
If defined, it should set the PageDirty flag, and the
PAGECACHE_TAG_DIRTY tag in the radix tree.
+``readahead``
+ Called by the VM to read pages associated with the address_space
+ object. The pages are consecutive in the page cache and are
+ locked. The implementation should decrement the page refcount
+ after starting I/O on each page. Usually the page will be
+ unlocked by the I/O completion handler. If the function does
+ not attempt I/O on some pages, the caller will decrement the page
+ refcount and unlock the pages for you. Set PageUptodate if the
+ I/O completes successfully. Setting PageError on any page will
+ be ignored; simply unlock the page if an I/O error occurs.
+
``readpages``
called by the VM to read pages associated with the address_space
object. This is essentially just a vector version of readpage.
Instead of just one page, several pages are requested.
readpages is only used for read-ahead, so read errors are
ignored. If anything goes wrong, feel free to give up.
+ This interface is deprecated; implement readahead instead.
``write_begin``
Called by the generic buffered write code to ask the filesystem
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3cd4fe6b845e..d4e2d2964346 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -292,6 +292,7 @@ enum positive_aop_returns {
struct page;
struct address_space;
struct writeback_control;
+struct readahead_control;
/*
* Write life time hint values.
@@ -375,6 +376,7 @@ struct address_space_operations {
*/
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
+ void (*readahead)(struct readahead_control *);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3613154e79e4..bd4291f78f41 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -665,6 +665,24 @@ static inline void readahead_next(struct readahead_control *rac)
#define readahead_for_each(rac, page) \
for (; (page = readahead_page(rac)); readahead_next(rac))
+/* The byte offset into the file of this readahead block */
+static inline loff_t readahead_offset(struct readahead_control *rac)
+{
+ return (loff_t)rac->_start * PAGE_SIZE;
+}
+
+/* The number of bytes in this readahead block */
+static inline loff_t readahead_length(struct readahead_control *rac)
+{
+ return (loff_t)rac->_nr_pages * PAGE_SIZE;
+}
+
+/* The index of the first page in this readahead block */
+static inline unsigned int readahead_index(struct readahead_control *rac)
+{
+ return rac->_start;
+}
+
/* The number of pages in this readahead block */
static inline unsigned int readahead_count(struct readahead_control *rac)
{
diff --git a/mm/readahead.c b/mm/readahead.c
index 9e430daae42f..975ff5e387be 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -121,7 +121,13 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages)
blk_start_plug(&plug);
- if (aops->readpages) {
+ if (aops->readahead) {
+ aops->readahead(rac);
+ readahead_for_each(rac, page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ } else if (aops->readpages) {
aops->readpages(rac->file, rac->mapping, pages,
readahead_count(rac));
/* Clean up the remaining pages */
--
2.25.0
On Tue, Feb 18, 2020 at 05:21:47PM +1100, Dave Chinner wrote:
> On Mon, Feb 17, 2020 at 10:45:54AM -0800, Matthew Wilcox wrote:
> > From: "Matthew Wilcox (Oracle)" <[email protected]>
> >
> > This replaces ->readpages with a saner interface:
> > - Return void instead of an ignored error code.
> > - Pages are already in the page cache when ->readahead is called.
>
> Might read better as:
>
> - Page cache is already populates with locked pages when
> ->readahead is called.
Will do.
> > - Implementation looks up the pages in the page cache instead of
> > having them passed in a linked list.
>
> Add:
>
> - cleanup of unused readahead handled by ->readahead caller, not
> the method implementation.
The readpages caller does that cleanup too, so it's not an advantage
to the readahead interface.
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
goto out;
}
> > ``readpages``
> > called by the VM to read pages associated with the address_space
> > object. This is essentially just a vector version of readpage.
> > Instead of just one page, several pages are requested.
> > readpages is only used for read-ahead, so read errors are
> > ignored. If anything goes wrong, feel free to give up.
> > + This interface is deprecated; implement readahead instead.
>
> What is the removal schedule for the deprecated interface?
I mentioned that in the cover letter; once Dave Howells has the fscache
branch merged, I'll do the remaining filesystems. Should be within the
next couple of merge windows.
> > +/* The byte offset into the file of this readahead block */
> > +static inline loff_t readahead_offset(struct readahead_control *rac)
> > +{
> > + return (loff_t)rac->_start * PAGE_SIZE;
> > +}
>
> Urk. Didn't an early page use "offset" for the page index? That
> was was "mm: Remove 'page_offset' from readahead loop" did, right?
>
> That's just going to cause confusion to have different units for
> readahead "offsets"....
We are ... not consistent anywhere in the VM/VFS with our naming.
Unfortunately.
$ grep -n offset mm/filemap.c
391: * @start: offset in bytes where the range starts
...
815: pgoff_t offset = old->index;
...
2020: unsigned long offset; /* offset into pagecache page */
...
2257: *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
That last one's my favourite. Not to mention the fine distinction you
and I discussed recently between offset_in_page() and page_offset().
Best of all, even our types encode the ambiguity of an 'offset'. We have
pgoff_t and loff_t (replacing the earlier off_t).
So, new rule. 'pos' is the number of bytes into a file. 'index' is the
number of PAGE_SIZE pages into a file. We don't use the word 'offset'
at all. 'length' as a byte count and 'count' as a page count seem like
fine names to me.
> > - if (aops->readpages) {
> > + if (aops->readahead) {
> > + aops->readahead(rac);
> > + readahead_for_each(rac, page) {
> > + unlock_page(page);
> > + put_page(page);
> > + }
>
> This needs a comment to explain the unwinding that needs to be done
> here. I'm not going to remember in a year's time that this is just
> for the pages that weren't submitted by ->readahead....
ACK.
On 2/17/20 10:45 AM, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <[email protected]>
>
> This replaces ->readpages with a saner interface:
> - Return void instead of an ignored error code.
> - Pages are already in the page cache when ->readahead is called.
> - Implementation looks up the pages in the page cache instead of
> having them passed in a linked list.
>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
> ---
> Documentation/filesystems/locking.rst | 6 +++++-
> Documentation/filesystems/vfs.rst | 13 +++++++++++++
> include/linux/fs.h | 2 ++
> include/linux/pagemap.h | 18 ++++++++++++++++++
> mm/readahead.c | 8 +++++++-
> 5 files changed, 45 insertions(+), 2 deletions(-)
>
Looks nice,
Reviewed-by: John Hubbard <[email protected]>
thanks,
--
John Hubbard
NVIDIA
> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
> index 5057e4d9dcd1..0ebc4491025a 100644
> --- a/Documentation/filesystems/locking.rst
> +++ b/Documentation/filesystems/locking.rst
> @@ -239,6 +239,7 @@ prototypes::
> int (*readpage)(struct file *, struct page *);
> int (*writepages)(struct address_space *, struct writeback_control *);
> int (*set_page_dirty)(struct page *page);
> + void (*readahead)(struct readahead_control *);
> int (*readpages)(struct file *filp, struct address_space *mapping,
> struct list_head *pages, unsigned nr_pages);
> int (*write_begin)(struct file *, struct address_space *mapping,
> @@ -271,7 +272,8 @@ writepage: yes, unlocks (see below)
> readpage: yes, unlocks
> writepages:
> set_page_dirty no
> -readpages:
> +readahead: yes, unlocks
> +readpages: no
> write_begin: locks the page exclusive
> write_end: yes, unlocks exclusive
> bmap:
> @@ -295,6 +297,8 @@ the request handler (/dev/loop).
> ->readpage() unlocks the page, either synchronously or via I/O
> completion.
>
> +->readahead() unlocks the pages like ->readpage().
> +
> ->readpages() populates the pagecache with the passed pages and starts
> I/O against them. They come unlocked upon I/O completion.
>
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index 7d4d09dd5e6d..81ab30fbe45c 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -706,6 +706,7 @@ cache in your filesystem. The following members are defined:
> int (*readpage)(struct file *, struct page *);
> int (*writepages)(struct address_space *, struct writeback_control *);
> int (*set_page_dirty)(struct page *page);
> + void (*readahead)(struct readahead_control *);
> int (*readpages)(struct file *filp, struct address_space *mapping,
> struct list_head *pages, unsigned nr_pages);
> int (*write_begin)(struct file *, struct address_space *mapping,
> @@ -781,12 +782,24 @@ cache in your filesystem. The following members are defined:
> If defined, it should set the PageDirty flag, and the
> PAGECACHE_TAG_DIRTY tag in the radix tree.
>
> +``readahead``
> + Called by the VM to read pages associated with the address_space
> + object. The pages are consecutive in the page cache and are
> + locked. The implementation should decrement the page refcount
> + after starting I/O on each page. Usually the page will be
> + unlocked by the I/O completion handler. If the function does
> + not attempt I/O on some pages, the caller will decrement the page
> + refcount and unlock the pages for you. Set PageUptodate if the
> + I/O completes successfully. Setting PageError on any page will
> + be ignored; simply unlock the page if an I/O error occurs.
> +
> ``readpages``
> called by the VM to read pages associated with the address_space
> object. This is essentially just a vector version of readpage.
> Instead of just one page, several pages are requested.
> readpages is only used for read-ahead, so read errors are
> ignored. If anything goes wrong, feel free to give up.
> + This interface is deprecated; implement readahead instead.
>
> ``write_begin``
> Called by the generic buffered write code to ask the filesystem
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 3cd4fe6b845e..d4e2d2964346 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -292,6 +292,7 @@ enum positive_aop_returns {
> struct page;
> struct address_space;
> struct writeback_control;
> +struct readahead_control;
>
> /*
> * Write life time hint values.
> @@ -375,6 +376,7 @@ struct address_space_operations {
> */
> int (*readpages)(struct file *filp, struct address_space *mapping,
> struct list_head *pages, unsigned nr_pages);
> + void (*readahead)(struct readahead_control *);
>
> int (*write_begin)(struct file *, struct address_space *mapping,
> loff_t pos, unsigned len, unsigned flags,
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 3613154e79e4..bd4291f78f41 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -665,6 +665,24 @@ static inline void readahead_next(struct readahead_control *rac)
> #define readahead_for_each(rac, page) \
> for (; (page = readahead_page(rac)); readahead_next(rac))
>
> +/* The byte offset into the file of this readahead block */
> +static inline loff_t readahead_offset(struct readahead_control *rac)
> +{
> + return (loff_t)rac->_start * PAGE_SIZE;
> +}
> +
> +/* The number of bytes in this readahead block */
> +static inline loff_t readahead_length(struct readahead_control *rac)
> +{
> + return (loff_t)rac->_nr_pages * PAGE_SIZE;
> +}
> +
> +/* The index of the first page in this readahead block */
> +static inline unsigned int readahead_index(struct readahead_control *rac)
> +{
> + return rac->_start;
> +}
> +
> /* The number of pages in this readahead block */
> static inline unsigned int readahead_count(struct readahead_control *rac)
> {
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 9e430daae42f..975ff5e387be 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -121,7 +121,13 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages)
>
> blk_start_plug(&plug);
>
> - if (aops->readpages) {
> + if (aops->readahead) {
> + aops->readahead(rac);
> + readahead_for_each(rac, page) {
> + unlock_page(page);
> + put_page(page);
> + }
> + } else if (aops->readpages) {
> aops->readpages(rac->file, rac->mapping, pages,
> readahead_count(rac));
> /* Clean up the remaining pages */
>
On Tue, Feb 18, 2020 at 07:10:44PM -0800, Eric Biggers wrote:
> > +``readahead``
> > + Called by the VM to read pages associated with the address_space
> > + object. The pages are consecutive in the page cache and are
> > + locked. The implementation should decrement the page refcount
> > + after starting I/O on each page. Usually the page will be
> > + unlocked by the I/O completion handler. If the function does
> > + not attempt I/O on some pages, the caller will decrement the page
> > + refcount and unlock the pages for you. Set PageUptodate if the
> > + I/O completes successfully. Setting PageError on any page will
> > + be ignored; simply unlock the page if an I/O error occurs.
> > +
>
> This is unclear about how "not attempting I/O" works and how that affects who is
> responsible for putting and unlocking the pages. How does the caller know which
> pages were not attempted? Can any arbitrary subset of pages be not attempted,
> or just the last N pages?
Changed to:
``readahead``
Called by the VM to read pages associated with the address_space
object. The pages are consecutive in the page cache and are
locked. The implementation should decrement the page refcount
after starting I/O on each page. Usually the page will be
unlocked by the I/O completion handler. If the filesystem decides
to stop attempting I/O before reaching the end of the readahead
window, it can simply return. The caller will decrement the page
refcount and unlock the remaining pages for you. Set PageUptodate
if the I/O completes successfully. Setting PageError on any page
will be ignored; simply unlock the page if an I/O error occurs.