LinuxLists.cc - [PATCH] mm : sync ra->ra_pages with bdi->ra

2020-08-14 09:05:23

Subject: [PATCH] mm : sync ra->ra_pages with bdi->ra_pages

Some system(like android) will turbo read during startup via expanding the
readahead window and then set it back to normal(128kb as usual). However, some
files in the system process context will keep to be opened since it is opened
up and has no chance to sync with the updated value as it is almost impossible
to change the files attached to the inode(processes are unaware of these things)

We sync ra->ra_pages with bdi->ra_pages when read. Furthermore, in consideration
of the scenario of eio and fadvise(...,POSIX_FADV_SEQUENTIAL).We introduce a
seq_read_fact to record the factors of above two cases.

Signed-off-by: Zhaoyang Huang <[email protected]>
---
include/linux/fs.h | 17 +++++++++++++++++
mm/fadvise.c | 4 +++-
mm/filemap.c | 19 +++++++++++++------
mm/readahead.c | 38 ++++++++++++++++++++++++++++++++++----
4 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd28e76..e3cdc5a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -66,6 +66,7 @@
struct fscrypt_operations;
struct fs_context;
struct fs_parameter_description;
+struct file_ra_state;

extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -81,6 +82,7 @@
extern int sysctl_protected_hardlinks;
extern int sysctl_protected_fifos;
extern int sysctl_protected_regular;
+extern void ra_pages_sync(struct file_ra_state *ra, struct address_space *mapping);

typedef __kernel_rwf_t rwf_t;

@@ -900,11 +902,26 @@ struct file_ra_state {
there are only # of pages ahead */

unsigned int ra_pages; /* Maximum readahead window */
+ int seq_read_fact; /* turbo factor of sequential read */
unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
loff_t prev_pos; /* Cache last read() position */
};

/*
+ * ra->seq_read_fact == -1 indicates eio happens
+ */
+#define RA_PAGES(ra) \
+({ \
+ unsigned int ra_pages; \
+ if (ra->seq_read_fact != -1) \
+ ra_pages = ra->ra_pages * ra->seq_read_fact; \
+ else \
+ ra_pages = ra->ra_pages; \
+ ra_pages; \
+})
+
+
+/*
* Check if @index falls in the readahead windows.
*/
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd0..b06e3ca 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -78,6 +78,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
switch (advice) {
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
+ file->f_ra.seq_read_fact = 1;
spin_lock(&file->f_lock);
file->f_mode &= ~FMODE_RANDOM;
spin_unlock(&file->f_lock);
@@ -88,7 +89,8 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_SEQUENTIAL:
- file->f_ra.ra_pages = bdi->ra_pages * 2;
+ file->f_ra.ra_pages = bdi->ra_pages;
+ file->f_ra.seq_read_fact = 2;
spin_lock(&file->f_lock);
file->f_mode &= ~FMODE_RANDOM;
spin_unlock(&file->f_lock);
diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577..425d2a2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2048,6 +2048,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
static void shrink_readahead_size_eio(struct file *filp,
struct file_ra_state *ra)
{
+ ra->seq_read_fact = -1;
ra->ra_pages /= 4;
}

@@ -2473,13 +2474,16 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
- if (!ra->ra_pages)
+ if (!RA_PAGES(ra))
return fpin;

+ /* sync ra->ra_pages with bdi->ra_pages*/
+ ra_pages_sync(ra, mapping);
+
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ RA_PAGES(ra));
return fpin;
}

@@ -2498,9 +2502,9 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
- ra->size = ra->ra_pages;
- ra->async_size = ra->ra_pages / 4;
+ ra->start = max_t(long, 0, offset - RA_PAGES(ra) / 2);
+ ra->size = RA_PAGES(ra);
+ ra->async_size = RA_PAGES(ra) / 4;
ra_submit(ra, mapping, file);
return fpin;
}
@@ -2519,6 +2523,9 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
pgoff_t offset = vmf->pgoff;

+ /* sync ra->ra_pages with bdi->ra_pages*/
+ ra_pages_sync(ra, mapping);
+
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
@@ -2527,7 +2534,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
if (PageReadahead(page)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
- page, offset, ra->ra_pages);
+ page, offset, RA_PAGES(ra));
}
return fpin;
}
diff --git a/mm/readahead.c b/mm/readahead.c
index a459365..e994c5a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -32,10 +32,27 @@
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
+ ra->seq_read_fact = 1;
ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

+/*sync ra->ra_pages with bdi->ra_pages*/
+void ra_pages_sync(struct file_ra_state *ra,
+ struct address_space *mapping)
+{
+ unsigned int ra_pages = ra->ra_pages;
+ if (ra->seq_read_fact == -1)
+ return;
+
+ ra_pages = inode_to_bdi(mapping->host)->ra_pages * ra->seq_read_fact;
+ if (RA_PAGES(ra) != ra_pages) {
+ ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
+ }
+ return;
+}
+EXPORT_SYMBOL_GPL(ra_pages_sync);
+
/*
* see if a page needs releasing upon read_cache_pages() failure
* - the caller of read_cache_pages() may have set PG_private or PG_fscache
@@ -228,11 +245,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;

+ /* sync ra->ra_pages with bdi->ra_pages*/
+ ra_pages_sync(ra, mapping);
+
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
- max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
+ max_pages = max_t(unsigned long, bdi->io_pages, RA_PAGES(ra));
nr_to_read = min(nr_to_read, max_pages);
while (nr_to_read) {
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
@@ -384,10 +404,14 @@ static int try_context_readahead(struct address_space *mapping,
unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
- unsigned long max_pages = ra->ra_pages;
+ unsigned long max_pages;
unsigned long add_pages;
pgoff_t prev_offset;

+ /* sync ra->ra_pages with bdi->ra_pages*/
+ ra_pages_sync(ra, mapping);
+
+ max_pages = RA_PAGES(ra);
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
@@ -510,9 +534,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
pgoff_t offset, unsigned long req_size)
{
/* no read-ahead */
- if (!ra->ra_pages)
+ if (!RA_PAGES(ra))
return;

+ /* sync ra->ra_pages with bdi->ra_pages*/
+ ra_pages_sync(ra, mapping);
+
if (blk_cgroup_congested())
return;

@@ -549,9 +576,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
unsigned long req_size)
{
/* no read-ahead */
- if (!ra->ra_pages)
+ if (!RA_PAGES(ra))
return;

+ /* sync ra->ra_pages with bdi->ra_pages*/
+ ra_pages_sync(ra, mapping);
+
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
--
1.9.1

2020-08-14 11:49:45

by Matthew Wilcox

[permalink] [raw]

Subject: Re: [PATCH] mm : sync ra->ra_pages with bdi->ra_pages

On Fri, Aug 14, 2020 at 05:03:44PM +0800, Zhaoyang Huang wrote:
> Some system(like android) will turbo read during startup via expanding the
> readahead window and then set it back to normal(128kb as usual). However, some
> files in the system process context will keep to be opened since it is opened
> up and has no chance to sync with the updated value as it is almost impossible
> to change the files attached to the inode(processes are unaware of these things)

Hang on, you're saying that the _only_ way to do turbo-read is the way
that userspace currently does it, and userspace cannot be modified,
only the kernel? I think somebody has their priorities wrong!

> We sync ra->ra_pages with bdi->ra_pages when read. Furthermore, in consideration
> of the scenario of eio and fadvise(...,POSIX_FADV_SEQUENTIAL).We introduce a
> seq_read_fact to record the factors of above two cases.

No, this is far too complicated.

2020-08-14 20:31:23

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH] mm : sync ra->ra_pages with bdi->ra_pages

On Fri, 14 Aug 2020 17:03:44 +0800 Zhaoyang Huang <[email protected]> wrote:

> Some system(like android) will turbo read during startup via expanding the
> readahead window and then set it back to normal(128kb as usual). However, some
> files in the system process context will keep to be opened since it is opened
> up and has no chance to sync with the updated value as it is almost impossible
> to change the files attached to the inode(processes are unaware of these things)

How about making VM_READAHEAD_PAGES a variable? Provide a kernel boot
parameter to alter the default setting. Userspace can then restore the
usual value later in boot?

2020-08-15 22:14:58

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH] mm : sync ra->ra_pages with bdi->ra_pages

On Fri, 14 Aug 2020 13:10:34 -0700 Andrew Morton <[email protected]> wrote:

> On Fri, 14 Aug 2020 17:03:44 +0800 Zhaoyang Huang <[email protected]> wrote:
>
> > Some system(like android) will turbo read during startup via expanding the
> > readahead window and then set it back to normal(128kb as usual). However, some
> > files in the system process context will keep to be opened since it is opened
> > up and has no chance to sync with the updated value as it is almost impossible
> > to change the files attached to the inode(processes are unaware of these things)
>
> How about making VM_READAHEAD_PAGES a variable?

Or make it settable in Kconfig?

2020-08-17 11:31:56

by Zhaoyang Huang

[permalink] [raw]

Subject: Re: [PATCH] mm : sync ra->ra_pages with bdi->ra_pages

On Sat, Aug 15, 2020 at 12:15 PM Andrew Morton
<[email protected]> wrote:
>
> On Fri, 14 Aug 2020 13:10:34 -0700 Andrew Morton <[email protected]> wrote:
>
> > On Fri, 14 Aug 2020 17:03:44 +0800 Zhaoyang Huang <[email protected]> wrote:
> >
> > > Some system(like android) will turbo read during startup via expanding the
> > > readahead window and then set it back to normal(128kb as usual). However, some
> > > files in the system process context will keep to be opened since it is opened
> > > up and has no chance to sync with the updated value as it is almost impossible
> > > to change the files attached to the inode(processes are unaware of these things)
> >
> > How about making VM_READAHEAD_PAGES a variable?
>
> Or make it settable in Kconfig?
I don't think so. The scenario I gave before is a dynamic process,
can't be solved via menuconfig thing.

2020-08-19 06:56:21

by Zhaoyang Huang

[permalink] [raw]

Subject: Re: [PATCH] mm : sync ra->ra_pages with bdi->ra_pages

On Fri, Aug 14, 2020 at 5:03 PM Zhaoyang Huang <[email protected]> wrote:
>
> Some system(like android) will turbo read during startup via expanding the
> readahead window and then set it back to normal(128kb as usual). However, some
> files in the system process context will keep to be opened since it is opened
> up and has no chance to sync with the updated value as it is almost impossible
> to change the files attached to the inode(processes are unaware of these things)
>
> We sync ra->ra_pages with bdi->ra_pages when read. Furthermore, in consideration
> of the scenario of eio and fadvise(...,POSIX_FADV_SEQUENTIAL).We introduce a
> seq_read_fact to record the factors of above two cases.
>
> Signed-off-by: Zhaoyang Huang <[email protected]>
> ---
> include/linux/fs.h | 17 +++++++++++++++++
> mm/fadvise.c | 4 +++-
> mm/filemap.c | 19 +++++++++++++------
> mm/readahead.c | 38 ++++++++++++++++++++++++++++++++++----
> 4 files changed, 67 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index dd28e76..e3cdc5a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -66,6 +66,7 @@
> struct fscrypt_operations;
> struct fs_context;
> struct fs_parameter_description;
> +struct file_ra_state;
>
> extern void __init inode_init(void);
> extern void __init inode_init_early(void);
> @@ -81,6 +82,7 @@
> extern int sysctl_protected_hardlinks;
> extern int sysctl_protected_fifos;
> extern int sysctl_protected_regular;
> +extern void ra_pages_sync(struct file_ra_state *ra, struct address_space *mapping);
>
> typedef __kernel_rwf_t rwf_t;
>
> @@ -900,11 +902,26 @@ struct file_ra_state {
> there are only # of pages ahead */
>
> unsigned int ra_pages; /* Maximum readahead window */
> + int seq_read_fact; /* turbo factor of sequential read */
> unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
> loff_t prev_pos; /* Cache last read() position */
> };
>
> /*
> + * ra->seq_read_fact == -1 indicates eio happens
> + */
> +#define RA_PAGES(ra) \
> +({ \
> + unsigned int ra_pages; \
> + if (ra->seq_read_fact != -1) \
> + ra_pages = ra->ra_pages * ra->seq_read_fact; \
> + else \
> + ra_pages = ra->ra_pages; \
> + ra_pages; \
> +})
> +
> +
> +/*
> * Check if @index falls in the readahead windows.
> */
> static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
> diff --git a/mm/fadvise.c b/mm/fadvise.c
> index 467bcd0..b06e3ca 100644
> --- a/mm/fadvise.c
> +++ b/mm/fadvise.c
> @@ -78,6 +78,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
> switch (advice) {
> case POSIX_FADV_NORMAL:
> file->f_ra.ra_pages = bdi->ra_pages;
> + file->f_ra.seq_read_fact = 1;
> spin_lock(&file->f_lock);
> file->f_mode &= ~FMODE_RANDOM;
> spin_unlock(&file->f_lock);
> @@ -88,7 +89,8 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
> spin_unlock(&file->f_lock);
> break;
> case POSIX_FADV_SEQUENTIAL:
> - file->f_ra.ra_pages = bdi->ra_pages * 2;
> + file->f_ra.ra_pages = bdi->ra_pages;
> + file->f_ra.seq_read_fact = 2;
> spin_lock(&file->f_lock);
> file->f_mode &= ~FMODE_RANDOM;
> spin_unlock(&file->f_lock);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index d78f577..425d2a2 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2048,6 +2048,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
> static void shrink_readahead_size_eio(struct file *filp,
> struct file_ra_state *ra)
> {
> + ra->seq_read_fact = -1;
> ra->ra_pages /= 4;
> }
>
> @@ -2473,13 +2474,16 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> /* If we don't want any read-ahead, don't bother */
> if (vmf->vma->vm_flags & VM_RAND_READ)
> return fpin;
> - if (!ra->ra_pages)
> + if (!RA_PAGES(ra))
> return fpin;
>
> + /* sync ra->ra_pages with bdi->ra_pages*/
> + ra_pages_sync(ra, mapping);
> +
> if (vmf->vma->vm_flags & VM_SEQ_READ) {
> fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> page_cache_sync_readahead(mapping, ra, file, offset,
> - ra->ra_pages);
> + RA_PAGES(ra));
> return fpin;
> }
>
> @@ -2498,9 +2502,9 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> * mmap read-around
> */
> fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> - ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
> - ra->size = ra->ra_pages;
> - ra->async_size = ra->ra_pages / 4;
> + ra->start = max_t(long, 0, offset - RA_PAGES(ra) / 2);
> + ra->size = RA_PAGES(ra);
> + ra->async_size = RA_PAGES(ra) / 4;
> ra_submit(ra, mapping, file);
> return fpin;
> }
> @@ -2519,6 +2523,9 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> struct file *fpin = NULL;
> pgoff_t offset = vmf->pgoff;
>
> + /* sync ra->ra_pages with bdi->ra_pages*/
> + ra_pages_sync(ra, mapping);
> +
> /* If we don't want any read-ahead, don't bother */
> if (vmf->vma->vm_flags & VM_RAND_READ)
> return fpin;
> @@ -2527,7 +2534,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> if (PageReadahead(page)) {
> fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> page_cache_async_readahead(mapping, ra, file,
> - page, offset, ra->ra_pages);
> + page, offset, RA_PAGES(ra));
> }
> return fpin;
> }
> diff --git a/mm/readahead.c b/mm/readahead.c
> index a459365..e994c5a 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -32,10 +32,27 @@
> file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
> {
> ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
> + ra->seq_read_fact = 1;
> ra->prev_pos = -1;
> }
> EXPORT_SYMBOL_GPL(file_ra_state_init);
>
> +/*sync ra->ra_pages with bdi->ra_pages*/
> +void ra_pages_sync(struct file_ra_state *ra,
> + struct address_space *mapping)
> +{
> + unsigned int ra_pages = ra->ra_pages;
> + if (ra->seq_read_fact == -1)
> + return;
> +
> + ra_pages = inode_to_bdi(mapping->host)->ra_pages * ra->seq_read_fact;
> + if (RA_PAGES(ra) != ra_pages) {
> + ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
> + }
> + return;
> +}
> +EXPORT_SYMBOL_GPL(ra_pages_sync);
> +
> /*
> * see if a page needs releasing upon read_cache_pages() failure
> * - the caller of read_cache_pages() may have set PG_private or PG_fscache
> @@ -228,11 +245,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
> if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
> return -EINVAL;
>
> + /* sync ra->ra_pages with bdi->ra_pages*/
> + ra_pages_sync(ra, mapping);
> +
> /*
> * If the request exceeds the readahead window, allow the read to
> * be up to the optimal hardware IO size
> */
> - max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
> + max_pages = max_t(unsigned long, bdi->io_pages, RA_PAGES(ra));
> nr_to_read = min(nr_to_read, max_pages);
> while (nr_to_read) {
> unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
> @@ -384,10 +404,14 @@ static int try_context_readahead(struct address_space *mapping,
> unsigned long req_size)
> {
> struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
> - unsigned long max_pages = ra->ra_pages;
> + unsigned long max_pages;
> unsigned long add_pages;
> pgoff_t prev_offset;
>
> + /* sync ra->ra_pages with bdi->ra_pages*/
> + ra_pages_sync(ra, mapping);
> +
> + max_pages = RA_PAGES(ra);
> /*
> * If the request exceeds the readahead window, allow the read to
> * be up to the optimal hardware IO size
> @@ -510,9 +534,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
> pgoff_t offset, unsigned long req_size)
> {
> /* no read-ahead */
> - if (!ra->ra_pages)
> + if (!RA_PAGES(ra))
> return;
>
> + /* sync ra->ra_pages with bdi->ra_pages*/
> + ra_pages_sync(ra, mapping);
> +
> if (blk_cgroup_congested())
> return;
>
> @@ -549,9 +576,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
> unsigned long req_size)
> {
> /* no read-ahead */
> - if (!ra->ra_pages)
> + if (!RA_PAGES(ra))
> return;
>
> + /* sync ra->ra_pages with bdi->ra_pages*/
> + ra_pages_sync(ra, mapping);
> +
> /*
> * Same bit is used for PG_readahead and PG_reclaim.
> */
> --
> 1.9.1
>
Any further comments on this commit? In response to previous comments,
static parameter configuration can NOT solve the problem, as more
systems would like to adjust the IO behaviour(via bdi not per file)
from userspace.