2022-02-01 15:20:43

by NeilBrown

[permalink] [raw]
Subject: [PATCH 1/3] fuse: remove reliance on bdi congestion

The bdi congestion tracking is not widely used and will be removed.

Fuse is one of a small number of filesystems that uses it, setting both
the sync (read) and async (write) congestion flags at what it determines
are appropriate times.

The only remaining effect of the sync flag is to cause read-ahead to be
skipped.
The only remaining effect of the async flag is to cause (some)
WB_SYNC_NONE writes to be skipped.

So instead of setting the flags, change:
- .readahead to do nothing if the flag would be set
- .writepages to do nothing if WB_SYNC_NONE and the flag would be set
- .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE
and the flag would be set.

The writepage change causes a behavioural change in that pageout() can
now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will
be called on the page which (I think) will further delay the next attempt
at writeout. This might be a good thing.

Signed-off-by: NeilBrown <[email protected]>
---
fs/fuse/control.c | 17 -----------------
fs/fuse/dax.c | 3 +++
fs/fuse/dev.c | 8 --------
fs/fuse/file.c | 11 +++++++++++
4 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 000d2e5627e9..7cede9a3bc96 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
{
unsigned val;
struct fuse_conn *fc;
- struct fuse_mount *fm;
ssize_t ret;

ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
-
- /*
- * Get any fuse_mount belonging to this fuse_conn; s_bdi is
- * shared between all of them
- */
-
- if (!list_empty(&fc->mounts)) {
- fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
- if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- } else {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
- }
spin_unlock(&fc->bg_lock);
up_read(&fc->killsb);
fuse_conn_put(fc);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 182b24a14804..5f74e2585f50 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -781,6 +781,9 @@ static int fuse_dax_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);

+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return 0;
return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
}

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cd54a529460d..e1b4a846c90d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}

- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
@@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req)
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
queued = true;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 829094451774..b22a948be422 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -958,6 +958,8 @@ static void fuse_readahead(struct readahead_control *rac)

if (fuse_is_bad(inode))
return;
+ if (fc->num_background >= fc->congestion_threshold)
+ return;

max_pages = min_t(unsigned int, fc->max_pages,
fc->max_read / PAGE_SIZE);
@@ -1958,6 +1960,7 @@ static int fuse_writepage_locked(struct page *page)

static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
int err;

if (fuse_page_is_writeback(page->mapping->host, page->index)) {
@@ -1973,6 +1976,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}

+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return AOP_WRITEPAGE_ACTIVATE;
+
err = fuse_writepage_locked(page);
unlock_page(page);

@@ -2226,6 +2233,10 @@ static int fuse_writepages(struct address_space *mapping,
if (fuse_is_bad(inode))
goto out;

+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return AOP_WRITEPAGE_ACTIVATE;
+
data.inode = inode;
data.wpa = NULL;
data.ff = NULL;



2022-02-01 15:23:15

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/3] fuse: remove reliance on bdi congestion

On Mon, Jan 31, 2022 at 03:03:53PM +1100, NeilBrown wrote:
> diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
> index 182b24a14804..5f74e2585f50 100644
> --- a/fs/fuse/dax.c
> +++ b/fs/fuse/dax.c
> @@ -781,6 +781,9 @@ static int fuse_dax_writepages(struct address_space *mapping,
> struct inode *inode = mapping->host;
> struct fuse_conn *fc = get_fuse_conn(inode);
>
> + if (wbc->sync_mode == WB_SYNC_NONE &&
> + fc->num_background >= fc->congestion_threshold)
> + return 0;
> return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);

This makes no sense. Doing writeback for DAX means flushing the
CPU cache (in a terribly inefficient way), but it's not going to
be doing anything in the background; it's a sync operation.

> +++ b/fs/fuse/file.c
> @@ -958,6 +958,8 @@ static void fuse_readahead(struct readahead_control *rac)
>
> if (fuse_is_bad(inode))
> return;
> + if (fc->num_background >= fc->congestion_threshold)
> + return;

This seems like a bad idea to me. If we don't even start reads on
readahead pages, they'll get ->readpage called on them one at a time
and the reading thread will block. It's going to lead to some nasty
performance problems, exactly when you don't want them. Better to
queue the reads internally and wait for congestion to ease before
submitting the read.

2022-02-01 16:15:01

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 1/3] fuse: remove reliance on bdi congestion

On Mon, 31 Jan 2022 at 05:47, NeilBrown <[email protected]> wrote:

> > > +++ b/fs/fuse/file.c
> > > @@ -958,6 +958,8 @@ static void fuse_readahead(struct readahead_control *rac)
> > >
> > > if (fuse_is_bad(inode))
> > > return;
> > > + if (fc->num_background >= fc->congestion_threshold)
> > > + return;
> >
> > This seems like a bad idea to me. If we don't even start reads on
> > readahead pages, they'll get ->readpage called on them one at a time
> > and the reading thread will block. It's going to lead to some nasty
> > performance problems, exactly when you don't want them. Better to
> > queue the reads internally and wait for congestion to ease before
> > submitting the read.
> >
>
> Isn't that exactly what happens now? page_cache_async_ra() sees that
> inode_read_congested() returns true, so it doesn't start readahead.
> ???

I agree.

Fuse throttles async requests even before allocating them, which
precludes placing them on any queue. I guess it was done to limit the
amount of kernel memory pinned by a task (sync requests allow just one
request per task).

This has worked well, and I haven't heard complaints about performance
loss due to readahead throttling.

Thanks,
Miklos

2022-02-02 02:45:05

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/3] fuse: remove reliance on bdi congestion

On Tue, Feb 01, 2022 at 10:00:23AM +1100, NeilBrown wrote:
> On Tue, 01 Feb 2022, Matthew Wilcox wrote:
> > On Mon, Jan 31, 2022 at 03:47:41PM +1100, NeilBrown wrote:
> > > On Mon, 31 Jan 2022, Matthew Wilcox wrote:
> > > > > +++ b/fs/fuse/file.c
> > > > > @@ -958,6 +958,8 @@ static void fuse_readahead(struct readahead_control *rac)
> > > > >
> > > > > if (fuse_is_bad(inode))
> > > > > return;
> > > > > + if (fc->num_background >= fc->congestion_threshold)
> > > > > + return;
> > > >
> > > > This seems like a bad idea to me. If we don't even start reads on
> > > > readahead pages, they'll get ->readpage called on them one at a time
> > > > and the reading thread will block. It's going to lead to some nasty
> > > > performance problems, exactly when you don't want them. Better to
> > > > queue the reads internally and wait for congestion to ease before
> > > > submitting the read.
> > > >
> > >
> > > Isn't that exactly what happens now? page_cache_async_ra() sees that
> > > inode_read_congested() returns true, so it doesn't start readahead.
> > > ???
> >
> > It's rather different. Imagine the readahead window has expanded to
> > 256kB (64 pages). Today, we see congestion and don't do anything.
> > That means we miss the async readahead opportunity, find a missing
> > page and end up calling into page_cache_sync_ra(), by which time
> > we may or may not be congested.
> >
> > If the inode_read_congested() in page_cache_async_ra() is removed and
> > the patch above is added to replace it, we'll allocate those 64 pages and
> > add them to the page cache. But then we'll return without starting IO.
> > When we hit one of those !uptodate pages, we'll call ->readpage on it,
> > but we won't do anything to the other 63 pages. So we'll go through a
> > protracted slow period of sending 64 reads, one at a time, whether or
> > not congestion has eased. Then we'll hit a missing page and proceed
> > to the sync ra case as above.
>
> Hmmm... where is all this documented?
> The entry for readahead in vfs.rst says:
>
> If the filesystem decides to stop attempting I/O before reaching the
> end of the readahead window, it can simply return.
>
> but you are saying that if it simply returns, it'll most likely just get
> called again. So maybe it shouldn't say that?

That's not what I'm saying at all. I'm saying that if ->readahead fails
to read the page, ->readpage will be called to read the page (if it's
actually accessed).

> What do other filesystems do?
> ext4 sets REQ_RAHEAD, but otherwise just pushes ahead and submits all
> requests. btrfs seems much the same.
> xfs uses iomap_readahead .. which again sets REQ_RAHEAD but otherwise
> just does a normal read.
>
> The effect of REQ_RAHEAD seems to be primarily to avoid retries on
> failure.
>
> So it seems that core read-ahead code is not set up to expect readahead
> to fail, though it is (begrudgingly) permitted.

Well, yes. The vast majority of reads don't fail.

> The current inode_read_congested() test in page_cache_async_ra() seems
> to be just delaying the inevitable (and in fairness, the comment does
> say "Defer...."). Maybe just blocking on the congestion is an equally
> good way to delay it...

I don't think we should _block_ for an async read request. We're in the
context of a process which has read a different page. Maybe what we
need is a readahead_abandon() call that removes the just-added pages
from the page cache, so we fall back to doing a sync readahead?

> I note that ->readahead isn't told if the read-ahead is async or not, so
> my patch will drop sync read-ahead on congestion, which the current code
> doesn't do.

Now that we have a readahead_control, it's simple to add that
information to it.

> So maybe this congestion tracking really is useful, and we really want
> to keep it.
>
> I really would like to see that high-level documentation!!

I've done my best to add documentation. There's more than before
I started.

2022-02-07 11:25:39

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH 1/3] fuse: remove reliance on bdi congestion

On Tue, 01 Feb 2022, Matthew Wilcox wrote:
> On Tue, Feb 01, 2022 at 02:28:32PM +1100, NeilBrown wrote:
> > On Tue, 01 Feb 2022, Matthew Wilcox wrote:
> > > On Tue, Feb 01, 2022 at 10:00:23AM +1100, NeilBrown wrote:
> > > > I really would like to see that high-level documentation!!
> > >
> > > I've done my best to add documentation. There's more than before
> > > I started.
> >
> > I guess it's my turn then - if I can manage to understand it.
>
> It always works out better when two people are interested in the
> documentation.
>
>

Please review...

From: NeilBrown <[email protected]>
Subject: [PATCH] MM: document and polish read-ahead code.

Add some "big-picture" documentation for read-ahead and polish the code
to make it fit this documentation.

The meaning of ->async_size is clarified to match its name.
i.e. Any request to ->readahead() has a sync part and an async part.
The caller will wait for the sync pages to complete, but will not wait
for the async pages. The first async page is still marked PG_readahead.

- When ->readahead does not consume all pages, any remaining async pages
are now discarded with delete_from_page_cache(). This makes it
possible for the filesystem to delay readahead due e.g. to congestion.
- in try_context_readahead(), the async_size is set correctly rather
than being set to 1. Prior to Commit 2cad40180197 ("readahead: make
context readahead more conservative") it was set to ra->size which
is not correct (that implies no sync component). As this was too
high and caused problems it was reduced to 1, again incorrect but less
problematic. The setting provided with this patch does not restore
those problems, and is now not arbitrary.
- The calculation of ->async_size in the initial_readahead section of
ondemand_readahead() now makes sense - it is zero if the chosen
size does not exceed the requested size. This means that we will not
set the PG_readahead flag in this case, but as the requested size
has not been satisfied we can expect a subsequent read ahead request
any way.

Note that the current function names page_cache_sync_ra() and
page_cache_async_ra() are misleading. All ra requests are partly sync
and partly async, so either part can be empty.
A page_cache_sync_ra() request will usually set ->async_size non-zero,
implying it is not all synchronous.
When a non-zero req_count is passed to page_cache_async_ra(), the
implication is that some prefix of the request is synchronous, though
the calculation made there is incorrect - I haven't tried to fix it.

Signed-off-by: NeilBrown <[email protected]>
---
mm/readahead.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 103 insertions(+), 2 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index cf0dcf89eb69..5676f5c1aa39 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,6 +8,105 @@
* Initial version.
*/

+/**
+ * Readahead is used to read content into the page cache before it is
+ * explicitly requested by the application. Readahead only ever
+ * attempts to read pages which are not yet in the page cache. If a
+ * page is present but not up-to-date, readahead will not try to read
+ * it. In that case a simple ->readpage() will be requested.
+ *
+ * Readahead is triggered when an application read request (whether a
+ * systemcall or a page fault) finds that the requested page is not in
+ * the page cache, or that it is in the page cache and has the
+ * PG_readahead flag set. This flag indicates that the page was loaded
+ * as part of a previous read-ahead request and now that it has been
+ * accessed, it is time for the next read-ahead.
+ *
+ * Each readahead request is partly synchronous read, and partly async
+ * read-ahead. This is reflected in the struct file_ra_state which
+ * contains ->size being the total number of pages, and ->async_size
+ * which is the number of pages in the async section. The first page in
+ * this async section will have PG_readahead set as a trigger for a
+ * subsequent read ahead. Once a series of sequential reads has been
+ * established, there should be no need for a synchronous component and
+ * all read ahead requests will be fully asynchronous.
+ *
+ * When either of the triggers causes a readahead, three numbers need to
+ * be determined: the start of the region, the size of the region, and
+ * the size of the async tail.
+ *
+ * The start of the region is simply the first page address at or after
+ * the accessed address, which is not currently populated in the page
+ * cache. This is found with a simple search in the page cache.
+ *
+ * The size of the async tail is determined by subtracting the size that
+ * was explicitly requested from the determined request size, unless
+ * this would be less than zero - then zero is used. NOTE THIS
+ * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
+ * PAGE.
+ *
+ * The size of the region is normally determined from the size of the
+ * previous readahead which loaded the preceding pages. This may be
+ * discovered from the struct file_ra_state for simple sequential reads,
+ * or from examining the state of the page cache when multiple
+ * sequential reads are interleaved. Specifically: where the readahead
+ * was triggered by the PG_readahead flag, the size of the previous
+ * readahead is assumed to be the number of pages from the triggering
+ * page to the start of the new readahead. In these cases, the size of
+ * the previous readahead is scaled, often doubled, for the new
+ * readahead, though see get_next_ra_size() for details.
+ *
+ * If the size of the previous read cannot be determined, the number of
+ * preceding pages in the page cache is used to estimate the size of
+ * a previous read. This estimate could easily be misled by random
+ * reads being coincidentally adjacent, so it is ignored unless it is
+ * larger than the current request, and it is not scaled up, unless it
+ * is at the start of file.
+ *
+ * In general, read ahead is accelerated at the start of the file, as
+ * reads from there are often sequential. There are other minor
+ * adjustments to the read ahead size in various special cases and these
+ * are best discovered by reading the code.
+ *
+ * The above calculations determine the readahead, to which any requested
+ * read size may be added.
+ *
+ * Readahead requests are sent to the filesystem using the ->readahead
+ * address space operation, for which mpage_readahead() is a canonical
+ * implementation. ->readahead() should normally initiate reads on all
+ * pages, but may fail to read any or all pages without causing an IO
+ * error. The page cache reading code will issue a ->readpage() request
+ * for any page which ->readahead() does not provide, and only an error
+ * from this will be final.
+ *
+ * ->readahead will generally call readahead_page() repeatedly to get
+ * each page from those prepared for read ahead. It may fail to read a
+ * page by:
+ * - not calling readahead_page() sufficiently many times, effectively
+ * ignoring some pages, as might be appropriate if the path to
+ * storage is congested.
+ * - failing to actually submit a read request for a given page,
+ * possibly due to insufficient resources, or
+ * - getting an error during subsequent processing of a request.
+ * In the last two cases, the page should be unlocked to indicate that
+ * the read attempt has failed. In the first case the page will be
+ * unlocked by the caller.
+ *
+ * Those pages not in the final ``async_size`` of the request should be
+ * considered to be important and ->readahead() should not fail them due
+ * to congestion or temporary resource unavailability, but should wait
+ * for necessary resources (e.g. memory or indexing information) to
+ * become available. Pages in the final ``async_size`` may be
+ * considered less urgent and failure to read them is more acceptable.
+ * In this case it is best to use delete_from_page_cache() to remove the
+ * pages from the page cache as is automatically done for pages that
+ * were not fetched with readahead_page(). This will allow a
+ * subsequent synchronous read ahead request to try them again. If they
+ * are left in the page cache, then they will be read individually using
+ * ->readpage().
+ *
+ */
+
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
@@ -129,6 +228,8 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
aops->readahead(rac);
/* Clean up the remaining pages */
while ((page = readahead_page(rac))) {
+ if (rac->ra->async_pages >= readahead_count(rac))
+ delete_from_page_cache(page);
unlock_page(page);
put_page(page);
}
@@ -426,7 +527,7 @@ static int try_context_readahead(struct address_space *mapping,

ra->start = index;
ra->size = min(size + req_size, max);
- ra->async_size = 1;
+ ra->async_size = ra->size - req_size;

return 1;
}
@@ -527,7 +628,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
initial_readahead:
ra->start = index;
ra->size = get_init_ra_size(req_size, max_pages);
- ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
+ ra->async_size = ra->size > req_size ? ra->size - req_size : 0;

readit:
/*
--
2.35.1