I'm using "v3" simply because the v2 series of NFSD page allocator
work included the same bulk-release concept in a different form.
v2 has now been merged (thanks, Mel!). However, the bulk-release
part of that series was postponed.
Consider v3 to be an RFC refresh.
As with the page allocation side, I'm trying to reduce the average
number of times NFSD invokes the page allocation and release APIs
because they can be expensive, and because it is a resource that is
shared amongst all nfsd threads and thus access to it is partially
serialized. This small series tackles a code path that is frequently
invoked when NFSD handles READ operations on local filesystems that
support splice (i.e., most of the popular ones).
The previous version of this proposal placed the unused pages on
a local list and then re-used the pages directly in svc_alloc_arg()
before invoking alloc_pages_bulk_array() to fill in any remaining
missing rq_pages entries.
This meant there would be the possibility of some workloads that
caused accrual of pages without bounds, so the finished version of
that logic would have to be complex and possibly involve a shrinker.
In this version, I'm simply handing the pages back to the page
allocator, so all that complexity vanishes. What makes it more
efficient is that instead of calling put_page() for each page,
the code collects the unused pages in a per-nfsd thread array, and
returns them to the allocator using a bulk free API (release_pages)
when the array is full.
In this version of the series, each nfsd thread never accrues more
than 16 pages. We can easily make that larger or smaller, but 16
already reduces the rate of put_page() calls to a minute fraction
of what it was, and does not consume much additional space in struct
svc_rqst.
Comments welcome!
---
Chuck Lever (3):
NFSD: Clean up splice actor
SUNRPC: Add svc_rqst_replace_page() API
NFSD: Batch release pages during splice read
fs/nfsd/vfs.c | 20 +++++---------------
include/linux/sunrpc/svc.h | 5 +++++
net/sunrpc/svc.c | 29 +++++++++++++++++++++++++++++
3 files changed, 39 insertions(+), 15 deletions(-)
--
Chuck Lever
Replacing a page in rq_pages[] requires a get_page(), which is a
bus-locked operation, and a put_page(), which can be even more
costly.
To reduce the cost of replacing a page, batch the put_page()
operations by collecting "free" pages in an array, and then
passing them to release_pages() when the array becomes full.
Signed-off-by: Chuck Lever <[email protected]>
---
include/linux/sunrpc/svc.h | 5 +++++
net/sunrpc/svc.c | 29 +++++++++++++++++++++++++++++
2 files changed, 34 insertions(+)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e91d51ea028b..68a9f51e2624 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -256,6 +256,9 @@ struct svc_rqst {
struct page * *rq_next_page; /* next reply page to use */
struct page * *rq_page_end; /* one past the last page */
+ struct page *rq_relpages[16];
+ int rq_numrelpages;
+
struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */
struct bio_vec rq_bvec[RPCSVC_MAXPAGES];
@@ -502,6 +505,8 @@ struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv,
struct svc_pool *pool, int node);
struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
struct svc_pool *pool, int node);
+void svc_rqst_replace_page(struct svc_rqst *rqstp,
+ struct page *page);
void svc_rqst_free(struct svc_rqst *);
void svc_exit_thread(struct svc_rqst *);
unsigned int svc_pool_map_get(void);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 0de918cb3d90..3679830cf123 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/slab.h>
+#include <linux/pagemap.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/xdr.h>
@@ -838,6 +839,33 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser
}
EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
+static void svc_rqst_release_replaced_pages(struct svc_rqst *rqstp)
+{
+ release_pages(rqstp->rq_relpages, rqstp->rq_numrelpages);
+ rqstp->rq_numrelpages = 0;
+}
+
+/**
+ * svc_rqst_replace_page - Replace one page in rq_pages[]
+ * @rqstp: svc_rqst with pages to replace
+ * @page: replacement page
+ *
+ * When replacing a page in rq_pages, batch the release of the
+ * replaced pages to avoid hammering put_page().
+ */
+void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page)
+{
+ if (*rqstp->rq_next_page) {
+ if (rqstp->rq_numrelpages >= ARRAY_SIZE(rqstp->rq_relpages))
+ svc_rqst_release_replaced_pages(rqstp);
+ rqstp->rq_relpages[rqstp->rq_numrelpages++] = *rqstp->rq_next_page;
+ }
+
+ get_page(page);
+ *(rqstp->rq_next_page++) = page;
+}
+EXPORT_SYMBOL_GPL(svc_rqst_replace_page);
+
/*
* Called from a server thread as it's exiting. Caller must hold the "service
* mutex" for the service.
@@ -846,6 +874,7 @@ void
svc_rqst_free(struct svc_rqst *rqstp)
{
svc_release_buffer(rqstp);
+ svc_rqst_release_replaced_pages(rqstp);
if (rqstp->rq_scratch_page)
put_page(rqstp->rq_scratch_page);
kfree(rqstp->rq_resp);
Large splice reads call put_page() repeatedly. put_page() is
relatively expensive to call, so replace it with the new
svc_rqst_replace_page() helper to help amortize that cost.
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/vfs.c | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index da5340dc0203..43ac23b200d2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -849,15 +849,10 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct page *page = buf->page;
if (rqstp->rq_res.page_len == 0) {
- get_page(page);
- put_page(*rqstp->rq_next_page);
- *(rqstp->rq_next_page++) = page;
+ svc_rqst_replace_page(rqstp, page);
rqstp->rq_res.page_base = buf->offset;
} else if (page != pp[-1]) {
- get_page(page);
- if (*rqstp->rq_next_page)
- put_page(*rqstp->rq_next_page);
- *(rqstp->rq_next_page++) = page;
+ svc_rqst_replace_page(rqstp, page);
}
rqstp->rq_res.page_len += sd->len;
On Fri, 09 Jul 2021, Chuck Lever wrote:
>
> In this version of the series, each nfsd thread never accrues more
> than 16 pages. We can easily make that larger or smaller, but 16
> already reduces the rate of put_pages() calls to a minute fraction
> of what it was, and does not consume much additional space in struct
> svc_rqst.
>
> Comments welcome!
Very nice. Does "1/16" really count as "minute"? Or did I miss
something and it is actually a smaller fraction?
Either way: excellent work.
Reviewed-by: NeilBrown <[email protected]>
NeilBrown
>
> ---
>
> Chuck Lever (3):
> NFSD: Clean up splice actor
> SUNRPC: Add svc_rqst_replace_page() API
> NFSD: Batch release pages during splice read
>
>
> fs/nfsd/vfs.c | 20 +++++---------------
> include/linux/sunrpc/svc.h | 5 +++++
> net/sunrpc/svc.c | 29 +++++++++++++++++++++++++++++
> 3 files changed, 39 insertions(+), 15 deletions(-)
>
> --
> Chuck Lever
>
>
On Thu, Jul 08, 2021 at 11:26:29AM -0400, Chuck Lever wrote:
> @@ -256,6 +256,9 @@ struct svc_rqst {
> struct page * *rq_next_page; /* next reply page to use */
> struct page * *rq_page_end; /* one past the last page */
>
> + struct page *rq_relpages[16];
> + int rq_numrelpages;
This is only one struct page away from being a pagevec ... ?
> @@ -838,6 +839,33 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser
> }
> EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
>
> +static void svc_rqst_release_replaced_pages(struct svc_rqst *rqstp)
> +{
> + release_pages(rqstp->rq_relpages, rqstp->rq_numrelpages);
> + rqstp->rq_numrelpages = 0;
This would be pagevec_release()
> On Jul 8, 2021, at 7:23 PM, NeilBrown <[email protected]> wrote:
>
> On Fri, 09 Jul 2021, Chuck Lever wrote:
>>
>> In this version of the series, each nfsd thread never accrues more
>> than 16 pages. We can easily make that larger or smaller, but 16
>> already reduces the rate of put_pages() calls to a minute fraction
>> of what it was, and does not consume much additional space in struct
>> svc_rqst.
>>
>> Comments welcome!
>
> Very nice. Does "1/16" really count as "minute"? Or did I miss
> something and it is actually a smaller fraction?
Fair point: 1/16 is about 6%, so it's better than an order of
magnitude fewer calls. I can drop the "minute".
> Either way: excellent work.
>
> Reviewed-by: NeilBrown <[email protected]>
Thanks!
> NeilBrown
>
>>
>> ---
>>
>> Chuck Lever (3):
>> NFSD: Clean up splice actor
>> SUNRPC: Add svc_rqst_replace_page() API
>> NFSD: Batch release pages during splice read
>>
>>
>> fs/nfsd/vfs.c | 20 +++++---------------
>> include/linux/sunrpc/svc.h | 5 +++++
>> net/sunrpc/svc.c | 29 +++++++++++++++++++++++++++++
>> 3 files changed, 39 insertions(+), 15 deletions(-)
>>
>> --
>> Chuck Lever
>>
>>
--
Chuck Lever
> On Jul 8, 2021, at 7:30 PM, Matthew Wilcox <[email protected]> wrote:
>
> On Thu, Jul 08, 2021 at 11:26:29AM -0400, Chuck Lever wrote:
>> @@ -256,6 +256,9 @@ struct svc_rqst {
>> struct page * *rq_next_page; /* next reply page to use */
>> struct page * *rq_page_end; /* one past the last page */
>>
>> + struct page *rq_relpages[16];
>> + int rq_numrelpages;
>
> This is only one struct page away from being a pagevec ... ?
>
>> @@ -838,6 +839,33 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser
>> }
>> EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
>>
>> +static void svc_rqst_release_replaced_pages(struct svc_rqst *rqstp)
>> +{
>> + release_pages(rqstp->rq_relpages, rqstp->rq_numrelpages);
>> + rqstp->rq_numrelpages = 0;
>
> This would be pagevec_release()
986 void __pagevec_release(struct pagevec *pvec)
987 {
988 if (!pvec->percpu_pvec_drained) {
989 lru_add_drain();
990 pvec->percpu_pvec_drained = true;
991 }
992 release_pages(pvec->pages, pagevec_count(pvec));
993 pagevec_reinit(pvec);
994 }
Pretty much the same under the bonnet. Fair enough!
--
Chuck Lever