From: Trond Myklebust <[email protected]>
If we're doing uncached readdir, allocate multiple pages in order to
try to avoid duplicate RPC calls for the same getdents() call.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 79 ++++++++++++++++++++++++++++++----------------------
1 file changed, 46 insertions(+), 33 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b6c3501e8f61..238872d116f7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -694,12 +694,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
struct nfs_entry *entry,
struct page **xdr_pages,
- struct page *fillme, unsigned int buflen)
+ unsigned int buflen,
+ struct page **arrays,
+ size_t narrays)
{
struct address_space *mapping = desc->file->f_mapping;
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch, *new, *page = fillme;
+ struct page *scratch, *new, *page = *arrays;
int status;
scratch = alloc_page(GFP_KERNEL);
@@ -725,15 +727,22 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
if (status != -ENOSPC)
continue;
- if (page->mapping != mapping)
- break;
- new = nfs_readdir_page_get_next(mapping, page->index + 1,
- entry->prev_cookie);
- if (!new)
- break;
- if (page != fillme)
- nfs_readdir_page_unlock_and_put(page);
- page = new;
+ if (narrays > 1) {
+ narrays--;
+ arrays++;
+ page = *arrays;
+ } else {
+ if (page->mapping != mapping)
+ break;
+ new = nfs_readdir_page_get_next(mapping,
+ page->index + 1,
+ entry->prev_cookie);
+ if (!new)
+ break;
+ if (page != *arrays)
+ nfs_readdir_page_unlock_and_put(page);
+ page = new;
+ }
status = nfs_readdir_add_to_array(entry, page);
} while (!status && !entry->eof);
@@ -750,7 +759,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
break;
}
- if (page != fillme)
+ if (page != *arrays)
nfs_readdir_page_unlock_and_put(page);
put_page(scratch);
@@ -790,10 +799,11 @@ static struct page **nfs_readdir_alloc_pages(size_t npages)
}
static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
- struct page *page, __be32 *verf_arg,
- __be32 *verf_res)
+ __be32 *verf_arg, __be32 *verf_res,
+ struct page **arrays, size_t narrays)
{
struct page **pages;
+ struct page *page = *arrays;
struct nfs_entry *entry;
size_t array_size;
struct inode *inode = file_inode(desc->file);
@@ -835,7 +845,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
break;
}
- status = nfs_readdir_page_filler(desc, entry, pages, page, pglen);
+ status = nfs_readdir_page_filler(desc, entry, pages, pglen,
+ arrays, narrays);
} while (!status && nfs_readdir_page_needs_filling(page));
nfs_readdir_free_pages(pages, array_size);
@@ -884,8 +895,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
if (!desc->page)
return -ENOMEM;
if (nfs_readdir_page_needs_filling(desc->page)) {
- res = nfs_readdir_xdr_to_array(desc, desc->page,
- nfsi->cookieverf, verf);
+ res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
+ &desc->page, 1);
if (res < 0) {
nfs_readdir_page_unlock_and_put_cached(desc);
if (res == -EBADCOOKIE || res == -ENOTSYNC) {
@@ -976,35 +987,37 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
*/
static int uncached_readdir(struct nfs_readdir_descriptor *desc)
{
- struct page *page = NULL;
+ struct page **arrays;
+ size_t i, sz = 16;
__be32 verf[NFS_DIR_VERIFIER_SIZE];
int status;
dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
(unsigned long long)desc->dir_cookie);
- page = alloc_page(GFP_HIGHUSER);
- if (!page) {
- status = -ENOMEM;
- goto out;
- }
+ arrays = nfs_readdir_alloc_pages(sz);
+ if (!arrays)
+ return -ENOMEM;
+ for (i = 0; i < sz; i++)
+ nfs_readdir_page_init_array(arrays[i], desc->dir_cookie);
desc->page_index = 0;
desc->last_cookie = desc->dir_cookie;
- desc->page = page;
desc->duped = 0;
- nfs_readdir_page_init_array(page, desc->dir_cookie);
- status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf);
- if (status < 0)
- goto out_release;
+ status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
- nfs_do_filldir(desc);
+ for (i = 0; !desc->eof && i < sz; i++) {
+ desc->page = arrays[i];
+ nfs_do_filldir(desc);
+ }
+ desc->page = NULL;
+
+
+ for (i = 0; i < sz; i++)
+ nfs_readdir_clear_array(arrays[i]);
+ nfs_readdir_free_pages(arrays, sz);
- out_release:
- nfs_readdir_clear_array(desc->page);
- nfs_readdir_page_put(desc);
- out:
dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
__func__, status);
return status;
--
2.28.0
From: Trond Myklebust <[email protected]>
If the directory is changing, causing the page cache to get invalidated
while we are listing the contents, then the NFS client is currently forced
to read in the entire directory contents from scratch, because it needs
to perform a linear search for the readdir cookie. While this is not
an issue for small directories, it does not scale to directories with
millions of entries.
In order to be able to deal with large directories that are changing,
add a heuristic to ensure that if the page cache is empty, and we are
searching for a cookie that is not the zero cookie, we just default to
performing uncached readdir.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 238872d116f7..d7a9efd31ecd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -917,11 +917,28 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
return res;
}
+static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc)
+{
+ struct address_space *mapping = desc->file->f_mapping;
+ struct inode *dir = file_inode(desc->file);
+ unsigned int dtsize = NFS_SERVER(dir)->dtsize;
+ loff_t size = i_size_read(dir);
+
+ /*
+ * Default to uncached readdir if the page cache is empty, and
+ * we're looking for a non-zero cookie in a large directory.
+ */
+ return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize;
+}
+
/* Search for desc->dir_cookie from the beginning of the page cache */
static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
{
int res;
+ if (nfs_readdir_dont_search_cache(desc))
+ return -EBADCOOKIE;
+
do {
if (desc->page_index == 0) {
desc->current_index = 0;
--
2.28.0
On 7 Nov 2020, at 9:03, [email protected] wrote:
> From: Trond Myklebust <[email protected]>
>
> If we're doing uncached readdir, allocate multiple pages in order to
> try to avoid duplicate RPC calls for the same getdents() call.
>
> Signed-off-by: Trond Myklebust <[email protected]>
Reviewed-by: Benjamin Coddington <[email protected]>
Ben
> ---
> fs/nfs/dir.c | 79
> ++++++++++++++++++++++++++++++----------------------
> 1 file changed, 46 insertions(+), 33 deletions(-)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index b6c3501e8f61..238872d116f7 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -694,12 +694,14 @@ void nfs_prime_dcache(struct dentry *parent,
> struct nfs_entry *entry,
> static int nfs_readdir_page_filler(struct nfs_readdir_descriptor
> *desc,
> struct nfs_entry *entry,
> struct page **xdr_pages,
> - struct page *fillme, unsigned int buflen)
> + unsigned int buflen,
> + struct page **arrays,
> + size_t narrays)
> {
> struct address_space *mapping = desc->file->f_mapping;
> struct xdr_stream stream;
> struct xdr_buf buf;
> - struct page *scratch, *new, *page = fillme;
> + struct page *scratch, *new, *page = *arrays;
> int status;
>
> scratch = alloc_page(GFP_KERNEL);
> @@ -725,15 +727,22 @@ static int nfs_readdir_page_filler(struct
> nfs_readdir_descriptor *desc,
> if (status != -ENOSPC)
> continue;
>
> - if (page->mapping != mapping)
> - break;
> - new = nfs_readdir_page_get_next(mapping, page->index + 1,
> - entry->prev_cookie);
> - if (!new)
> - break;
> - if (page != fillme)
> - nfs_readdir_page_unlock_and_put(page);
> - page = new;
> + if (narrays > 1) {
> + narrays--;
> + arrays++;
> + page = *arrays;
> + } else {
> + if (page->mapping != mapping)
> + break;
> + new = nfs_readdir_page_get_next(mapping,
> + page->index + 1,
> + entry->prev_cookie);
> + if (!new)
> + break;
> + if (page != *arrays)
> + nfs_readdir_page_unlock_and_put(page);
> + page = new;
> + }
> status = nfs_readdir_add_to_array(entry, page);
> } while (!status && !entry->eof);
>
> @@ -750,7 +759,7 @@ static int nfs_readdir_page_filler(struct
> nfs_readdir_descriptor *desc,
> break;
> }
>
> - if (page != fillme)
> + if (page != *arrays)
> nfs_readdir_page_unlock_and_put(page);
>
> put_page(scratch);
> @@ -790,10 +799,11 @@ static struct page
> **nfs_readdir_alloc_pages(size_t npages)
> }
>
> static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor
> *desc,
> - struct page *page, __be32 *verf_arg,
> - __be32 *verf_res)
> + __be32 *verf_arg, __be32 *verf_res,
> + struct page **arrays, size_t narrays)
> {
> struct page **pages;
> + struct page *page = *arrays;
> struct nfs_entry *entry;
> size_t array_size;
> struct inode *inode = file_inode(desc->file);
> @@ -835,7 +845,8 @@ static int nfs_readdir_xdr_to_array(struct
> nfs_readdir_descriptor *desc,
> break;
> }
>
> - status = nfs_readdir_page_filler(desc, entry, pages, page, pglen);
> + status = nfs_readdir_page_filler(desc, entry, pages, pglen,
> + arrays, narrays);
> } while (!status && nfs_readdir_page_needs_filling(page));
>
> nfs_readdir_free_pages(pages, array_size);
> @@ -884,8 +895,8 @@ static int find_and_lock_cache_page(struct
> nfs_readdir_descriptor *desc)
> if (!desc->page)
> return -ENOMEM;
> if (nfs_readdir_page_needs_filling(desc->page)) {
> - res = nfs_readdir_xdr_to_array(desc, desc->page,
> - nfsi->cookieverf, verf);
> + res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
> + &desc->page, 1);
> if (res < 0) {
> nfs_readdir_page_unlock_and_put_cached(desc);
> if (res == -EBADCOOKIE || res == -ENOTSYNC) {
> @@ -976,35 +987,37 @@ static void nfs_do_filldir(struct
> nfs_readdir_descriptor *desc)
> */
> static int uncached_readdir(struct nfs_readdir_descriptor *desc)
> {
> - struct page *page = NULL;
> + struct page **arrays;
> + size_t i, sz = 16;
> __be32 verf[NFS_DIR_VERIFIER_SIZE];
> int status;
>
> dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie
> %Lu\n",
> (unsigned long long)desc->dir_cookie);
>
> - page = alloc_page(GFP_HIGHUSER);
> - if (!page) {
> - status = -ENOMEM;
> - goto out;
> - }
> + arrays = nfs_readdir_alloc_pages(sz);
> + if (!arrays)
> + return -ENOMEM;
> + for (i = 0; i < sz; i++)
> + nfs_readdir_page_init_array(arrays[i], desc->dir_cookie);
>
> desc->page_index = 0;
> desc->last_cookie = desc->dir_cookie;
> - desc->page = page;
> desc->duped = 0;
>
> - nfs_readdir_page_init_array(page, desc->dir_cookie);
> - status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf);
> - if (status < 0)
> - goto out_release;
> + status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays,
> sz);
>
> - nfs_do_filldir(desc);
> + for (i = 0; !desc->eof && i < sz; i++) {
> + desc->page = arrays[i];
> + nfs_do_filldir(desc);
> + }
> + desc->page = NULL;
> +
> +
> + for (i = 0; i < sz; i++)
> + nfs_readdir_clear_array(arrays[i]);
> + nfs_readdir_free_pages(arrays, sz);
>
> - out_release:
> - nfs_readdir_clear_array(desc->page);
> - nfs_readdir_page_put(desc);
> - out:
> dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
> __func__, status);
> return status;
> --
> 2.28.0
On 7 Nov 2020, at 9:03, [email protected] wrote:
> From: Trond Myklebust <[email protected]>
>
> If the directory is changing, causing the page cache to get
> invalidated
> while we are listing the contents, then the NFS client is currently
> forced
> to read in the entire directory contents from scratch, because it
> needs
> to perform a linear search for the readdir cookie. While this is not
> an issue for small directories, it does not scale to directories with
> millions of entries.
> In order to be able to deal with large directories that are changing,
> add a heuristic to ensure that if the page cache is empty, and we are
> searching for a cookie that is not the zero cookie, we just default to
> performing uncached readdir.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/dir.c | 17 +++++++++++++++++
> 1 file changed, 17 insertions(+)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 238872d116f7..d7a9efd31ecd 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -917,11 +917,28 @@ static int find_and_lock_cache_page(struct
> nfs_readdir_descriptor *desc)
> return res;
> }
>
> +static bool nfs_readdir_dont_search_cache(struct
> nfs_readdir_descriptor *desc)
> +{
> + struct address_space *mapping = desc->file->f_mapping;
> + struct inode *dir = file_inode(desc->file);
> + unsigned int dtsize = NFS_SERVER(dir)->dtsize;
> + loff_t size = i_size_read(dir);
> +
> + /*
> + * Default to uncached readdir if the page cache is empty, and
> + * we're looking for a non-zero cookie in a large directory.
> + */
> + return desc->dir_cookie != 0 && mapping->nrpages == 0 && size >
> dtsize;
inode size > dtsize is a little hand-wavy. We have a lot of customers
trying to
reverse-engineer nfs_readdir() behavior instead of reading the code, and
this is sure to drive them crazy.
That said, in the absence of an easy way to make it tunable, I don't
have
anything better to suggest.
Reviewed-by: Benjamin Coddington <[email protected]>
Ben
> +}
> +
> /* Search for desc->dir_cookie from the beginning of the page cache
> */
> static int readdir_search_pagecache(struct nfs_readdir_descriptor
> *desc)
> {
> int res;
>
> + if (nfs_readdir_dont_search_cache(desc))
> + return -EBADCOOKIE;
> +
> do {
> if (desc->page_index == 0) {
> desc->current_index = 0;
> --
> 2.28.0
On Mon, 2020-11-09 at 16:41 -0500, Benjamin Coddington wrote:
> On 7 Nov 2020, at 9:03, [email protected] wrote:
>
> > From: Trond Myklebust <[email protected]>
> >
> > If the directory is changing, causing the page cache to get
> > invalidated
> > while we are listing the contents, then the NFS client is currently
> > forced
> > to read in the entire directory contents from scratch, because it
> > needs
> > to perform a linear search for the readdir cookie. While this is
> > not
> > an issue for small directories, it does not scale to directories
> > with
> > millions of entries.
> > In order to be able to deal with large directories that are
> > changing,
> > add a heuristic to ensure that if the page cache is empty, and we
> > are
> > searching for a cookie that is not the zero cookie, we just default
> > to
> > performing uncached readdir.
> >
> > Signed-off-by: Trond Myklebust <[email protected]>
> > ---
> > fs/nfs/dir.c | 17 +++++++++++++++++
> > 1 file changed, 17 insertions(+)
> >
> > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > index 238872d116f7..d7a9efd31ecd 100644
> > --- a/fs/nfs/dir.c
> > +++ b/fs/nfs/dir.c
> > @@ -917,11 +917,28 @@ static int find_and_lock_cache_page(struct
> > nfs_readdir_descriptor *desc)
> > return res;
> > }
> >
> > +static bool nfs_readdir_dont_search_cache(struct
> > nfs_readdir_descriptor *desc)
> > +{
> > + struct address_space *mapping = desc->file->f_mapping;
> > + struct inode *dir = file_inode(desc->file);
> > + unsigned int dtsize = NFS_SERVER(dir)->dtsize;
> > + loff_t size = i_size_read(dir);
> > +
> > + /*
> > + * Default to uncached readdir if the page cache is empty,
> > and
> > + * we're looking for a non-zero cookie in a large
> > directory.
> > + */
> > + return desc->dir_cookie != 0 && mapping->nrpages == 0 &&
> > size >
> > dtsize;
>
> inode size > dtsize is a little hand-wavy. We have a lot of
> customers
> trying to
> reverse-engineer nfs_readdir() behavior instead of reading the code,
> this
> is sure to drive them crazy.
>
> That said, in the absence of an easy way to make it tunable, I don't
> have
> anything better to suggest.
>
> Reviewed-by: Benjamin Coddington <[email protected]>
Right. It is a heuristic, but I would expect that the directory size is
going to be somewhat proportional to the number of RPC calls we need to
perform to read it. That number again is somewhat proportional to the
dtsize.
IOW: The general idea is correct.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Sat, Nov 7, 2020 at 9:14 AM <[email protected]> wrote:
>
> From: Trond Myklebust <[email protected]>
>
> If the directory is changing, causing the page cache to get invalidated
> while we are listing the contents, then the NFS client is currently forced
> to read in the entire directory contents from scratch, because it needs
> to perform a linear search for the readdir cookie. While this is not
> an issue for small directories, it does not scale to directories with
> millions of entries.
> In order to be able to deal with large directories that are changing,
> add a heuristic to ensure that if the page cache is empty, and we are
> searching for a cookie that is not the zero cookie, we just default to
> performing uncached readdir.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/dir.c | 17 +++++++++++++++++
> 1 file changed, 17 insertions(+)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 238872d116f7..d7a9efd31ecd 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -917,11 +917,28 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
> return res;
> }
>
> +static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc)
> +{
> + struct address_space *mapping = desc->file->f_mapping;
> + struct inode *dir = file_inode(desc->file);
> + unsigned int dtsize = NFS_SERVER(dir)->dtsize;
> + loff_t size = i_size_read(dir);
> +
> + /*
> + * Default to uncached readdir if the page cache is empty, and
> + * we're looking for a non-zero cookie in a large directory.
> + */
> + return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize;
> +}
> +
> /* Search for desc->dir_cookie from the beginning of the page cache */
> static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
> {
> int res;
>
> + if (nfs_readdir_dont_search_cache(desc))
> + return -EBADCOOKIE;
> +
> do {
> if (desc->page_index == 0) {
> desc->current_index = 0;
> --
> 2.28.0
>
I did a lot of testing yesterday and last night and this mostly
behaves as designed.
However, before you sent this I was starting to test the following
patch which adds a NFS_DIR_CONTEXT_UNCACHED
flag inside nfs_open_dir_context. I was not sure about the logic when
to turn it on, so for now I'd ignore that
(especially nrpages > NFS_READDIR_UNCACHED_THRESHOLD). However, I'm
curious why:
1. you didn't take the approach of adding a per-process context flag
so once a process hits this condition, the
process would just shift to uncached and be unaffected by any other
process. I wonder about multiple directory
listing processes defeating this logic if it's not per-process so we
may get an unbounded time still
2. you put the logic inside readdir_search_pagecache rather than
inside the calling do { .. } while loop
commit a56ff638fe696929a1bc633b22e2d9bd05f3c308
Author: Dave Wysochanski <[email protected]>
Date: Fri Nov 6 08:32:41 2020 -0500
NFS: Use uncached readdir if we drop the pagecache with larger directories
Larger directories can get into problem where they do not make
forward progress once the pagecache times out via exceeding
acdirmax. Alleviate this problem by shifting to uncached
readdir if we drop the pagecache on larger directory.
Signed-off-by: Dave Wysochanski <[email protected]>
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ca30e2dbb9c3..7f43f75d5b76 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -78,6 +78,7 @@ static struct nfs_open_dir_context
*alloc_nfs_open_dir_context(struct inode *di
ctx->attr_gencount = nfsi->attr_gencount;
ctx->dir_cookie = 0;
ctx->dup_cookie = 0;
+ ctx->flags = 0;
spin_lock(&dir->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
@@ -1023,6 +1024,8 @@ static int nfs_readdir(struct file *file, struct
dir_context *ctx)
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
int res;
+ unsigned long nrpages;
+#define NFS_READDIR_UNCACHED_THRESHOLD 1024
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
file, (long long)ctx->pos);
@@ -1035,9 +1038,25 @@ static int nfs_readdir(struct file *file,
struct dir_context *ctx)
* revalidate the cookie.
*/
if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
+ nrpages = inode->i_mapping->nrpages;
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
+ /*
+ * If we just dropped the pagecache, and we're not
+ * at the start of the directory, use uncached.
+ */
+ if (!test_bit(NFS_DIR_CONTEXT_UNCACHED, &dir_ctx->flags) &&
+ ctx->pos != 0 &&
+ !inode->i_mapping->nrpages &&
+ nrpages > NFS_READDIR_UNCACHED_THRESHOLD) {
+ set_bit(NFS_DIR_CONTEXT_UNCACHED, &dir_ctx->flags);
+			printk("NFS: DBG setting NFS_DIR_CONTEXT_UNCACHED ctx->pos = %lld nrpages = %lu\n",
+			       ctx->pos, nrpages);
+		}
+ }
+ if (test_bit(NFS_DIR_CONTEXT_UNCACHED, &dir_ctx->flags) &&
ctx->pos == 0) {
+ clear_bit(NFS_DIR_CONTEXT_UNCACHED, &dir_ctx->flags);
+ printk("NFS: DBG clearing NFS_DIR_CONTEXT_UNCACHED");
}
res = -ENOMEM;
@@ -1057,7 +1076,10 @@ static int nfs_readdir(struct file *file,
struct dir_context *ctx)
spin_unlock(&file->f_lock);
do {
- res = readdir_search_pagecache(desc);
+ if (test_bit(NFS_DIR_CONTEXT_UNCACHED, &dir_ctx->flags))
+ res = -EBADCOOKIE;
+ else
+ res = readdir_search_pagecache(desc);
if (res == -EBADCOOKIE) {
res = 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 681ed98e4ba8..fedcfec94d95 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -98,6 +98,8 @@ struct nfs_open_dir_context {
__u64 dir_cookie;
__u64 dup_cookie;
signed char duped;
+ unsigned long flags;
+#define NFS_DIR_CONTEXT_UNCACHED (1)
};
/*
On Tue, 2020-11-10 at 09:48 -0500, David Wysochanski wrote:
> On Sat, Nov 7, 2020 at 9:14 AM <[email protected]> wrote:
> >
> > From: Trond Myklebust <[email protected]>
> >
> > If the directory is changing, causing the page cache to get
> > invalidated
> > while we are listing the contents, then the NFS client is currently
> > forced
> > to read in the entire directory contents from scratch, because it
> > needs
> > to perform a linear search for the readdir cookie. While this is
> > not
> > an issue for small directories, it does not scale to directories
> > with
> > millions of entries.
> > In order to be able to deal with large directories that are
> > changing,
> > add a heuristic to ensure that if the page cache is empty, and we
> > are
> > searching for a cookie that is not the zero cookie, we just default
> > to
> > performing uncached readdir.
> >
> > Signed-off-by: Trond Myklebust <[email protected]>
> > ---
> > fs/nfs/dir.c | 17 +++++++++++++++++
> > 1 file changed, 17 insertions(+)
> >
> > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > index 238872d116f7..d7a9efd31ecd 100644
> > --- a/fs/nfs/dir.c
> > +++ b/fs/nfs/dir.c
> > @@ -917,11 +917,28 @@ static int find_and_lock_cache_page(struct
> > nfs_readdir_descriptor *desc)
> > return res;
> > }
> >
> > +static bool nfs_readdir_dont_search_cache(struct
> > nfs_readdir_descriptor *desc)
> > +{
> > + struct address_space *mapping = desc->file->f_mapping;
> > + struct inode *dir = file_inode(desc->file);
> > + unsigned int dtsize = NFS_SERVER(dir)->dtsize;
> > + loff_t size = i_size_read(dir);
> > +
> > + /*
> > + * Default to uncached readdir if the page cache is empty,
> > and
> > + * we're looking for a non-zero cookie in a large
> > directory.
> > + */
> > + return desc->dir_cookie != 0 && mapping->nrpages == 0 &&
> > size > dtsize;
> > +}
> > +
> > /* Search for desc->dir_cookie from the beginning of the page
> > cache */
> > static int readdir_search_pagecache(struct nfs_readdir_descriptor
> > *desc)
> > {
> > int res;
> >
> > + if (nfs_readdir_dont_search_cache(desc))
> > + return -EBADCOOKIE;
> > +
> > do {
> > if (desc->page_index == 0) {
> > desc->current_index = 0;
> > --
> > 2.28.0
> >
> I did a lot of testing yesterday and last night and this mostly
> behaves as designed.
>
> However, before you sent this I was starting to test the following
> patch which adds a NFS_DIR_CONTEXT_UNCACHED
> flag inside nfs_open_dir_context. I was not sure about the logic
> when
> to turn it on, so for now I'd ignore that
> (especially nrpages > NFS_READDIR_UNCACHED_THRESHOLD). However, I'm
> curious why:
> 1. you didn't take the approach of adding a per-process context flag
> so once a process hits this condition, the
> process would just shift to uncached and be unaffected by any other
> process. I wonder about multiple directory
> listing processes defeating this logic if it's not per-process so we
> may get an unbounded time still
> 2. you put the logic inside readdir_search_pagecache rather than
> inside the calling do { .. } while loop
The reason for using uncached readdir here is that we're having
trouble sharing the cache. However, if there is a possibility to do so,
because we have multiple processes racing to read the same directory,
then why should we not try?
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 9 Nov 2020, at 16:46, Trond Myklebust wrote:
> On Mon, 2020-11-09 at 16:41 -0500, Benjamin Coddington wrote:
>> On 7 Nov 2020, at 9:03, [email protected] wrote:
>>
>>> From: Trond Myklebust <[email protected]>
>>>
>>> If the directory is changing, causing the page cache to get
>>> invalidated
>>> while we are listing the contents, then the NFS client is currently
>>> forced
>>> to read in the entire directory contents from scratch, because it
>>> needs
>>> to perform a linear search for the readdir cookie. While this is
>>> not
>>> an issue for small directories, it does not scale to directories
>>> with
>>> millions of entries.
>>> In order to be able to deal with large directories that are
>>> changing,
>>> add a heuristic to ensure that if the page cache is empty, and we
>>> are
>>> searching for a cookie that is not the zero cookie, we just default
>>> to
>>> performing uncached readdir.
>>>
>>> Signed-off-by: Trond Myklebust <[email protected]>
>>> ---
>>> fs/nfs/dir.c | 17 +++++++++++++++++
>>> 1 file changed, 17 insertions(+)
>>>
>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
>>> index 238872d116f7..d7a9efd31ecd 100644
>>> --- a/fs/nfs/dir.c
>>> +++ b/fs/nfs/dir.c
>>> @@ -917,11 +917,28 @@ static int find_and_lock_cache_page(struct
>>> nfs_readdir_descriptor *desc)
>>> return res;
>>> }
>>>
>>> +static bool nfs_readdir_dont_search_cache(struct
>>> nfs_readdir_descriptor *desc)
>>> +{
>>> + struct address_space *mapping = desc->file->f_mapping;
>>> + struct inode *dir = file_inode(desc->file);
>>> + unsigned int dtsize = NFS_SERVER(dir)->dtsize;
>>> + loff_t size = i_size_read(dir);
>>> +
>>> + /*
>>> + * Default to uncached readdir if the page cache is empty,
>>> and
>>> + * we're looking for a non-zero cookie in a large
>>> directory.
>>> + */
>>> + return desc->dir_cookie != 0 && mapping->nrpages == 0 &&
>>> size >
>>> dtsize;
>>
>> inode size > dtsize is a little hand-wavy. We have a lot of
>> customers
>> trying to
>> reverse-engineer nfs_readdir() behavior instead of reading the code,
>> this
>> is sure to drive them crazy.
>>
>> That said, in the absence of an easy way to make it tunable, I don't
>> have
>> anything better to suggest.
>>
>> Reviewed-by: Benjamin Coddington <[email protected]>
>
>
> Right. It is a heuristic, but I would expect that the directory size is
> going to be somewhat proportional to the number of RPC calls we need to
> perform to read it. That number again is somewhat proportional to the
> dtsize.
>
> IOW: The general idea is correct.
I can agree with that, but I have another thought:
If the point of the heuristic is to allow a full listing to eventually
complete, it should not be dependent on mapping->nrpages == 0. Otherwise,
other processes can start filling the cache and we're back to the situation
where filling the cache could take longer than acdirmax, and things
eventually congest to a halt.
Flipping a bit on the context to remain uncached gives a better assurance we
can continue to make forward progress.
It's too bad we're stuck caching entries linearly. What challenges might
exist if we tried to use an XArray to map directory position to cookie? I
imagine we could implement this in a single XArray by using both position
and cookie values as indices, and differentiate between them using two of
the three XA marks, and store a structure to represent both. Also unclear
would be how to handle the lifetime of the XArray, since we'd no longer be
using the VM's pagecache management.
/thoughts
Ben
On Wed, 2020-11-11 at 11:43 -0500, Benjamin Coddington wrote:
> On 9 Nov 2020, at 16:46, Trond Myklebust wrote:
>
> > On Mon, 2020-11-09 at 16:41 -0500, Benjamin Coddington wrote:
> > > On 7 Nov 2020, at 9:03, [email protected] wrote:
> > >
> > > > From: Trond Myklebust <[email protected]>
> > > >
> > > > If the directory is changing, causing the page cache to get
> > > > invalidated
> > > > while we are listing the contents, then the NFS client is
> > > > currently
> > > > forced
> > > > to read in the entire directory contents from scratch, because
> > > > it
> > > > needs
> > > > to perform a linear search for the readdir cookie. While this
> > > > is
> > > > not
> > > > an issue for small directories, it does not scale to
> > > > directories
> > > > with
> > > > millions of entries.
> > > > In order to be able to deal with large directories that are
> > > > changing,
> > > > add a heuristic to ensure that if the page cache is empty, and
> > > > we
> > > > are
> > > > searching for a cookie that is not the zero cookie, we just
> > > > default
> > > > to
> > > > performing uncached readdir.
> > > >
> > > > Signed-off-by: Trond Myklebust
> > > > <[email protected]>
> > > > ---
> > > > fs/nfs/dir.c | 17 +++++++++++++++++
> > > > 1 file changed, 17 insertions(+)
> > > >
> > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > > index 238872d116f7..d7a9efd31ecd 100644
> > > > --- a/fs/nfs/dir.c
> > > > +++ b/fs/nfs/dir.c
> > > > @@ -917,11 +917,28 @@ static int
> > > > find_and_lock_cache_page(struct
> > > > nfs_readdir_descriptor *desc)
> > > > return res;
> > > > }
> > > >
> > > > +static bool nfs_readdir_dont_search_cache(struct
> > > > nfs_readdir_descriptor *desc)
> > > > +{
> > > > + struct address_space *mapping = desc->file->f_mapping;
> > > > + struct inode *dir = file_inode(desc->file);
> > > > + unsigned int dtsize = NFS_SERVER(dir)->dtsize;
> > > > + loff_t size = i_size_read(dir);
> > > > +
> > > > + /*
> > > > + * Default to uncached readdir if the page cache is
> > > > empty,
> > > > and
> > > > + * we're looking for a non-zero cookie in a large
> > > > directory.
> > > > + */
> > > > + return desc->dir_cookie != 0 && mapping->nrpages == 0
> > > > &&
> > > > size >
> > > > dtsize;
> > >
> > > inode size > dtsize is a little hand-wavy. We have a lot of
> > > customers
> > > trying to
> > > reverse-engineer nfs_readdir() behavior instead of reading the
> > > code,
> > > this
> > > is sure to drive them crazy.
> > >
> > > That said, in the absence of an easy way to make it tunable, I
> > > don't
> > > have
> > > anything better to suggest.
> > >
> > > Reviewed-by: Benjamin Coddington <[email protected]>
> >
> >
> > Right. It is a heuristic, but I would expect that the directory
> > size is
> > going to be somewhat proportional to the number of RPC calls we
> > need to
> > perform to read it. That number again is somewhat proportional to
> > the
> > dtsize.
> >
> > IOW: The general idea is correct.
>
> I can agree with that, but I have another thought:
>
> If the point of the heuristic is to allow a full listing to
> eventually
> complete, it should not be dependent on mapping->nrpages == 0.
> Otherwise,
> other processes can start filling the cache and we're back to the
> situation
> where filling the cache could take longer than acdirmax, and things
> eventually congest to a halt.
>
> Flipping a bit on the context to remain uncached gives a better
> assurance we
> can continue to make forward progress.
I disagree. The point of the page cache is to allow sharing of
information between processes where possible. If there are multiple
processes all trying to make progress, and one of them starts filling
the page cache from scratch, then why should we not use that?
The alternative is not scaling to multiple processes.
>
> It's too bad we're stuck caching entries linearly. What challenges
> might
> exist if we tried to use an XArray to map directory position to
> cookie? I
> imagine we could implement this in a single XArray by using both
> position
> and cookie values as indices, and differentiate between them using
> two of
> the three XA marks, and store a structure to represent both. Also
> unclear
> would be how to handle the lifetime of the XArray, since we'd no
> longer be
> using the VMs pagecache management..
>
You might be able to speed up first cookie lookup by having an Xarray
that maps from a 64-bit cookie to a nfs_cache_array_entry which
contains the next cookie to look up. However that would only work on
64-bit systems since xarrays take an unsigned long index.
Furthermore, you still need a way to map offsets to entries for the
case where we're not able to use cookies for lseek() purposes. That's a
linear search through the directory, which would be horrible with an
xarray of linked cookie values (so you'd probably need a second xarray
for that?).
Construction and teardown of that structure would be nasty for large
directories, since you have as many cookies as you have entries in your
directory. IOW: You'd have to tear down 127 times as many xarray
entries as we have now.
It is not obvious that we would be able to benefit from starting at an
arbitrary location and caching that data, since if the directory
changed, we'd have to read in the new data anyway.
Memory management would need to be implemented somehow. You'd need a
shrinker for this tree that could intelligently prune it.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 11 Nov 2020, at 12:34, Trond Myklebust wrote:
> On Wed, 2020-11-11 at 11:43 -0500, Benjamin Coddington wrote:
>> On 9 Nov 2020, at 16:46, Trond Myklebust wrote:
>>
>>> On Mon, 2020-11-09 at 16:41 -0500, Benjamin Coddington wrote:
>>>> On 7 Nov 2020, at 9:03, [email protected] wrote:
>>>>
>>>>> From: Trond Myklebust <[email protected]>
>>>>>
>>>>> If the directory is changing, causing the page cache to get
>>>>> invalidated
>>>>> while we are listing the contents, then the NFS client is
>>>>> currently
>>>>> forced
>>>>> to read in the entire directory contents from scratch, because
>>>>> it
>>>>> needs
>>>>> to perform a linear search for the readdir cookie. While this
>>>>> is
>>>>> not
>>>>> an issue for small directories, it does not scale to
>>>>> directories
>>>>> with
>>>>> millions of entries.
>>>>> In order to be able to deal with large directories that are
>>>>> changing,
>>>>> add a heuristic to ensure that if the page cache is empty, and
>>>>> we
>>>>> are
>>>>> searching for a cookie that is not the zero cookie, we just
>>>>> default
>>>>> to
>>>>> performing uncached readdir.
>>>>>
>>>>> Signed-off-by: Trond Myklebust
>>>>> <[email protected]>
>>>>> ---
>>>>> fs/nfs/dir.c | 17 +++++++++++++++++
>>>>> 1 file changed, 17 insertions(+)
>>>>>
>>>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
>>>>> index 238872d116f7..d7a9efd31ecd 100644
>>>>> --- a/fs/nfs/dir.c
>>>>> +++ b/fs/nfs/dir.c
>>>>> @@ -917,11 +917,28 @@ static int
>>>>> find_and_lock_cache_page(struct
>>>>> nfs_readdir_descriptor *desc)
>>>>> return res;
>>>>> }
>>>>>
>>>>> +static bool nfs_readdir_dont_search_cache(struct
>>>>> nfs_readdir_descriptor *desc)
>>>>> +{
>>>>> + struct address_space *mapping = desc->file->f_mapping;
>>>>> + struct inode *dir = file_inode(desc->file);
>>>>> + unsigned int dtsize = NFS_SERVER(dir)->dtsize;
>>>>> + loff_t size = i_size_read(dir);
>>>>> +
>>>>> + /*
>>>>> + * Default to uncached readdir if the page cache is
>>>>> empty,
>>>>> and
>>>>> + * we're looking for a non-zero cookie in a large
>>>>> directory.
>>>>> + */
>>>>> + return desc->dir_cookie != 0 && mapping->nrpages == 0
>>>>> &&
>>>>> size >
>>>>> dtsize;
>>>>
>>>> inode size > dtsize is a little hand-wavy. We have a lot of
>>>> customers
>>>> trying to
>>>> reverse-engineer nfs_readdir() behavior instead of reading the
>>>> code,
>>>> this
>>>> is sure to drive them crazy.
>>>>
>>>> That said, in the absence of an easy way to make it tunable, I
>>>> don't
>>>> have
>>>> anything better to suggest.
>>>>
>>>> Reviewed-by: Benjamin Coddington <[email protected]>
>>>
>>>
>>> Right. It is a heuristic, but I would expect that the directory
>>> size is
>>> going to be somewhat proportional to the number of RPC calls we
>>> need to
>>> perform to read it. That number again is somewhat proportional to
>>> the
>>> dtsize.
>>>
>>> IOW: The general idea is correct.
>>
>> I can agree with that, but I have another thought:
>>
>> If the point of the heuristic is to allow a full listing to
>> eventually
>> complete, it should not be dependent on mapping->nrpages == 0.
>> Otherwise,
>> other processes can start filling the cache and we're back to the
>> situation
>> where filling the cache could take longer than acdirmax, and things
>> eventually congest to a halt.
>>
>> Flipping a bit on the context to remain uncached gives a better
>> assurance we
>> can continue to make forward progress.
>
> I disagree. The point of the page cache is to allow sharing of
> information between processes where possible. If there are multiple
> processes all trying to make progress, and one of them starts filling
> the page cache from scratch, then why should we not use that?
Because the process that starts filling the pagecache from scratch then
enjoins the process that may be nearly finished listing the directory to
start over waiting for the page cache to be filled (or help fill it).
If the time taken to get to a certain offset/cookie exceeds the time to
cache the directory's attributes, we'll drop the pagecache, or if we're
perhaps using READDIRPLUS with many entries, we'll saturate the memory on
the machine and start to reclaim it before we can ever finish. There are
scenarios where forward progress becomes very slow.
Perhaps the onus is on me to whip up an example - I will do that.
> The alternative is not scaling to multiple processes.
The next process that comes along filling the pagecache will benefit the
next processes, and so on, until a page is evicted or the cache is lost,
etc. The pagecache is still useful to multiple processes.
>> It's too bad we're stuck caching entries linearly. What challenges
>> might
>> exist if we tried to use an XArray to map directory position to
>> cookie? I
>> imagine we could implement this in a single XArray by using both
>> position
>> and cookie values as indices, and differentiate between them using
>> two of
>> the three XA marks, and store a structure to represent both. Also
>> unclear
>> would be how to handle the lifetime of the XArray, since we'd no
>> longer be
>> using the VMs pagecache management..
>>
>
> You might be able to speed up first cookie lookup by having an Xarray
> that maps from a 64-bit cookie to a nfs_cache_array_entry which
> contains the next cookie to look up. However that would only work on
> 64-bit systems since xarrays take an unsigned long index.
Yes, but I would like to allow processes to cache entries non-linearly.
> Furthermore, you still need a way to map offsets to entries for the
> case where we're not able to use cookies for lseek() purposes. That's a
> linear search through the directory, which would be horrible with an
> xarray of linked cookie values (so you'd probably need a second xarray
> for that?).
There's xa_for_each_marked(), but it may not perform - I haven't looked
at the implementation or tested it.
> Construction and teardown of that structure would be nasty for large
> directories, since you have as many cookies as you have entries in your
> directory. IOW: You'd have to tear down 127 times as many xarray
> entries as we have now.
>
> It is not obvious that we would be able to benefit from starting at an
> arbitrary location and caching that data, since if the directory
> changed, we'd have to read in the new data anyway.
The only case where it seems obvious is for the case where a very long
listing is about to complete, and then the pagecache is invalidated, and
then that plays out over and over again. This is the pain point for our
customers that are migrating NFS workloads onto slower (more latent)
cloud infrastructure.
> Memory management would need to be implemented somehow. You'd need a
> shrinker for this tree that could intelligently prune it.
nod.. thanks for your thoughts on this.
Ben
On Wed, 2020-11-11 at 14:53 -0500, Benjamin Coddington wrote:
> On 11 Nov 2020, at 12:34, Trond Myklebust wrote:
>
> > On Wed, 2020-11-11 at 11:43 -0500, Benjamin Coddington wrote:
> > > On 9 Nov 2020, at 16:46, Trond Myklebust wrote:
> > >
> > > > On Mon, 2020-11-09 at 16:41 -0500, Benjamin Coddington wrote:
> > > > > On 7 Nov 2020, at 9:03, [email protected] wrote:
> > > > >
> > > > > > From: Trond Myklebust <[email protected]>
> > > > > >
> > > > > > If the directory is changing, causing the page cache to get
> > > > > > invalidated
> > > > > > while we are listing the contents, then the NFS client is
> > > > > > currently
> > > > > > forced
> > > > > > to read in the entire directory contents from scratch,
> > > > > > because
> > > > > > it
> > > > > > needs
> > > > > > to perform a linear search for the readdir cookie. While
> > > > > > this
> > > > > > is
> > > > > > not
> > > > > > an issue for small directories, it does not scale to
> > > > > > directories
> > > > > > with
> > > > > > millions of entries.
> > > > > > In order to be able to deal with large directories that are
> > > > > > changing,
> > > > > > add a heuristic to ensure that if the page cache is empty,
> > > > > > and
> > > > > > we
> > > > > > are
> > > > > > searching for a cookie that is not the zero cookie, we just
> > > > > > default
> > > > > > to
> > > > > > performing uncached readdir.
> > > > > >
> > > > > > Signed-off-by: Trond Myklebust
> > > > > > <[email protected]>
> > > > > > ---
> > > > > > fs/nfs/dir.c | 17 +++++++++++++++++
> > > > > > 1 file changed, 17 insertions(+)
> > > > > >
> > > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > > > > index 238872d116f7..d7a9efd31ecd 100644
> > > > > > --- a/fs/nfs/dir.c
> > > > > > +++ b/fs/nfs/dir.c
> > > > > > @@ -917,11 +917,28 @@ static int
> > > > > > find_and_lock_cache_page(struct
> > > > > > nfs_readdir_descriptor *desc)
> > > > > > return res;
> > > > > > }
> > > > > >
> > > > > > +static bool nfs_readdir_dont_search_cache(struct
> > > > > > nfs_readdir_descriptor *desc)
> > > > > > +{
> > > > > > + struct address_space *mapping = desc->file-
> > > > > > >f_mapping;
> > > > > > + struct inode *dir = file_inode(desc->file);
> > > > > > + unsigned int dtsize = NFS_SERVER(dir)->dtsize;
> > > > > > + loff_t size = i_size_read(dir);
> > > > > > +
> > > > > > + /*
> > > > > > + * Default to uncached readdir if the page cache is
> > > > > > empty,
> > > > > > and
> > > > > > + * we're looking for a non-zero cookie in a large
> > > > > > directory.
> > > > > > + */
> > > > > > + return desc->dir_cookie != 0 && mapping->nrpages ==
> > > > > > 0
> > > > > > &&
> > > > > > size >
> > > > > > dtsize;
> > > > >
> > > > > inode size > dtsize is a little hand-wavy. We have a lot of
> > > > > customers
> > > > > trying to
> > > > > reverse-engineer nfs_readdir() behavior instead of reading
> > > > > the
> > > > > code,
> > > > > this
> > > > > is sure to drive them crazy.
> > > > >
> > > > > That said, in the absence of an easy way to make it tunable,
> > > > > I
> > > > > don't
> > > > > have
> > > > > anything better to suggest.
> > > > >
> > > > > Reviewed-by: Benjamin Coddington <[email protected]>
> > > >
> > > >
> > > > Right. It is a heuristic, but I would expect that the directory
> > > > size is
> > > > going to be somewhat proportional to the number of RPC calls we
> > > > need to
> > > > perform to read it. That number again is somewhat proportional
> > > > to
> > > > the
> > > > dtsize.
> > > >
> > > > IOW: The general idea is correct.
> > >
> > > I can agree with that, but I have another thought:
> > >
> > > If the point of the heuristic is to allow a full listing to
> > > eventually
> > > complete, it should not be dependent on mapping->nrpages == 0.
> > > Otherwise,
> > > other processes can start filling the cache and we're back to the
> > > situation
> > > where filling the cache could take longer than acdirmax, and
> > > things
> > > eventually congest to a halt.
> > >
> > > Flipping a bit on the context to remain uncached gives a better
> > > assurance we
> > > can continue to make forward progress.
> >
> > I disagree. The point of the page cache is to allow sharing of
> > information between processes where possible. If there are multiple
> > processes all trying to make progress, and one of them starts
> > filling
> > the page cache from scratch, then why should we not use that?
>
> Because the process that starts filling the pagecache from scratch
> then
> enjoins the process that may be nearly finished listing the directory
> to
> start over waiting for the page cache to be filled (or help fill it).
>
> If the time taken to get to a certain offset/cookie exceeds the time
> to
> cache the directory's attributes, we'll drop the pagecache, or if
> we're
> perhaps using READDIRPLUS with many entries, we'll saturate the
> memory on
> the machine and start to reclaim it before we can ever finish. There
> are
> scenarios where forward progress becomes very slow.
>
> Perhaps the onus is on me to whip up an example - I will do that.
>
> > The alternative is not scaling to multiple processes.
>
> The next process that comes along filling the pagecache will benefit
> the
> next processes, and so on, until a page is evicted or the cache is
> lost..
> etc. The pagecache is still useful to multiple processes.
>
> > > It's too bad we're stuck caching entries linearly. What
> > > challenges
> > > might
> > > exist if we tried to use an XArray to map directory position to
> > > cookie? I
> > > imagine we could implement this in a single XArray by using both
> > > position
> > > and cookie values as indices, and differentiate between them
> > > using
> > > two of
> > > the three XA marks, and store a structure to represent both.
> > > Also
> > > unclear
> > > would be how to handle the lifetime of the XArray, since we'd no
> > > longer be
> > > using the VMs pagecache management..
> > >
> >
> > You might be able to speed up first cookie lookup by having an
> > Xarray
> > that maps from a 64-bit cookie to a nfs_cache_array_entry which
> > contains the next cookie to look up. However that would only work
> > on
> > 64-bit systems since xarrays take an unsigned long index.
>
> Yes, but I would like to allow processes to cache entries non-
> linearly.
You still have to play them back in linear fashion. If you're using the
cookie as a lookup key, then it would require you to look up entries one
at a time (i.e. look up a cookie to find a new entry with a cookie that
points to the next entry to be looked up). It would be slow...
>
> > Furthermore, you still need a way to map offsets to entries for the
> > case where we're not able to use cookies for lseek() purposes.
> > That's a
> > linear search through the directory, which would be horrible with
> > an
> > xarray of linked cookie values (so you'd probably need a second
> > xarray
> > for that?).
>
> There's xa_for_each_marked(), but it may not perform - I haven't
> looked
> at the implementation or tested it.
That looks up the directory in cookie order, not in the directory
order.
IOW: it might work OK for XFS, which appears to use ordered cookies,
but it will break badly with ext4, which uses hashed cookies.
>
> > Construction and teardown of that structure would be nasty for
> > large
> > directories, since you have as many cookies as you have entries in
> > your
> > directory. IOW: You'd have to tear down 127 times as many xarray
> > entries as we have now.
> >
> > It is not obvious that we would be able to benefit from starting at
> > an
> > arbitrary location and caching that data, since if the directory
> > changed, we'd have to read in the new data anyway.
>
> The only case where it seems obvious is for the case where a very
> long
> listing is about to complete, and then the pagecache is invalidated,
> and
> then that plays out over and over again. This is the pain point for
> our
> customers that are migrating NFS workloads onto slower (more latent)
> cloud infrastructure.
In testing, I found that the current patchset performed just fine
w.r.t. the readdir count. The reason why I wasn't seeing huge
performance increases when doing an 'rm -rf', for instance, was due to
there being 2 GETATTRs and 1 LOOKUP per unlink() call.
>
> > Memory management would need to be implemented somehow. You'd need
> > a
> > shrinker for this tree that could intelligently prune it.
>
> nod.. thanks for your thoughts on this.
>
> Ben
>
--
Trond Myklebust
CTO, Hammerspace Inc
4984 El Camino Real, Suite 208
Los Altos, CA 94022
http://www.hammer.space