2022-02-28 02:29:27

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v9 13/27] NFS: Reduce use of uncached readdir

From: Trond Myklebust <[email protected]>

When reading a very large directory, we want to try to keep the page
cache up to date if doing so is inexpensive. With the change to allow
readdir to continue reading even when the cache is incomplete, we no
longer need to fall back to uncached readdir in order to scale to large
directories.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 23 +++--------------------
1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0c190c93901e..0b7d4be38452 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -999,28 +999,11 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
return res;
}

-static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc)
-{
- struct address_space *mapping = desc->file->f_mapping;
- struct inode *dir = file_inode(desc->file);
- unsigned int dtsize = NFS_SERVER(dir)->dtsize;
- loff_t size = i_size_read(dir);
-
- /*
- * Default to uncached readdir if the page cache is empty, and
- * we're looking for a non-zero cookie in a large directory.
- */
- return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize;
-}
-
/* Search for desc->dir_cookie from the beginning of the page cache */
static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
{
int res;

- if (nfs_readdir_dont_search_cache(desc))
- return -EBADCOOKIE;
-
do {
if (desc->page_index == 0) {
desc->current_index = 0;
@@ -1273,10 +1256,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
- if (!nfs_readdir_use_cookie(filp)) {
+ dir_ctx->page_index = 0;
+ if (!nfs_readdir_use_cookie(filp))
dir_ctx->dir_cookie = 0;
- dir_ctx->page_index = 0;
- } else
+ else
dir_ctx->dir_cookie = offset;
if (offset == 0)
memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
--
2.35.1


2022-02-28 06:34:26

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v9 14/27] NFS: Improve heuristic for readdirplus

From: Trond Myklebust <[email protected]>

The heuristic for readdirplus is designed to try to detect 'ls -l' and
similar patterns. It does so by looking for cache hit/miss patterns in
both the attribute cache and in the dcache of the files in a given
directory, and then sets a flag for the readdirplus code to interpret.

The problem with this approach is that a single attribute or dcache miss
can cause the NFS code to force a refresh of the attributes for the
entire set of files contained in the directory.

To be able to make a more nuanced decision, let's sample the number of
hits and misses in the set of open directory descriptors. That allows us
to set thresholds at which we start preferring READDIRPLUS over regular
READDIR, or at which we start to force a re-read of the remaining
readdir cache using READDIRPLUS.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 82 ++++++++++++++++++++++++++----------------
fs/nfs/inode.c | 4 +--
fs/nfs/internal.h | 4 +--
fs/nfs/nfstrace.h | 1 -
include/linux/nfs_fs.h | 5 +--
5 files changed, 58 insertions(+), 38 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0b7d4be38452..c5c7175a257c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -87,8 +87,7 @@ alloc_nfs_open_dir_context(struct inode *dir)
nfs_set_cache_invalid(dir,
NFS_INO_INVALID_DATA |
NFS_INO_REVAL_FORCED);
- list_add(&ctx->list, &nfsi->open_files);
- clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+ list_add_tail_rcu(&ctx->list, &nfsi->open_files);
memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf));
spin_unlock(&dir->i_lock);
return ctx;
@@ -99,9 +98,9 @@ alloc_nfs_open_dir_context(struct inode *dir)
static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
{
spin_lock(&dir->i_lock);
- list_del(&ctx->list);
+ list_del_rcu(&ctx->list);
spin_unlock(&dir->i_lock);
- kfree(ctx);
+ kfree_rcu(ctx, rcu_head);
}

/*
@@ -594,7 +593,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
/* We requested READDIRPLUS, but the server doesn't grok it */
if (error == -ENOTSUPP && desc->plus) {
NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
- clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
desc->plus = arg.plus = false;
goto again;
}
@@ -644,51 +642,61 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
return 1;
}

-static
-bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL)
+
+static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
+ unsigned int cache_hits,
+ unsigned int cache_misses)
{
if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
return false;
- if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
- return true;
- if (ctx->pos == 0)
+ if (ctx->pos == 0 ||
+ cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
return true;
return false;
}

/*
- * This function is called by the lookup and getattr code to request the
+ * This function is called by the getattr code to request the
* use of readdirplus to accelerate any future lookups in the same
* directory.
*/
-void nfs_advise_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_hit(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;

- if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
- !list_empty(&nfsi->open_files))
- set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_hits);
+ rcu_read_unlock();
+ }
}

/*
* This function is mainly for use by nfs_getattr().
*
* If this is an 'ls -l', we want to force use of readdirplus.
- * Do this by checking if there is an active file descriptor
- * and calling nfs_advise_use_readdirplus, then forcing a
- * cache flush.
*/
-void nfs_force_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_miss(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;

- if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
- !list_empty(&nfsi->open_files)) {
- set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
- set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_misses);
+ rcu_read_unlock();
}
}

+static void nfs_lookup_advise_force_readdirplus(struct inode *dir)
+{
+ nfs_readdir_record_entry_cache_miss(dir);
+}
+
static
void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
unsigned long dir_verifier)
@@ -1122,6 +1130,19 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
return status;
}

+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
+static void nfs_readdir_handle_cache_misses(struct inode *inode,
+ struct nfs_readdir_descriptor *desc,
+ pgoff_t page_index,
+ unsigned int cache_misses)
+{
+ if (desc->ctx->pos == 0 ||
+ cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD)
+ return;
+ invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+}
+
/* The file offset position represents the dirent entry number. A
last cookie cache takes care of the common case of reading the
whole directory.
@@ -1133,6 +1154,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
+ unsigned int cache_hits, cache_misses;
pgoff_t page_index;
int res;

@@ -1154,7 +1176,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
goto out;
desc->file = file;
desc->ctx = ctx;
- desc->plus = nfs_use_readdirplus(inode, ctx);
desc->page_index_max = -1;

spin_lock(&file->f_lock);
@@ -1168,6 +1189,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->eof = dir_ctx->eof;
nfs_set_dtsize(desc, dir_ctx->dtsize);
memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
+ cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0);
+ cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0);
spin_unlock(&file->f_lock);

if (desc->eof) {
@@ -1175,9 +1198,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
goto out_free;
}

- if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
- list_is_singular(&nfsi->open_files))
- invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+ desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
+ nfs_readdir_handle_cache_misses(inode, desc, page_index, cache_misses);

do {
res = readdir_search_pagecache(desc);
@@ -1196,7 +1218,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
}
if (res == -ETOOSMALL && desc->plus) {
- clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
nfs_zap_caches(inode);
desc->page_index = 0;
desc->plus = false;
@@ -1610,7 +1631,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
nfs_set_verifier(dentry, dir_verifier);

/* set a readdirplus hint that we had a cache miss */
- nfs_force_use_readdirplus(dir);
+ nfs_lookup_advise_force_readdirplus(dir);
ret = 1;
out:
nfs_free_fattr(fattr);
@@ -1667,7 +1688,6 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
nfs_mark_dir_for_revalidate(dir);
goto out_bad;
}
- nfs_advise_use_readdirplus(dir);
goto out_valid;
}

@@ -1872,7 +1892,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
goto out;

/* Notify readdir to use READDIRPLUS */
- nfs_force_use_readdirplus(dir);
+ nfs_lookup_advise_force_readdirplus(dir);

no_entry:
res = d_splice_alias(inode, dentry);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7cecabf57b95..bbf4357ff727 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -787,7 +787,7 @@ static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
return;
parent = dget_parent(dentry);
- nfs_force_use_readdirplus(d_inode(parent));
+ nfs_readdir_record_entry_cache_miss(d_inode(parent));
dput(parent);
}

@@ -798,7 +798,7 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
return;
parent = dget_parent(dentry);
- nfs_advise_use_readdirplus(d_inode(parent));
+ nfs_readdir_record_entry_cache_hit(d_inode(parent));
dput(parent);
}

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b5398af53c7f..194840a97e3a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -366,8 +366,8 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *);

/* dir.c */
-extern void nfs_advise_use_readdirplus(struct inode *dir);
-extern void nfs_force_use_readdirplus(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_hit(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_miss(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 45a310b586ce..3672f6703ee7 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -36,7 +36,6 @@

#define nfs_show_nfsi_flags(v) \
__print_flags(v, "|", \
- { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \
{ BIT(NFS_INO_STALE), "STALE" }, \
{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 691a27936849..20a4cf0acad2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -101,6 +101,8 @@ struct nfs_open_context {

struct nfs_open_dir_context {
struct list_head list;
+ atomic_t cache_hits;
+ atomic_t cache_misses;
unsigned long attr_gencount;
__be32 verf[NFS_DIR_VERIFIER_SIZE];
__u64 dir_cookie;
@@ -110,6 +112,7 @@ struct nfs_open_dir_context {
unsigned int dtsize;
signed char duped;
bool eof;
+ struct rcu_head rcu_head;
};

/*
@@ -274,13 +277,11 @@ struct nfs4_copy_state {
/*
* Bit offsets in flags field
*/
-#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */
#define NFS_INO_STALE (1) /* possible stale inode */
#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */
#define NFS_INO_PRESERVE_UNLINKED (4) /* preserve file if removed while open */
#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
-#define NFS_INO_FORCE_READDIR (7) /* force readdirplus */
#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */
--
2.35.1

2022-03-10 00:31:24

by Benjamin Coddington

[permalink] [raw]
Subject: Re: [PATCH v9 14/27] NFS: Improve heuristic for readdirplus

On 27 Feb 2022, at 18:12, [email protected] wrote:

> From: Trond Myklebust <[email protected]>
>
> The heuristic for readdirplus is designed to try to detect 'ls -l' and
> similar patterns. It does so by looking for cache hit/miss patterns in
> both the attribute cache and in the dcache of the files in a given
> directory, and then sets a flag for the readdirplus code to interpret.
>
> The problem with this approach is that a single attribute or dcache miss
> can cause the NFS code to force a refresh of the attributes for the
> entire set of files contained in the directory.
>
> To be able to make a more nuanced decision, let's sample the number of
> hits and misses in the set of open directory descriptors. That allows us
> to set thresholds at which we start preferring READDIRPLUS over regular
> READDIR, or at which we start to force a re-read of the remaining
> readdir cache using READDIRPLUS.

I like this patch very much.

The heuristic doesn't kick in until "ls -l" makes its second call into
nfs_readdir(), and for my filenames with 8 chars, that means that there are
about 5800 GETATTRs generated before we clean the cache to do more
READDIRPLUS. That's a large number to compound on connection latency.

We've already got some complaints that folks' 2nd "ls -l" takes "so much
longer" after 1a34c8c9a49e.

Can we possibly limit our first pass through nfs_readdir() so that the
heuristic takes effect sooner?

Ben

2022-03-10 23:05:38

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v9 14/27] NFS: Improve heuristic for readdirplus

On Wed, 2022-03-09 at 12:39 -0500, Benjamin Coddington wrote:
> On 27 Feb 2022, at 18:12, [email protected] wrote:
>
> > From: Trond Myklebust <[email protected]>
> >
> > The heuristic for readdirplus is designed to try to detect 'ls -l'
> > and
> > similar patterns. It does so by looking for cache hit/miss patterns
> > in
> > both the attribute cache and in the dcache of the files in a given
> > directory, and then sets a flag for the readdirplus code to
> > interpret.
> >
> > The problem with this approach is that a single attribute or dcache
> > miss
> > can cause the NFS code to force a refresh of the attributes for the
> > entire set of files contained in the directory.
> >
> > To be able to make a more nuanced decision, let's sample the number
> > of
> > hits and misses in the set of open directory descriptors. That
> > allows us
> > to set thresholds at which we start preferring READDIRPLUS over
> > regular
> > READDIR, or at which we start to force a re-read of the remaining
> > readdir cache using READDIRPLUS.
>
> I like this patch very much.
>
> The heuristic doesn't kick in until "ls -l" makes its second call
> into
> nfs_readdir(), and for my filenames with 8 chars, that means that
> there are
> about 5800 GETATTRs generated before we clean the cache to do more
> READDIRPLUS.  That's a large number to compound on connection
> latency.
>
> We've already got some complaints that folks' 2nd "ls -l" takes "so
> much
> longer" after 1a34c8c9a49e.
>
> Can we possibly limit our first pass through nfs_readdir() so that
> the
> heuristic takes effect sooner?
>

The problem is really that 'ls' (or possibly glibc) is passing in a
pretty huge buffer to the getdents() system call.

On my setup, that buffer appears to be 80K in size. So what happens is
that we get that first getdents() call, and so we fill the 80K buffer
with as many files as will fit. That can quickly run into several
thousand entries, if the filenames are relatively short.

Then 'ls' goes through the contents and does a stat() (or a statx()) on
each entry, and so we record the statistics. However that means those
first several thousand entries are indeed going to use cached data, or
force GETATTR to go on the wire. We only start using forced readdirplus
on the second pass.

Yes, I suppose we could limit getdents() to ignore the buffer size, and
just return fewer entries, however what's the "right" size in that
case?
More to the point, how much pain are we going to accept before we give
up trying these assorted heuristics, and just define a readdirplus()
system call modelled on statx()?

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2022-03-11 14:40:55

by Benjamin Coddington

[permalink] [raw]
Subject: [PATCH] NFS: Trigger "ls -l" readdir heuristic sooner

.. Something like this does the trick in my testing, but yes will have an
impact on regular workloads:

8<------------------------------------------------------------------------

Since commit 1a34c8c9a49e ("NFS: Support larger readdir buffers") has
updated dtsize and recent improvements to the READDIRPLUS helper heuristic,
the heuristic may not trigger until many dentries are emitted to userspace,
which may cause many thousands of GETATTR calls for "ls -l" when the
directory's pagecache has already been populated. This typically manifests
as a much slower total runtime for a _second_ invocation of "ls -l" within
the directory attribute cache timeouts.

Fix this by emitting only 17 entries for any first pass through the NFS
directory's ->iterate_shared(), which will allow userspace to prime the
counters for the heuristic.

Signed-off-by: Benjamin Coddington <[email protected]>
---
fs/nfs/dir.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7e12102b29e7..dc5fc9ba2c49 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1060,6 +1060,8 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
return res;
}

+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
/*
* Once we've found the start of the dirent within a page: fill 'er up...
*/
@@ -1069,6 +1071,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
struct file *file = desc->file;
struct nfs_cache_array *array;
unsigned int i;
+ bool first_emit = !desc->dir_cookie;

array = kmap(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
@@ -1092,6 +1095,10 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos++;
+ if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) {
+ desc->eob = true;
+ break;
+ }
}
if (array->page_is_eof)
desc->eof = !desc->eob;
@@ -1173,8 +1180,6 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
return status;
}

-#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
-
static bool nfs_readdir_handle_cache_misses(struct inode *inode,
struct nfs_readdir_descriptor *desc,
unsigned int cache_misses,
--
2.31.1

2022-03-11 23:23:57

by Benjamin Coddington

[permalink] [raw]
Subject: Re: [PATCH v9 14/27] NFS: Improve heuristic for readdirplus

On 10 Mar 2022, at 15:15, Trond Myklebust wrote:
>
> The problem is really that 'ls' (or possibly glibc) is passing in a
> pretty huge buffer to the getdents() system call.
>
> On my setup, that buffer appears to be 80K in size. So what happens is
> that we get that first getdents() call, and so we fill the 80K buffer
> with as many files as will fit. That can quickly run into several
> thousand entries, if the filenames are relatively short.
>
> Then 'ls' goes through the contents and does a stat() (or a statx()) on
> each entry, and so we record the statistics. However that means those
> first several thousand entries are indeed going to use cached data, or
> force GETATTR to go on the wire. We only start using forced readdirplus
> on the second pass.
>
> Yes, I suppose we could limit getdents() to ignore the buffer size, and
> just return fewer entries, however what's the "right" size in that
> case?

We can return fewer entries on the first call, so for the first pass the
right size is NFS_READDIR_CACHE_MISS_THRESHOLD + 1. I sent a patch.

> More to the point, how much pain are we going to accept before we give
> up trying these assorted heuristics, and just define a readdirplus()
> system call modelled on statx()?

We cursed ourselves by creating the heuristic, and now we've had to maintain
it and try to make everyone happy. The pain for us is when the behavior
keeps changing after sites have come to rely on previous performance.

I hope you can take a look at the patch.

Ben

2022-03-17 05:35:05

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH] NFS: Trigger "ls -l" readdir heuristic sooner

On Fri, Mar 11, 2022 at 9:40 AM Benjamin Coddington <[email protected]> wrote:
>
> .. Something like this does the trick in my testing, but yes will have an
> impact on regular workloads:
>
> 8<------------------------------------------------------------------------
>
> Since commit 1a34c8c9a49e ("NFS: Support larger readdir buffers") has
> updated dtsize and recent improvements to the READDIRPLUS helper heuristic,
> the heuristic may not trigger until many dentries are emitted to userspace,
> which may cause many thousands of GETATTR calls for "ls -l" when the
> directory's pagecache has already been populated. This typically manifests
> as a much slower total runtime for a _second_ invocation of "ls -l" within
> the directory attribute cache timeouts.
>
> Fix this by emitting only 17 entries for any first pass through the NFS
> directory's ->iterate_shared(), which will allow userspace to prime the
> counters for the heuristic.

Here's for what it's worth. An experiment between linux to linux where
the linux server had a "small" directory structure of 57274
directories, 5727390 files in total where each directory had ~100
files each.
With this patch:

date; time tree vol1 > tree.out && date; time tree vol1 > tree.out
Wed Mar 16 12:21:30 EDT 2022

real 11m7.923s
user 0m20.507s
sys 0m39.683s
Wed Mar 16 12:32:38 EDT 2022

real 40m1.751s
user 0m23.477s
sys 0m45.663s

Without the patch:
date; time tree vol1 > tree.out && date; time tree vol1 > tree.out
Wed Mar 16 13:49:12 EDT 2022

real 10m52.909s
user 0m21.342s
sys 0m39.198s
Wed Mar 16 14:00:05 EDT 2022

real 222m56.990s
user 0m30.392s
sys 2m25.202s


>
> Signed-off-by: Benjamin Coddington <[email protected]>
> ---
> fs/nfs/dir.c | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 7e12102b29e7..dc5fc9ba2c49 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -1060,6 +1060,8 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
> return res;
> }
>
> +#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
> +
> /*
> * Once we've found the start of the dirent within a page: fill 'er up...
> */
> @@ -1069,6 +1071,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
> struct file *file = desc->file;
> struct nfs_cache_array *array;
> unsigned int i;
> + bool first_emit = !desc->dir_cookie;
>
> array = kmap(desc->page);
> for (i = desc->cache_entry_index; i < array->size; i++) {
> @@ -1092,6 +1095,10 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
> desc->ctx->pos = desc->dir_cookie;
> else
> desc->ctx->pos++;
> + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) {
> + desc->eob = true;
> + break;
> + }
> }
> if (array->page_is_eof)
> desc->eof = !desc->eob;
> @@ -1173,8 +1180,6 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
> return status;
> }
>
> -#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
> -
> static bool nfs_readdir_handle_cache_misses(struct inode *inode,
> struct nfs_readdir_descriptor *desc,
> unsigned int cache_misses,
> --
> 2.31.1
>