From: Trond Myklebust <[email protected]>
The check for duplicate readdir cookies should only care if the change
attribute is invalid or the data cache is invalid.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f2df664db020..fa4d33687d2b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -411,7 +411,8 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
static bool
nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
{
- if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+ if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_DATA))
return false;
smp_rmb();
return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
--
2.31.1
From: Trond Myklebust <[email protected]>
If a user is doing 'ls -l', we have a heuristic in GETATTR that tells
the readdir code to try to use READDIRPLUS in order to refresh the inode
attributes. In certain circumstances, we also try to invalidate the
remaining directory entries in order to ensure this refresh.
If there are multiple readers of the directory, we probably should avoid
invalidating the page cache, since the heuristic breaks down in that
situation anyway.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 16 +++++++++++-----
include/linux/nfs_fs.h | 5 ++---
2 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fa4d33687d2b..358033aea235 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -78,6 +78,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
ctx->attr_gencount = nfsi->attr_gencount;
ctx->dir_cookie = 0;
ctx->dup_cookie = 0;
+ ctx->page_index = 0;
spin_lock(&dir->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
@@ -85,6 +86,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
NFS_INO_INVALID_DATA |
NFS_INO_REVAL_FORCED);
list_add(&ctx->list, &nfsi->open_files);
+ clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
spin_unlock(&dir->i_lock);
return ctx;
}
@@ -627,8 +629,7 @@ void nfs_force_use_readdirplus(struct inode *dir)
if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
!list_empty(&nfsi->open_files)) {
set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
- invalidate_mapping_pages(dir->i_mapping,
- nfsi->page_index + 1, -1);
+ set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
}
}
@@ -938,10 +939,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
sizeof(nfsi->cookieverf));
}
res = nfs_readdir_search_array(desc);
- if (res == 0) {
- nfsi->page_index = desc->page_index;
+ if (res == 0)
return 0;
- }
nfs_readdir_page_unlock_and_put_cached(desc);
return res;
}
@@ -1080,6 +1079,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
+ pgoff_t page_index;
int res;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -1110,10 +1110,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->dir_cookie = dir_ctx->dir_cookie;
desc->dup_cookie = dir_ctx->dup_cookie;
desc->duped = dir_ctx->duped;
+ page_index = dir_ctx->page_index;
desc->attr_gencount = dir_ctx->attr_gencount;
memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
spin_unlock(&file->f_lock);
+ if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
+ list_is_singular(&nfsi->open_files))
+ invalidate_mapping_pages(inode->i_mapping, page_index, -1);
+
do {
res = readdir_search_pagecache(desc);
@@ -1150,6 +1155,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
dir_ctx->dup_cookie = desc->dup_cookie;
dir_ctx->duped = desc->duped;
dir_ctx->attr_gencount = desc->attr_gencount;
+ dir_ctx->page_index = desc->page_index;
memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
spin_unlock(&file->f_lock);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 9b75448ce0df..ca547cc5458c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -103,6 +103,7 @@ struct nfs_open_dir_context {
__be32 verf[NFS_DIR_VERIFIER_SIZE];
__u64 dir_cookie;
__u64 dup_cookie;
+ pgoff_t page_index;
signed char duped;
};
@@ -181,9 +182,6 @@ struct nfs_inode {
struct rw_semaphore rmdir_sem;
struct mutex commit_mutex;
- /* track last access to cached pages */
- unsigned long page_index;
-
#if IS_ENABLED(CONFIG_NFS_V4)
struct nfs4_cached_acl *nfs4_acl;
/* NFSv4 state */
@@ -272,6 +270,7 @@ struct nfs4_copy_state {
#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */
#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */
+#define NFS_INO_FORCE_READDIR (7) /* force readdirplus */
#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */
--
2.31.1
On 29 Sep 2021, at 9:49, [email protected] wrote:
> From: Trond Myklebust <[email protected]>
>
> If a user is doing 'ls -l', we have a heuristic in GETATTR that tells
> the readdir code to try to use READDIRPLUS in order to refresh the inode
> attributes. In certain cirumstances, we also try to invalidate the
> remaining directory entries in order to ensure this refresh.
>
> If there are multiple readers of the directory, we probably should avoid
> invalidating the page cache, since the heuristic breaks down in that
> situation anyway.
Hi Trond, I'm curious about the motivation for this work because we're often
managing expectations about performance between various workloads. What
does the workload look like that prompted you to make this optimization?
I'm also interested because I have some work that improves readdir
performance for multiple readers on large directories, but is rotting
because things have been "good enough" for folks over here.
Ben
On Wed, 2021-09-29 at 10:56 -0400, Benjamin Coddington wrote:
> On 29 Sep 2021, at 9:49, [email protected] wrote:
>
> > From: Trond Myklebust <[email protected]>
> >
> > If a user is doing 'ls -l', we have a heuristic in GETATTR that
> > tells
> > the readdir code to try to use READDIRPLUS in order to refresh the
> > inode
> > attributes. In certain cirumstances, we also try to invalidate the
> > remaining directory entries in order to ensure this refresh.
> >
> > If there are multiple readers of the directory, we probably should
> > avoid
> > invalidating the page cache, since the heuristic breaks down in
> > that
> > situation anyway.
>
> Hi Trond, I'm curious about the motivation for this work because
> we're often
> managing expectations about performance between various workloads.
> What
> does the workload look like that prompted you to make this
> optimization?
>
> I'm also interested because I have some work that improves readdir
> performance for multiple readers on large directories, but is rotting
> because things have been "good enough" for folks over here.
>
Are you talking about this patch specifically, or the series in
general?
The general motivation for the series is that in certain circumstances
involving a read-only scenario (no changes to the directory) and
multiple processes we're seeing a regression w.r.t. older kernels.
The motivation for this particular patch is twofold:
1. I want to get rid of the cached page_index in the inode and
replace it with something less persistent (inodes are forever,
unlike file descriptors).
2. The heuristic in question is designed to only work in the single
process/single threaded case.
--
Trond Myklebust Linux NFS client maintainer, Hammerspace
[email protected]
On 29 Sep 2021, at 12:23, Trond Myklebust wrote:
> On Wed, 2021-09-29 at 10:56 -0400, Benjamin Coddington wrote:
>> On 29 Sep 2021, at 9:49, [email protected] wrote:
>>
>>> From: Trond Myklebust <[email protected]>
>>>
>>> If a user is doing 'ls -l', we have a heuristic in GETATTR that
>>> tells
>>> the readdir code to try to use READDIRPLUS in order to refresh the
>>> inode
>>> attributes. In certain cirumstances, we also try to invalidate the
>>> remaining directory entries in order to ensure this refresh.
>>>
>>> If there are multiple readers of the directory, we probably should
>>> avoid
>>> invalidating the page cache, since the heuristic breaks down in
>>> that
>>> situation anyway.
>>
>> Hi Trond, I'm curious about the motivation for this work because
>> we're often
>> managing expectations about performance between various workloads.
>> What
>> does the workload look like that prompted you to make this
>> optimization?
>>
>> I'm also interested because I have some work that improves readdir
>> performance for multiple readers on large directories, but is rotting
>> because things have been "good enough" for folks over here.
>>
>
> Are you talking about this patch specifically, or the series in
> general?
A bit of both - the first two patches didn't really make sense to me, but I
get it now. Thanks.
> The general motivation for the series is that in certain circumstances
> involving a read-only scenario (no changes to the directory) and
> multiple processes we're seeing a regression w.r.t. older kernels.
>
> The motivation for this particular patch is twofold:
> 1. I want to get rid of the cached page_index in the inode and
> replace it with something less persistent (inodes are forever,
> unlike file descriptors).
> 2. The heuristic in question is designed to only work in the single
> process/single threaded case.
Makes sense to me now.
I'm wondering if this might be creating a READDIR op amplification for N
readers sharing the directory's fd because one process can be dropping the
cache, which artificially deflates desc->page_index for a given dir_cookie.
That lower page_index gets reused on the next pass dropping the cache, and
it'll keep using the bottom pages and doing READDIRs. That wasn't possible
before because we only invalidated past nfsi->page_index.
Maybe an unlikely case (but I think some http servers could behave this
way), and I'd have to test it to be sure. I can try to do that unless you
think it's not possible or concerning.
Ben
On Wed, 2021-09-29 at 16:21 -0400, Benjamin Coddington wrote:
> On 29 Sep 2021, at 12:23, Trond Myklebust wrote:
>
> > On Wed, 2021-09-29 at 10:56 -0400, Benjamin Coddington wrote:
> > > On 29 Sep 2021, at 9:49, [email protected] wrote:
> > >
> > > > From: Trond Myklebust <[email protected]>
> > > >
> > > > If a user is doing 'ls -l', we have a heuristic in GETATTR that
> > > > tells
> > > > the readdir code to try to use READDIRPLUS in order to refresh
> > > > the
> > > > inode
> > > > attributes. In certain cirumstances, we also try to invalidate
> > > > the
> > > > remaining directory entries in order to ensure this refresh.
> > > >
> > > > If there are multiple readers of the directory, we probably
> > > > should
> > > > avoid
> > > > invalidating the page cache, since the heuristic breaks down in
> > > > that
> > > > situation anyway.
> > >
> > > Hi Trond, I'm curious about the motivation for this work because
> > > we're often
> > > managing expectations about performance between various
> > > workloads.
> > > What
> > > does the workload look like that prompted you to make this
> > > optimization?
> > >
> > > I'm also interested because I have some work that improves
> > > readdir
> > > performance for multiple readers on large directories, but is
> > > rotting
> > > because things have been "good enough" for folks over here.
> > >
> >
> > Are you talking about this patch specifically, or the series in
> > general?
>
> A bit of both - the first two patches didn't really make sense to me,
> but I
> get it now. Thanks.
>
> > The general motivation for the series is that in certain
> > circumstances
> > involving a read-only scenario (no changes to the directory) and
> > multiple processes we're seeing a regression w.r.t. older kernels.
> >
> > The motivation for this particular patch is twofold:
> > 1. I want to get rid of the cached page_index in the inode and
> > replace it with something less persistent (inodes are
> > forever,
> > unlike file descriptors).
> > 2. The heuristic in question is designed to only work in the
> > single
> > process/single threaded case.
>
> Make sense to me now.
>
> I'm wondering if this might be creating a READDIR op amplification
> for N
> readers sharing the directory's fd because one process can be
> dropping the
> cache, which artificially deflates desc->page_index for a given
> dir_cookie.
> That lower page_index gets reused on the next pass dropping the
> cache, and
> it'll keep using the bottom pages and doing READDIRs. That wasn't
> possible
> before because we only invalidated past nfsi->page_index.
>
> Maybe an unlikely case (but I think some http servers could behave
> this
> way), and I'd have to test it to be sure. I can try to do that
> unless you
> think its not possible or concerning.
>
It is concerning, and indeed in our test we are seeing READDIR
amplification with these multiple process accesses. So scenarios like
the one you describe above are exactly the kind of issue I was looking
to fix with these patches.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 29 Sep 2021, at 16:35, Trond Myklebust wrote:
> It is concerning, and indeed in our test we are seeing READDIR
> amplification with these multiple process accesses. So scenarios like
> the one you describe above are exactly the kind of issue I was looking
> to fix with these patches.
I spent some time trying to trigger the scenario I was concerned about, but
I couldn't generate any op amplifications. Feel free to add my:
Tested-by: Benjamin Coddington <[email protected]>
Reviewed-by: Benjamin Coddington <[email protected]>
for the series.
Ben
On Thu, 2021-09-30 at 10:29 -0400, Benjamin Coddington wrote:
> On 29 Sep 2021, at 16:35, Trond Myklebust wrote:
> > It is concerning, and indeed in our test we are seeing READDIR
> > amplification with these multiple process accesses. So scenarios
> > like
> > the one you describe above are exactly the kind of issue I was
> > looking
> > to fix with these patches.
>
> I spent some time trying to trigger the scenario I was concerned
> about, but
> I couldn't generate any op amplifications. Feel free to add my:
>
> Tested-by: Benjamin Coddington <[email protected]>
> Reviewed-by: Benjamin Coddington <[email protected]>
>
Thanks Ben! I really appreciate your help.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]