From: Trond Myklebust <[email protected]>
When reading a very large directory, we want to try to keep the page
cache up to date if doing so is inexpensive. Right now, we will try to
refill the page cache if it is non-empty, irrespective of whether or not
doing so is going to take a long time.
Replace that algorithm with something that looks at how many times we've
refilled the page cache without seeing a cache hit.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 51 +++++++++++++++++++++---------------------
include/linux/nfs_fs.h | 1 +
2 files changed, 27 insertions(+), 25 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a7f25fef890a..d0707e63ce45 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -71,19 +71,16 @@ const struct address_space_operations nfs_dir_aops = {
#define NFS_INIT_DTSIZE PAGE_SIZE
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir)
+static struct nfs_open_dir_context *
+alloc_nfs_open_dir_context(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
struct nfs_open_dir_context *ctx;
- ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
if (ctx != NULL) {
- ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
- ctx->dir_cookie = 0;
- ctx->dup_cookie = 0;
- ctx->page_index = 0;
ctx->dtsize = NFS_INIT_DTSIZE;
- ctx->eof = false;
spin_lock(&dir->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
@@ -170,6 +167,7 @@ struct nfs_readdir_descriptor {
unsigned long timestamp;
unsigned long gencount;
unsigned long attr_gencount;
+ unsigned int page_fill_misses;
unsigned int cache_entry_index;
unsigned int buffer_fills;
unsigned int dtsize;
@@ -925,6 +923,18 @@ nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
desc->last_cookie);
}
+#define NFS_READDIR_PAGE_FILL_MISS_MAX 5
+/*
+ * If we've already refilled the page cache 5 times without
+ * finding our cookie, then we should stop and fall back
+ * to uncached readdir
+ */
+static bool nfs_readdir_may_fill_pagecache(struct nfs_readdir_descriptor *desc)
+{
+ return desc->dir_cookie == 0 ||
+ desc->page_fill_misses < NFS_READDIR_PAGE_FILL_MISS_MAX;
+}
+
/*
* Returns 0 if desc->dir_cookie was found on page desc->page_index
* and locks the page to prevent removal from the page cache.
@@ -940,6 +950,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
if (!desc->page)
return -ENOMEM;
if (nfs_readdir_page_needs_filling(desc->page)) {
+ if (!nfs_readdir_may_fill_pagecache(desc))
+ return -EBADCOOKIE;
desc->page_index_max = desc->page_index;
res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
&desc->page, 1);
@@ -958,36 +970,22 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
if (desc->page_index == 0)
memcpy(nfsi->cookieverf, verf,
sizeof(nfsi->cookieverf));
+ desc->page_fill_misses++;
}
res = nfs_readdir_search_array(desc);
- if (res == 0)
+ if (res == 0) {
+ desc->page_fill_misses = 0;
return 0;
+ }
nfs_readdir_page_unlock_and_put_cached(desc);
return res;
}
-static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc)
-{
- struct address_space *mapping = desc->file->f_mapping;
- struct inode *dir = file_inode(desc->file);
- unsigned int dtsize = NFS_SERVER(dir)->dtsize;
- loff_t size = i_size_read(dir);
-
- /*
- * Default to uncached readdir if the page cache is empty, and
- * we're looking for a non-zero cookie in a large directory.
- */
- return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize;
-}
-
/* Search for desc->dir_cookie from the beginning of the page cache */
static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
{
int res;
- if (nfs_readdir_dont_search_cache(desc))
- return -EBADCOOKIE;
-
do {
if (desc->page_index == 0) {
desc->current_index = 0;
@@ -1149,6 +1147,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
page_index = dir_ctx->page_index;
desc->attr_gencount = dir_ctx->attr_gencount;
desc->eof = dir_ctx->eof;
+ desc->page_fill_misses = dir_ctx->page_fill_misses;
nfs_set_dtsize(desc, dir_ctx->dtsize);
memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
spin_unlock(&file->f_lock);
@@ -1204,6 +1203,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
dir_ctx->duped = desc->duped;
dir_ctx->attr_gencount = desc->attr_gencount;
dir_ctx->page_index = desc->page_index;
+ dir_ctx->page_fill_misses = desc->page_fill_misses;
dir_ctx->eof = desc->eof;
dir_ctx->dtsize = desc->dtsize;
memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
@@ -1247,6 +1247,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
dir_ctx->dir_cookie = offset;
else
dir_ctx->dir_cookie = 0;
+ dir_ctx->page_fill_misses = 0;
if (offset == 0)
memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
dir_ctx->duped = 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d27f7e788624..3165927048e4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -106,6 +106,7 @@ struct nfs_open_dir_context {
__u64 dir_cookie;
__u64 dup_cookie;
pgoff_t page_index;
+ unsigned int page_fill_misses;
unsigned int dtsize;
signed char duped;
bool eof;
--
2.35.1
On 21 Feb 2022, at 14:58, Trond Myklebust wrote:
> On Mon, 2022-02-21 at 11:45 -0500, Benjamin Coddington wrote:
>> On 21 Feb 2022, at 11:08, [email protected] wrote:
>>
>>> From: Trond Myklebust <[email protected]>
>>>
>>> When reading a very large directory, we want to try to keep the
>>> page
>>> cache up to date if doing so is inexpensive. Right now, we will try
>>> to
>>> refill the page cache if it is non-empty, irrespective of whether
>>> or not
>>> doing so is going to take a long time.
>>>
>>> Replace that algorithm with something that looks at how many times
>>> we've
>>> refilled the page cache without seeing a cache hit.
>>
>> Hi Trond, I've been following your work here - thanks for it.
>>
>> I'm wondering if there might be a regression on this patch for the
>> case
>> where two or more directory readers are part way through a large
>> directory
>> when the pagecache is truncated. If I'm reading this correctly,
>> those
>> readers will stop caching after 5 fills and finish the remainder of
>> their
>> directory reads in the uncached mode.
>>
>> Isn't there an OP amplification per reader in this case?
>>
>
> Depends... In the old case, we basically stopped doing uncached readdir
> if a third process starts filling the page cache again. In particular,
> this means we were vulnerable to restarting over and over once page
> reclaim starts to kick in for very large directories.
>
> In this new one, we have each process give it a try (5 fills each), and
> then fallback to uncached. Yes, there will be corner cases where this
> will perform less well than the old algorithm, but it should also be
> more deterministic.
>
> I am open to suggestions for better ways to determine when to cut over
> to uncached readdir. This is one way, that I think is better than what
> we have, however I'm sure it can be improved upon.
I still have old patches that allow each page to be "versioned" with the
change attribute, page_index, and cookie. This allows the page cache to be
culled page-by-page, and multiple fillers can continue to fill pages at
"headless" page offsets that match their original cookie and page_index
pair. This change would mean readers don't have to start over filling the
page cache when the cache is dropped, so we wouldn't need to worry about
when to cut over to the uncached mode - it makes the problem go away.
I felt there wasn't much interest in this work, and our most vocal customer
was happy enough with last winter's readdir improvements (thanks!) that I
didn't follow up, but I can refresh those patches and send them along again.
Ben
On 21 Feb 2022, at 11:08, [email protected] wrote:
> From: Trond Myklebust <[email protected]>
>
> When reading a very large directory, we want to try to keep the page
> cache up to date if doing so is inexpensive. Right now, we will try to
> refill the page cache if it is non-empty, irrespective of whether or not
> doing so is going to take a long time.
>
> Replace that algorithm with something that looks at how many times we've
> refilled the page cache without seeing a cache hit.
Hi Trond, I've been following your work here - thanks for it.
I'm wondering if there might be a regression on this patch for the case
where two or more directory readers are part way through a large directory
when the pagecache is truncated. If I'm reading this correctly, those
readers will stop caching after 5 fills and finish the remainder of their
directory reads in the uncached mode.
Isn't there an OP amplification per reader in this case?
Ben
On 21 Feb 2022, at 15:55, Trond Myklebust wrote:
>
> We will always need the ability to cut over to uncached readdir.
Yes.
> If the cookie is no longer returned by the server because one or more
> files were deleted then we need to resolve the situation somehow (IOW:
> the 'rm *' case). The new algorithm _does_ improve performance on those
> situations, because it no longer requires us to read the entire
> directory before switching over: we try 5 times, then fail over.
Yes, using per-page validation doesn't remove the need for uncached readdir.
It does allow a reader to simply resume filling the cache where it left
off. There's no need to try 5 times and fail over. And there's no need to
make a trade-off and make the situation worse in certain scenarios.
I thought I'd point that out and make an offer to re-submit it. Any
interest?
Ben
From: Trond Myklebust <[email protected]>
The heuristic for readdirplus is designed to try to detect 'ls -l' and
similar patterns. It does so by looking for cache hit/miss patterns in
both the attribute cache and in the dcache of the files in a given
directory, and then sets a flag for the readdirplus code to interpret.
The problem with this approach is that a single attribute or dcache miss
can cause the NFS code to force a refresh of the attributes for the
entire set of files contained in the directory.
To be able to make a more nuanced decision, let's sample the number of
hits and misses in the set of open directory descriptors. That allows us
to set thresholds at which we start preferring READDIRPLUS over regular
READDIR, or at which we start to force a re-read of the remaining
readdir cache using READDIRPLUS.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 83 ++++++++++++++++++++++++++----------------
fs/nfs/inode.c | 4 +-
fs/nfs/internal.h | 4 +-
fs/nfs/nfstrace.h | 1 -
include/linux/nfs_fs.h | 5 ++-
5 files changed, 59 insertions(+), 38 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d0707e63ce45..c4d962bca9ef 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -87,8 +87,7 @@ alloc_nfs_open_dir_context(struct inode *dir)
nfs_set_cache_invalid(dir,
NFS_INO_INVALID_DATA |
NFS_INO_REVAL_FORCED);
- list_add(&ctx->list, &nfsi->open_files);
- clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+ list_add_tail_rcu(&ctx->list, &nfsi->open_files);
spin_unlock(&dir->i_lock);
return ctx;
}
@@ -98,9 +97,9 @@ alloc_nfs_open_dir_context(struct inode *dir)
static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
{
spin_lock(&dir->i_lock);
- list_del(&ctx->list);
+ list_del_rcu(&ctx->list);
spin_unlock(&dir->i_lock);
- kfree(ctx);
+ kfree_rcu(ctx, rcu_head);
}
/*
@@ -567,7 +566,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
/* We requested READDIRPLUS, but the server doesn't grok it */
if (error == -ENOTSUPP && desc->plus) {
NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
- clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
desc->plus = arg.plus = false;
goto again;
}
@@ -617,51 +615,61 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
return 1;
}
-static
-bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL)
+
+static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
+ unsigned int cache_hits,
+ unsigned int cache_misses)
{
if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
return false;
- if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
- return true;
- if (ctx->pos == 0)
+ if (ctx->pos == 0 ||
+ cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
return true;
return false;
}
/*
- * This function is called by the lookup and getattr code to request the
+ * This function is called by the getattr code to request the
* use of readdirplus to accelerate any future lookups in the same
* directory.
*/
-void nfs_advise_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_hit(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
- if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
- !list_empty(&nfsi->open_files))
- set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_hits);
+ rcu_read_unlock();
+ }
}
/*
* This function is mainly for use by nfs_getattr().
*
* If this is an 'ls -l', we want to force use of readdirplus.
- * Do this by checking if there is an active file descriptor
- * and calling nfs_advise_use_readdirplus, then forcing a
- * cache flush.
*/
-void nfs_force_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_miss(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
- if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
- !list_empty(&nfsi->open_files)) {
- set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
- set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_misses);
+ rcu_read_unlock();
}
}
+static void nfs_lookup_advise_force_readdirplus(struct inode *dir)
+{
+ nfs_readdir_record_entry_cache_miss(dir);
+}
+
static
void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
unsigned long dir_verifier)
@@ -1101,6 +1109,20 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
return status;
}
+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
+static void nfs_readdir_handle_cache_misses(struct inode *inode,
+ struct nfs_readdir_descriptor *desc,
+ pgoff_t page_index,
+ unsigned int cache_misses)
+{
+ if (desc->ctx->pos == 0 ||
+ cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD ||
+ !nfs_readdir_may_fill_pagecache(desc))
+ return;
+ invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+}
+
/* The file offset position represents the dirent entry number. A
last cookie cache takes care of the common case of reading the
whole directory.
@@ -1112,6 +1134,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
+ unsigned int cache_hits, cache_misses;
pgoff_t page_index;
int res;
@@ -1137,7 +1160,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
goto out;
desc->file = file;
desc->ctx = ctx;
- desc->plus = nfs_use_readdirplus(inode, ctx);
desc->page_index_max = -1;
spin_lock(&file->f_lock);
@@ -1150,6 +1172,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->page_fill_misses = dir_ctx->page_fill_misses;
nfs_set_dtsize(desc, dir_ctx->dtsize);
memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
+ cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0);
+ cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0);
spin_unlock(&file->f_lock);
if (desc->eof) {
@@ -1157,9 +1181,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
goto out_free;
}
- if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
- list_is_singular(&nfsi->open_files))
- invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+ desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
+ nfs_readdir_handle_cache_misses(inode, desc, page_index, cache_misses);
do {
res = readdir_search_pagecache(desc);
@@ -1178,7 +1201,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
}
if (res == -ETOOSMALL && desc->plus) {
- clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
nfs_zap_caches(inode);
desc->page_index = 0;
desc->plus = false;
@@ -1597,7 +1619,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
nfs_set_verifier(dentry, dir_verifier);
/* set a readdirplus hint that we had a cache miss */
- nfs_force_use_readdirplus(dir);
+ nfs_lookup_advise_force_readdirplus(dir);
ret = 1;
out:
nfs_free_fattr(fattr);
@@ -1654,7 +1676,6 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
nfs_mark_dir_for_revalidate(dir);
goto out_bad;
}
- nfs_advise_use_readdirplus(dir);
goto out_valid;
}
@@ -1859,7 +1880,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
goto out;
/* Notify readdir to use READDIRPLUS */
- nfs_force_use_readdirplus(dir);
+ nfs_lookup_advise_force_readdirplus(dir);
no_entry:
res = d_splice_alias(inode, dentry);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f9fc506ebb29..1bef81f5373a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -789,7 +789,7 @@ static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
return;
parent = dget_parent(dentry);
- nfs_force_use_readdirplus(d_inode(parent));
+ nfs_readdir_record_entry_cache_miss(d_inode(parent));
dput(parent);
}
@@ -800,7 +800,7 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
return;
parent = dget_parent(dentry);
- nfs_advise_use_readdirplus(d_inode(parent));
+ nfs_readdir_record_entry_cache_hit(d_inode(parent));
dput(parent);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2de7c56a1fbe..46dc97b65661 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -366,8 +366,8 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *);
/* dir.c */
-extern void nfs_advise_use_readdirplus(struct inode *dir);
-extern void nfs_force_use_readdirplus(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_hit(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_miss(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 45a310b586ce..3672f6703ee7 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -36,7 +36,6 @@
#define nfs_show_nfsi_flags(v) \
__print_flags(v, "|", \
- { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \
{ BIT(NFS_INO_STALE), "STALE" }, \
{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 3165927048e4..0a5425a58bbd 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -101,6 +101,8 @@ struct nfs_open_context {
struct nfs_open_dir_context {
struct list_head list;
+ atomic_t cache_hits;
+ atomic_t cache_misses;
unsigned long attr_gencount;
__be32 verf[NFS_DIR_VERIFIER_SIZE];
__u64 dir_cookie;
@@ -110,6 +112,7 @@ struct nfs_open_dir_context {
unsigned int dtsize;
signed char duped;
bool eof;
+ struct rcu_head rcu_head;
};
/*
@@ -274,13 +277,11 @@ struct nfs4_copy_state {
/*
* Bit offsets in flags field
*/
-#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */
#define NFS_INO_STALE (1) /* possible stale inode */
#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */
#define NFS_INO_PRESERVE_UNLINKED (4) /* preserve file if removed while open */
#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
-#define NFS_INO_FORCE_READDIR (7) /* force readdirplus */
#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */
--
2.35.1
From: Trond Myklebust <[email protected]>
If attribute caching is turned off, then use of readdirplus is not going
to help stat() performance.
Readdirplus also doesn't help if a file is being written to, since we
will have to flush those writes in order to sync the mtime/ctime.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/inode.c | 33 +++++++++++++++++----------------
1 file changed, 17 insertions(+), 16 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1bef81f5373a..9d2af9887715 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -782,24 +782,26 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
-static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
+/*
+ * Don't request help from readdirplus if the file is being written to,
+ * or if attribute caching is turned off
+ */
+static bool nfs_getattr_readdirplus_enable(const struct inode *inode)
{
- struct dentry *parent;
+ return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) &&
+ !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ;
+}
- if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
- return;
- parent = dget_parent(dentry);
+static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
+{
+ struct dentry *parent = dget_parent(dentry);
nfs_readdir_record_entry_cache_miss(d_inode(parent));
dput(parent);
}
static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
{
- struct dentry *parent;
-
- if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
- return;
- parent = dget_parent(dentry);
+ struct dentry *parent = dget_parent(dentry);
nfs_readdir_record_entry_cache_hit(d_inode(parent));
dput(parent);
}
@@ -837,6 +839,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
int err = 0;
bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
bool do_update = false;
+ bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode);
trace_nfs_getattr_enter(inode);
@@ -845,7 +848,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
STATX_INO | STATX_SIZE | STATX_BLOCKS;
if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
- nfs_readdirplus_parent_cache_hit(path->dentry);
+ if (readdirplus_enabled)
+ nfs_readdirplus_parent_cache_hit(path->dentry);
goto out_no_revalidate;
}
@@ -895,15 +899,12 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
if (do_update) {
- /* Update the attribute cache */
- if (!(server->flags & NFS_MOUNT_NOAC))
+ if (readdirplus_enabled)
nfs_readdirplus_parent_cache_miss(path->dentry);
- else
- nfs_readdirplus_parent_cache_hit(path->dentry);
err = __nfs_revalidate_inode(server, inode);
if (err)
goto out;
- } else
+ } else if (readdirplus_enabled)
nfs_readdirplus_parent_cache_hit(path->dentry);
out_no_revalidate:
/* Only return attributes that were revalidated. */
--
2.35.1
On Mon, 2022-02-21 at 16:10 -0500, Benjamin Coddington wrote:
> On 21 Feb 2022, at 15:55, Trond Myklebust wrote:
> >
> > We will always need the ability to cut over to uncached readdir.
>
> Yes.
>
> > If the cookie is no longer returned by the server because one or
> > more
> > files were deleted then we need to resolve the situation somehow
> > (IOW:
> > the 'rm *' case). The new algorithm _does_ improve performance on
> > those
> > situations, because it no longer requires us to read the entire
> > directory before switching over: we try 5 times, then fail over.
>
> Yes, using per-page validation doesn't remove the need for uncached
> readdir.
> It does allow a reader to simply resume filling the cache where it
> left
> off. There's no need to try 5 times and fail over. And there's no
> need to
> make a trade-off and make the situation worse in certain scenarios.
>
> I thought I'd point that out and make an offer to re-submit it. Any
> interest?
>
As I recall, I had concerns about that approach. Can you explain again
how it will work?
A few of the concerns I have revolve around telldir()/seekdir(). If the
platform is 32-bit, then we cannot use cookies as the telldir() output,
and so our only option is to use offsets into the page cache (this is
why this patch carves out an exception if desc->dir_cookie == 0). How
would that work with your scheme?
Even in the 64-bit case where we are able to use cookies for
telldir()/seekdir(), how do we determine an appropriate page index
after a seekdir()?
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Mon, 2022-02-21 at 11:45 -0500, Benjamin Coddington wrote:
> On 21 Feb 2022, at 11:08, [email protected] wrote:
>
> > From: Trond Myklebust <[email protected]>
> >
> > When reading a very large directory, we want to try to keep the
> > page
> > cache up to date if doing so is inexpensive. Right now, we will try
> > to
> > refill the page cache if it is non-empty, irrespective of whether
> > or not
> > doing so is going to take a long time.
> >
> > Replace that algorithm with something that looks at how many times
> > we've
> > refilled the page cache without seeing a cache hit.
>
> Hi Trond, I've been following your work here - thanks for it.
>
> I'm wondering if there might be a regression on this patch for the
> case
> where two or more directory readers are part way through a large
> directory
> when the pagecache is truncated. If I'm reading this correctly,
> those
> readers will stop caching after 5 fills and finish the remainder of
> their
> directory reads in the uncached mode.
>
> Isn't there an OP amplification per reader in this case?
>
Depends... In the old case, we basically stopped doing uncached readdir
if a third process starts filling the page cache again. In particular,
this means we were vulnerable to restarting over and over once page
reclaim starts to kick in for very large directories.
In this new one, we have each process give it a try (5 fills each), and
then fallback to uncached. Yes, there will be corner cases where this
will perform less well than the old algorithm, but it should also be
more deterministic.
I am open to suggestions for better ways to determine when to cut over
to uncached readdir. This is one way, that I think is better than what
we have, however I'm sure it can be improved upon.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Mon, 2022-02-21 at 15:22 -0500, Benjamin Coddington wrote:
> On 21 Feb 2022, at 14:58, Trond Myklebust wrote:
>
> > On Mon, 2022-02-21 at 11:45 -0500, Benjamin Coddington wrote:
> > > On 21 Feb 2022, at 11:08, [email protected] wrote:
> > >
> > > > From: Trond Myklebust <[email protected]>
> > > >
> > > > When reading a very large directory, we want to try to keep the
> > > > page
> > > > cache up to date if doing so is inexpensive. Right now, we will
> > > > try
> > > > to
> > > > refill the page cache if it is non-empty, irrespective of
> > > > whether
> > > > or not
> > > > doing so is going to take a long time.
> > > >
> > > > Replace that algorithm with something that looks at how many
> > > > times
> > > > we've
> > > > refilled the page cache without seeing a cache hit.
> > >
> > > Hi Trond, I've been following your work here - thanks for it.
> > >
> > > I'm wondering if there might be a regression on this patch for
> > > the
> > > case
> > > where two or more directory readers are part way through a large
> > > directory
> > > when the pagecache is truncated. If I'm reading this correctly,
> > > those
> > > readers will stop caching after 5 fills and finish the remainder
> > > of
> > > their
> > > directory reads in the uncached mode.
> > >
> > > Isn't there an OP amplification per reader in this case?
> > >
> >
> > Depends... In the old case, we basically stopped doing uncached
> > readdir
> > if a third process starts filling the page cache again. In
> > particular,
> > this means we were vulnerable to restarting over and over once page
> > reclaim starts to kick in for very large directories.
> >
> > In this new one, we have each process give it a try (5 fills each),
> > and
> > then fallback to uncached. Yes, there will be corner cases where
> > this
> > will perform less well than the old algorithm, but it should also
> > be
> > more deterministic.
> >
> > I am open to suggestions for better ways to determine when to cut
> > over
> > to uncached readdir. This is one way, that I think is better than
> > what
> > we have, however I'm sure it can be improved upon.
>
> I still have old patches that allow each page to be "versioned" with
> the
> change attribute, page_index, and cookie. This allows the page cache
> to be
> culled page-by-page, and multiple fillers can continue to fill pages
> at
> "headless" page offsets that match their original cookie and
> page_index
> pair. This change would mean readers don't have to start over
> filling the
> page cache when the cache is dropped, so we wouldn't need to worry
> about
> when to cut over to the uncached mode - it makes the problem go away.
>
> I felt there wasn't much interest in this work, and our most vocal
> customer
> was happy enough with last winter's readdir improvements (thanks!)
> that I
> didn't follow up, but I can refresh those patches and send them along
> again.
>
We will always need the ability to cut over to uncached readdir.
If the cookie is no longer returned by the server because one or more
files were deleted then we need to resolve the situation somehow (IOW:
the 'rm *' case). The new algorithm _does_ improve performance on those
situations, because it no longer requires us to read the entire
directory before switching over: we try 5 times, then fail over.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 21 Feb 2022, at 18:20, Trond Myklebust wrote:
> On Mon, 2022-02-21 at 16:10 -0500, Benjamin Coddington wrote:
>> On 21 Feb 2022, at 15:55, Trond Myklebust wrote:
>>>
>>> We will always need the ability to cut over to uncached readdir.
>>
>> Yes.
>>
>>> If the cookie is no longer returned by the server because one or more
>>> files were deleted then we need to resolve the situation somehow (IOW:
>>> the 'rm *' case). The new algorithm _does_ improve performance on those
>>> situations, because it no longer requires us to read the entire
>>> directory before switching over: we try 5 times, then fail over.
>>
>> Yes, using per-page validation doesn't remove the need for uncached
>> readdir. It does allow a reader to simply resume filling the cache where
>> it left off. There's no need to try 5 times and fail over. And there's
>> no need to make a trade-off and make the situation worse in certain
>> scenarios.
>>
>> I thought I'd point that out and make an offer to re-submit it. Any
>> interest?
>>
>
> As I recall, I had concerns about that approach. Can you explain again
> how it will work?
Every page of readdir results has the directory's change attr stored on the
page. That, along with the page's index and the first cookie is enough
information to determine if the page's data can be used by another process.
Which means that when the pagecache is dropped, fillers don't have to restart
filling the cache at page index 0, they can continue to fill at whatever
index they were at previously. If another process finds a page that doesn't
match its page index, cookie, and the current directory's change attr, the
page is dropped and refilled from that process' indexing.
> A few of the concerns I have revolve around telldir()/seekdir(). If the
> platform is 32-bit, then we cannot use cookies as the telldir() output,
> and so our only option is to use offsets into the page cache (this is
> why this patch carves out an exception if desc->dir_cookie == 0). How
> would that work with your scheme?
For 32-bit seekdir, pages are filled starting at index 0. This is very
unlikely to match other readers (unless they also do the _same_ seekdir).
> Even in the 64-bit case where we are able to use cookies for
> telldir()/seekdir(), how do we determine an appropriate page index
> after a seekdir()?
We don't. Instead we start filling at index 0. Again, the pagecache will
only be useful to other processes that have done the same llseek.
This approach optimizes the pagecache for processes that are doing similar
work, and has the added benefit of scaling well for large directory listings
under memory pressure. Also a number of classes of directory modifications
(such as renames, or insertions/deletions at locations a reader has moved
beyond) are no longer a reason to re-fill the pagecache from scratch.
Ben
On 22 Feb 2022, at 15:11, Trond Myklebust wrote:
> On Tue, 2022-02-22 at 07:50 -0500, Benjamin Coddington wrote:
>> On 21 Feb 2022, at 18:20, Trond Myklebust wrote:
>>
>>> On Mon, 2022-02-21 at 16:10 -0500, Benjamin Coddington wrote:
>>>> On 21 Feb 2022, at 15:55, Trond Myklebust wrote:
>>>>>
>>>>> We will always need the ability to cut over to uncached
>>>>> readdir.
>>>>
>>>> Yes.
>>>>
>>>>> If the cookie is no longer returned by the server because one
>>>>> or more
>>>>> files were deleted then we need to resolve the situation
>>>>> somehow (IOW:
>>>>> the 'rm *' case). The new algorithm _does_ improve performance
>>>>> on those
>>>>> situations, because it no longer requires us to read the entire
>>>>> directory before switching over: we try 5 times, then fail
>>>>> over.
>>>>
>>>> Yes, using per-page validation doesn't remove the need for
>>>> uncached
>>>> readdir. It does allow a reader to simply resume filling the
>>>> cache where
>>>> it left off. There's no need to try 5 times and fail over. And
>>>> there's
>>>> no need to make a trade-off and make the situation worse in
>>>> certain
>>>> scenarios.
>>>>
>>>> I thought I'd point that out and make an offer to re-submit it.
>>>> Any
>>>> interest?
>>>>
>>>
>>> As I recall, I had concerns about that approach. Can you explain
>>> again
>>> how it will work?
>>
>> Every page of readdir results has the directory's change attr stored
>> on the
>> page. That, along with the page's index and the first cookie is
>> enough
>> information to determine if the page's data can be used by another
>> process.
>>
>> Which means that when the pagecache is dropped, fillers don't have to
>> restart
>> filling the cache at page index 0, they can continue to fill at
>> whatever
>> index they were at previously. If another process finds a page that
>> doesn't
>> match its page index, cookie, and the current directory's change
>> attr, the
>> page is dropped and refilled from that process' indexing.
>>
>>> A few of the concerns I have revolve around telldir()/seekdir(). If
>>> the
>>> platform is 32-bit, then we cannot use cookies as the telldir()
>>> output,
>>> and so our only option is to use offsets into the page cache (this
>>> is
>>> why this patch carves out an exception if desc->dir_cookie == 0).
>>> How
>>> would that work with your scheme?
>>
>> For 32-bit seekdir, pages are filled starting at index 0. This is
>> very
>> unlikely to match other readers (unless they also do the _same_
>> seekdir).
>>
>>> Even in the 64-bit case where we are able to use cookies for
>>> telldir()/seekdir(), how do we determine an appropriate page index
>>> after a seekdir()?
>>
>> We don't. Instead we start filling at index 0. Again, the pagecache
>> will
>> only be useful to other processes that have done the same llseek.
>>
>> This approach optimizes the pagecache for processes that are doing
>> similar
>> work, and has the added benefit of scaling well for large directory
>> listings
>> under memory pressure. Also a number of classes of directory
>> modifications
>> (such as renames, or insertions/deletions at locations a reader has
>> moved
>> beyond) are no longer a reason to re-fill the pagecache from scratch.
>>
>
> OK, you've got me more or less sold on it.
>
> I'd still like to figure out how to improve the performance for seekdir
> (since I do have an interest in re-exporting NFS) but I've been playing
> around with a couple of patches that implement your concept and they do
> seem to work well for the common case of a linear read through the
> directory.
Nice. I have another version from the one I originally posted:
https://lore.kernel.org/linux-nfs/[email protected]/
.. but I don't remember exactly the changes and it needs rebasing. Should I
rebase it against your testing branch and send the result?
Ben
On Tue, 2022-02-22 at 15:21 -0500, Benjamin Coddington wrote:
> On 22 Feb 2022, at 15:11, Trond Myklebust wrote:
>
> > On Tue, 2022-02-22 at 07:50 -0500, Benjamin Coddington wrote:
> > > On 21 Feb 2022, at 18:20, Trond Myklebust wrote:
> > >
> > > > On Mon, 2022-02-21 at 16:10 -0500, Benjamin Coddington wrote:
> > > > > On 21 Feb 2022, at 15:55, Trond Myklebust wrote:
> > > > > >
> > > > > > We will always need the ability to cut over to uncached
> > > > > > readdir.
> > > > >
> > > > > Yes.
> > > > >
> > > > > > If the cookie is no longer returned by the server because
> > > > > > one
> > > > > > or more
> > > > > > files were deleted then we need to resolve the situation
> > > > > > somehow (IOW:
> > > > > > the 'rm *' case). The new algorithm _does_ improve
> > > > > > performance
> > > > > > on those
> > > > > > situations, because it no longer requires us to read the
> > > > > > entire
> > > > > > directory before switching over: we try 5 times, then fail
> > > > > > over.
> > > > >
> > > > > Yes, using per-page validation doesn't remove the need for
> > > > > uncached
> > > > > readdir. It does allow a reader to simply resume filling the
> > > > > cache where
> > > > > it left off. There's no need to try 5 times and fail over.
> > > > > And
> > > > > there's
> > > > > no need to make a trade-off and make the situation worse in
> > > > > certain
> > > > > scenarios.
> > > > >
> > > > > I thought I'd point that out and make an offer to re-submit
> > > > > it.
> > > > > Any
> > > > > interest?
> > > > >
> > > >
> > > > As I recall, I had concerns about that approach. Can you
> > > > explain
> > > > again
> > > > how it will work?
> > >
> > > Every page of readdir results has the directory's change attr
> > > stored
> > > on the
> > > page. That, along with the page's index and the first cookie is
> > > enough
> > > information to determine if the page's data can be used by
> > > another
> > > process.
> > >
> > > Which means that when the pagecache is dropped, fillers don't
> > > have to
> > > restart
> > > filling the cache at page index 0, they can continue to fill at
> > > whatever
> > > index they were at previously. If another process finds a page
> > > that
> > > doesn't
> > > match its page index, cookie, and the current directory's change
> > > attr, the
> > > page is dropped and refilled from that process' indexing.
> > >
> > > > A few of the concerns I have revolve around
> > > > telldir()/seekdir(). If
> > > > the
> > > > platform is 32-bit, then we cannot use cookies as the telldir()
> > > > output,
> > > > and so our only option is to use offsets into the page cache
> > > > (this
> > > > is
> > > > why this patch carves out an exception if desc->dir_cookie ==
> > > > 0).
> > > > How
> > > > would that work with your scheme?
> > >
> > > For 32-bit seekdir, pages are filled starting at index 0. This
> > > is
> > > very
> > > unlikely to match other readers (unless they also do the _same_
> > > seekdir).
> > >
> > > > Even in the 64-bit case where we are able to use cookies for
> > > > telldir()/seekdir(), how do we determine an appropriate page
> > > > index
> > > > after a seekdir()?
> > >
> > > We don't. Instead we start filling at index 0. Again, the
> > > pagecache
> > > will
> > > only be useful to other processes that have done the same llseek.
> > >
> > > This approach optimizes the pagecache for processes that are
> > > doing
> > > similar
> > > work, and has the added benefit of scaling well for large
> > > directory
> > > listings
> > > under memory pressure. Also a number of classes of directory
> > > modifications
> > > (such as renames, or insertions/deletions at locations a reader
> > > has
> > > moved
> > > beyond) are no longer a reason to re-fill the pagecache from
> > > scratch.
> > >
> >
> > OK, you've got me more or less sold on it.
> >
> > I'd still like to figure out how to improve the performance for
> > seekdir
> > (since I do have an interest in re-exporting NFS) but I've been
> > playing
> > around with a couple of patches that implement your concept and
> > they do
> > seem to work well for the common case of a linear read through the
> > directory.
>
> Nice. I have another version from the one I originally posted:
> https://lore.kernel.org/linux-nfs/[email protected]/
>
> .. but I don't remember exactly the changes and it needs rebasing.
> Should I
> rebase it against your testing branch and send the result?
My 2 patches did something slightly different to yours, storing the
change attribute in the array header instead of in page_private. That
makes for a slightly smaller change.
Having further slept on the idea, I think I know how to solve the
seekdir() issue, at least for the cases where we can use cookies as
telldir()/seekdir() offsets. If we ditch the linear index, and instead
use a hash of the desc->last_cookie as the index, then we can make
random access to the directories work with your idea.
There are a couple of further problems with this concept, but I think
those are solvable. The issues I have identified so far are:
* invalidate_inode_pages2_range() no longer works to clear out the
page cache in a predictable way for the readdirplus heuristic.
* duplicate cookie detection needs to be changed.
I think those are solvable problems.
--
Trond Myklebust Linux NFS client maintainer, Hammerspace
[email protected]
On Tue, 2022-02-22 at 07:50 -0500, Benjamin Coddington wrote:
> On 21 Feb 2022, at 18:20, Trond Myklebust wrote:
>
> > On Mon, 2022-02-21 at 16:10 -0500, Benjamin Coddington wrote:
> > > On 21 Feb 2022, at 15:55, Trond Myklebust wrote:
> > > >
> > > > We will always need the ability to cut over to uncached
> > > > readdir.
> > >
> > > Yes.
> > >
> > > > If the cookie is no longer returned by the server because one
> > > > or more
> > > > files were deleted then we need to resolve the situation
> > > > somehow (IOW:
> > > > the 'rm *' case). The new algorithm _does_ improve performance
> > > > on those
> > > > situations, because it no longer requires us to read the entire
> > > > directory before switching over: we try 5 times, then fail
> > > > over.
> > >
> > > Yes, using per-page validation doesn't remove the need for
> > > uncached
> > > readdir. It does allow a reader to simply resume filling the
> > > cache where
> > > it left off. There's no need to try 5 times and fail over. And
> > > there's
> > > no need to make a trade-off and make the situation worse in
> > > certain
> > > scenarios.
> > >
> > > I thought I'd point that out and make an offer to re-submit it.
> > > Any
> > > interest?
> > >
> >
> > As I recall, I had concerns about that approach. Can you explain
> > again
> > how it will work?
>
> Every page of readdir results has the directory's change attr stored
> on the
> page. That, along with the page's index and the first cookie is
> enough
> information to determine if the page's data can be used by another
> process.
>
> Which means that when the pagecache is dropped, fillers don't have to
> restart
> filling the cache at page index 0, they can continue to fill at
> whatever
> index they were at previously. If another process finds a page that
> doesn't
> match its page index, cookie, and the current directory's change
> attr, the
> page is dropped and refilled from that process' indexing.
>
> > A few of the concerns I have revolve around telldir()/seekdir(). If
> > the
> > platform is 32-bit, then we cannot use cookies as the telldir()
> > output,
> > and so our only option is to use offsets into the page cache (this
> > is
> > why this patch carves out an exception if desc->dir_cookie == 0).
> > How
> > would that work with your scheme?
>
> For 32-bit seekdir, pages are filled starting at index 0. This is
> very
> unlikely to match other readers (unless they also do the _same_
> seekdir).
>
> > Even in the 64-bit case where we are able to use cookies for
> > telldir()/seekdir(), how do we determine an appropriate page index
> > after a seekdir()?
>
> We don't. Instead we start filling at index 0. Again, the pagecache
> will
> only be useful to other processes that have done the same llseek.
>
> This approach optimizes the pagecache for processes that are doing
> similar
> work, and has the added benefit of scaling well for large directory
> listings
> under memory pressure. Also a number of classes of directory
> modifications
> (such as renames, or insertions/deletions at locations a reader has
> moved
> beyond) are no longer a reason to re-fill the pagecache from scratch.
>
OK, you've got me more or less sold on it.
I'd still like to figure out how to improve the performance for seekdir
(since I do have an interest in re-exporting NFS) but I've been playing
around with a couple of patches that implement your concept and they do
seem to work well for the common case of a linear read through the
directory.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Wed, 2022-02-23 at 08:34 -0500, Benjamin Coddington wrote:
> On 23 Feb 2022, at 7:17, Trond Myklebust wrote:
>
> > On Tue, 2022-02-22 at 15:21 -0500, Benjamin Coddington wrote:
> > > On 22 Feb 2022, at 15:11, Trond Myklebust wrote:
> > >
> > > > On Tue, 2022-02-22 at 07:50 -0500, Benjamin Coddington wrote:
> > > > > On 21 Feb 2022, at 18:20, Trond Myklebust wrote:
> > > > >
> > > > > > On Mon, 2022-02-21 at 16:10 -0500, Benjamin Coddington
> > > > > > wrote:
> > > > > > > On 21 Feb 2022, at 15:55, Trond Myklebust wrote:
> > > > > > > >
> > > > > > > > We will always need the ability to cut over to uncached
> > > > > > > > readdir.
> > > > > > >
> > > > > > > Yes.
> > > > > > >
> > > > > > > > If the cookie is no longer returned by the server
> > > > > > > > because
> > > > > > > > one
> > > > > > > > or more
> > > > > > > > files were deleted then we need to resolve the
> > > > > > > > situation
> > > > > > > > somehow (IOW:
> > > > > > > > the 'rm *' case). The new algorithm _does_ improve
> > > > > > > > performance
> > > > > > > > on those
> > > > > > > > situations, because it no longer requires us to read
> > > > > > > > the
> > > > > > > > entire
> > > > > > > > directory before switching over: we try 5 times, then
> > > > > > > > fail
> > > > > > > > over.
> > > > > > >
> > > > > > > Yes, using per-page validation doesn't remove the need
> > > > > > > for
> > > > > > > uncached
> > > > > > > readdir. It does allow a reader to simply resume filling
> > > > > > > the
> > > > > > > cache where
> > > > > > > it left off. There's no need to try 5 times and fail
> > > > > > > over.
> > > > > > > And
> > > > > > > there's
> > > > > > > no need to make a trade-off and make the situation worse
> > > > > > > in
> > > > > > > certain
> > > > > > > scenarios.
> > > > > > >
> > > > > > > I thought I'd point that out and make an offer to re-
> > > > > > > submit
> > > > > > > it.
> > > > > > > Any
> > > > > > > interest?
> > > > > > >
> > > > > >
> > > > > > As I recall, I had concerns about that approach. Can you
> > > > > > explain
> > > > > > again
> > > > > > how it will work?
> > > > >
> > > > > Every page of readdir results has the directory's change attr
> > > > > stored
> > > > > on the
> > > > > page. That, along with the page's index and the first cookie
> > > > > is
> > > > > enough
> > > > > information to determine if the page's data can be used by
> > > > > another
> > > > > process.
> > > > >
> > > > > Which means that when the pagecache is dropped, fillers don't
> > > > > have to
> > > > > restart
> > > > > filling the cache at page index 0, they can continue to fill
> > > > > at
> > > > > whatever
> > > > > index they were at previously. If another process finds a
> > > > > page
> > > > > that
> > > > > doesn't
> > > > > match its page index, cookie, and the current directory's
> > > > > change
> > > > > attr, the
> > > > > page is dropped and refilled from that process' indexing.
> > > > >
> > > > > > A few of the concerns I have revolve around
> > > > > > telldir()/seekdir(). If
> > > > > > the
> > > > > > platform is 32-bit, then we cannot use cookies as the
> > > > > > telldir()
> > > > > > output,
> > > > > > and so our only option is to use offsets into the page
> > > > > > cache
> > > > > > (this
> > > > > > is
> > > > > > why this patch carves out an exception if desc->dir_cookie
> > > > > > ==
> > > > > > 0).
> > > > > > How
> > > > > > would that work with your scheme?
> > > > >
> > > > > For 32-bit seekdir, pages are filled starting at index 0.
> > > > > This
> > > > > is
> > > > > very
> > > > > unlikely to match other readers (unless they also do the
> > > > > _same_
> > > > > seekdir).
> > > > >
> > > > > > Even in the 64-bit case where we are able to use cookies for
> > > > > > telldir()/seekdir(), how do we determine an appropriate
> > > > > > page
> > > > > > index
> > > > > > after a seekdir()?
> > > > >
> > > > > We don't. Instead we start filling at index 0. Again, the
> > > > > pagecache
> > > > > will
> > > > > only be useful to other processes that have done the same
> > > > > llseek.
> > > > >
> > > > > This approach optimizes the pagecache for processes that are
> > > > > doing
> > > > > similar
> > > > > work, and has the added benefit of scaling well for large
> > > > > directory
> > > > > listings
> > > > > under memory pressure. Also a number of classes of directory
> > > > > modifications
> > > > > (such as renames, or insertions/deletions at locations a
> > > > > reader
> > > > > has
> > > > > moved
> > > > > beyond) are no longer a reason to re-fill the pagecache from
> > > > > scratch.
> > > > >
> > > >
> > > > OK, you've got me more or less sold on it.
> > > >
> > > > I'd still like to figure out how to improve the performance for
> > > > seekdir
> > > > (since I do have an interest in re-exporting NFS) but I've been
> > > > playing
> > > > around with a couple of patches that implement your concept and
> > > > they do
> > > > seem to work well for the common case of a linear read through
> > > > the
> > > > directory.
> > >
> > > Nice. I have another version from the one I originally posted:
> > > https://lore.kernel.org/linux-nfs/[email protected]/
> > >
> > > .. but I don't remember exactly the changes and it needs
> > > rebasing.
> > > Should I
> > > rebase it against your testing branch and send the result?
> >
> > My 2 patches did something slightly different to yours, storing the
> > change attribute in the array header instead of in page_private.
> > That
> > makes for a slightly smaller change.
>
> I worried that increasing the size of the array header wouldn't allow
> us
> to
> store as many entries per page.
The struct nfs_cache_array header is 24 bytes long with the change
attribute (as opposed to 16 bytes without it). This size is independent
of the architecture, assuming that 'unsigned int' is 32-bits and
unsigned char is 8-bits (as is always the case on Linux).
On a 64-bit system, a single struct nfs_cache_array_entry ends up being
32 bytes long. So in a standard 4k page with a 24-byte or 16-byte
header you will be able to cache exactly 127 cache array entries.
On a 32-bit system, the cache entry is 28 bytes long (the difference
being the pointer length), and you can pack 145 entries in the 4k page.
IOW: The change in header size makes no difference to the number of
entries you can cache, because in both cases, the header remains
smaller than a single entry.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 23 Feb 2022, at 7:17, Trond Myklebust wrote:
> On Tue, 2022-02-22 at 15:21 -0500, Benjamin Coddington wrote:
>> On 22 Feb 2022, at 15:11, Trond Myklebust wrote:
>>
>>> On Tue, 2022-02-22 at 07:50 -0500, Benjamin Coddington wrote:
>>>> On 21 Feb 2022, at 18:20, Trond Myklebust wrote:
>>>>
>>>>> On Mon, 2022-02-21 at 16:10 -0500, Benjamin Coddington wrote:
>>>>>> On 21 Feb 2022, at 15:55, Trond Myklebust wrote:
>>>>>>>
>>>>>>> We will always need the ability to cut over to uncached
>>>>>>> readdir.
>>>>>>
>>>>>> Yes.
>>>>>>
>>>>>>> If the cookie is no longer returned by the server because
>>>>>>> one
>>>>>>> or more
>>>>>>> files were deleted then we need to resolve the situation
>>>>>>> somehow (IOW:
>>>>>>> the 'rm *' case). The new algorithm _does_ improve
>>>>>>> performance
>>>>>>> on those
>>>>>>> situations, because it no longer requires us to read the
>>>>>>> entire
>>>>>>> directory before switching over: we try 5 times, then fail
>>>>>>> over.
>>>>>>
>>>>>> Yes, using per-page validation doesn't remove the need for
>>>>>> uncached
>>>>>> readdir. It does allow a reader to simply resume filling the
>>>>>> cache where
>>>>>> it left off. There's no need to try 5 times and fail over.
>>>>>> And
>>>>>> there's
>>>>>> no need to make a trade-off and make the situation worse in
>>>>>> certain
>>>>>> scenarios.
>>>>>>
>>>>>> I thought I'd point that out and make an offer to re-submit
>>>>>> it.
>>>>>> Any
>>>>>> interest?
>>>>>>
>>>>>
>>>>> As I recall, I had concerns about that approach. Can you
>>>>> explain
>>>>> again
>>>>> how it will work?
>>>>
>>>> Every page of readdir results has the directory's change attr
>>>> stored
>>>> on the
>>>> page. That, along with the page's index and the first cookie is
>>>> enough
>>>> information to determine if the page's data can be used by
>>>> another
>>>> process.
>>>>
>>>> Which means that when the pagecache is dropped, fillers don't
>>>> have to
>>>> restart
>>>> filling the cache at page index 0, they can continue to fill at
>>>> whatever
>>>> index they were at previously. If another process finds a page
>>>> that
>>>> doesn't
>>>> match its page index, cookie, and the current directory's change
>>>> attr, the
>>>> page is dropped and refilled from that process' indexing.
>>>>
>>>>> A few of the concerns I have revolve around
>>>>> telldir()/seekdir(). If
>>>>> the
>>>>> platform is 32-bit, then we cannot use cookies as the telldir()
>>>>> output,
>>>>> and so our only option is to use offsets into the page cache
>>>>> (this
>>>>> is
>>>>> why this patch carves out an exception if desc->dir_cookie ==
>>>>> 0).
>>>>> How
>>>>> would that work with your scheme?
>>>>
>>>> For 32-bit seekdir, pages are filled starting at index 0. This
>>>> is
>>>> very
>>>> unlikely to match other readers (unless they also do the _same_
>>>> seekdir).
>>>>
>>>>> Even in the 64-bit case where we are able to use cookies for
>>>>> telldir()/seekdir(), how do we determine an appropriate page
>>>>> index
>>>>> after a seekdir()?
>>>>
>>>> We don't. Instead we start filling at index 0. Again, the
>>>> pagecache
>>>> will
>>>> only be useful to other processes that have done the same llseek.
>>>>
>>>> This approach optimizes the pagecache for processes that are
>>>> doing
>>>> similar
>>>> work, and has the added benefit of scaling well for large
>>>> directory
>>>> listings
>>>> under memory pressure. Also a number of classes of directory
>>>> modifications
>>>> (such as renames, or insertions/deletions at locations a reader
>>>> has
>>>> moved
>>>> beyond) are no longer a reason to re-fill the pagecache from
>>>> scratch.
>>>>
>>>
>>> OK, you've got me more or less sold on it.
>>>
>>> I'd still like to figure out how to improve the performance for
>>> seekdir
>>> (since I do have an interest in re-exporting NFS) but I've been
>>> playing
>>> around with a couple of patches that implement your concept and
>>> they do
>>> seem to work well for the common case of a linear read through the
>>> directory.
>>
>> Nice. I have another version from the one I originally posted:
>> https://lore.kernel.org/linux-nfs/[email protected]/
>>
>> .. but I don't remember exactly the changes and it needs rebasing.
>> Should I
>> rebase it against your testing branch and send the result?
>
> My 2 patches did something slightly different to yours, storing the
> change attribute in the array header instead of in page_private. That
> makes for a slightly smaller change.
I worried that increasing the size of the array header wouldn't allow us
to
store as many entries per page.
> Having further slept on the idea, I think I know how to solve the
> seekdir() issue, at least for the cases where we can use cookies as
> telldir()/seekdir() offsets. If we ditch the linear index, and instead
> use a hash of the desc->last_cookie as the index, then we can make
> random access to the directories work with your idea.
Yes! Nice!
> There are a couple of further problems with this concept, but I think
> those are solvable. The issues I have identified so far are:
>
> * invalidate_inode_pages2_range() no longer works to clear out the
> page cache in a predictable way for the readdirplus heuristic.
> * duplicate cookie detection needs to be changed.
I don't understand those problems, yet. I have a few other things I
have to
do, but after those I will come back and give them some attention.
FWIW, I rebased that old series of page-validation work on your
e85f86150b32 NFS: Trace effects of the readdirplus heuristic
I'll send it as a reply to [ v6 13/13] in this posting, but I've only
compile-tested it so far.
Ben