2022-02-28 07:38:30

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v9 22/27] NFS: Clean up page array initialisation/free

From: Trond Myklebust <[email protected]>

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 483bb67d2ace..95a29a973dc8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -199,20 +199,17 @@ static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc)
nfs_set_dtsize(desc, desc->dtsize << 1);
}

-static void nfs_readdir_array_init(struct nfs_cache_array *array)
-{
- memset(array, 0, sizeof(struct nfs_cache_array));
-}
-
static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
u64 change_attr)
{
struct nfs_cache_array *array;

array = kmap_atomic(page);
- nfs_readdir_array_init(array);
array->change_attr = change_attr;
array->last_cookie = last_cookie;
+ array->size = 0;
+ array->page_full = 0;
+ array->page_is_eof = 0;
array->cookies_are_ordered = 1;
kunmap_atomic(array);
}
@@ -220,16 +217,15 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
/*
* we are freeing strings created by nfs_add_to_readdir_array()
*/
-static
-void nfs_readdir_clear_array(struct page *page)
+static void nfs_readdir_clear_array(struct page *page)
{
struct nfs_cache_array *array;
- int i;
+ unsigned int i;

array = kmap_atomic(page);
for (i = 0; i < array->size; i++)
kfree(array->array[i].name);
- nfs_readdir_array_init(array);
+ array->size = 0;
kunmap_atomic(array);
}

--
2.35.1


2022-02-28 08:55:12

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v9 23/27] NFS: Convert readdir page cache to use a cookie based index

From: Trond Myklebust <[email protected]>

Instead of using a linear index to address the pages, use the cookie of
the first entry, since that is what we use to match the page anyway.

This allows us to avoid re-reading the entire cache on a seekdir() type
of operation. The latter is very common when re-exporting NFS, and is a
major performance drain.

The change does affect our duplicate cookie detection, since we can no
longer rely on the page index as a linear offset for detecting whether
we looped backwards. However since we no longer do a linear search
through all the pages on each call to nfs_readdir(), this is less of a
concern than it was previously.
The other downside is that invalidate_mapping_pages() no longer can use
the page index to avoid clearing pages that have been read. A subsequent
patch will restore the functionality this provides to the 'ls -l'
heuristic.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/Kconfig | 4 ++
fs/nfs/dir.c | 149 ++++++++++++++++++-----------------------
include/linux/nfs_fs.h | 2 -
3 files changed, 69 insertions(+), 86 deletions(-)

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 14a72224b657..47a53b3362b6 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -4,6 +4,10 @@ config NFS_FS
depends on INET && FILE_LOCKING && MULTIUSER
select LOCKD
select SUNRPC
+ select CRYPTO
+ select CRYPTO_HASH
+ select XXHASH
+ select CRYPTO_XXHASH
select NFS_ACL_SUPPORT if NFS_V3_ACL
help
Choose Y here if you want to access files residing on other
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 95a29a973dc8..707ad0fd5a4e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -39,6 +39,7 @@
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/xattr.h>
+#include <linux/xxhash.h>

#include "delegation.h"
#include "iostat.h"
@@ -159,9 +160,7 @@ struct nfs_readdir_descriptor {
pgoff_t page_index_max;
u64 dir_cookie;
u64 last_cookie;
- u64 dup_cookie;
loff_t current_index;
- loff_t prev_index;

__be32 verf[NFS_DIR_VERIFIER_SIZE];
unsigned long dir_verifier;
@@ -171,7 +170,6 @@ struct nfs_readdir_descriptor {
unsigned int cache_entry_index;
unsigned int buffer_fills;
unsigned int dtsize;
- signed char duped;
bool plus;
bool eob;
bool eof;
@@ -331,6 +329,28 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
return ret;
}

+#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14)
+/*
+ * Hash algorithm allowing content addressible access to sequences
+ * of directory cookies. Content is addressed by the value of the
+ * cookie index of the first readdir entry in a page.
+ *
+ * The xxhash algorithm is chosen because it is fast, and is supposed
+ * to result in a decent flat distribution of hashes.
+ *
+ * We then select only the first 18 bits to avoid issues with excessive
+ * memory use for the page cache XArray. 18 bits should allow the caching
+ * of 262144 pages of sequences of readdir entries. Since each page holds
+ * 127 readdir entries for a typical 64-bit system, that works out to a
+ * cache of ~ 33 million entries per directory.
+ */
+static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
+{
+ if (cookie == 0)
+ return 0;
+ return xxhash(&cookie, sizeof(cookie), 0) & NFS_READDIR_COOKIE_MASK;
+}
+
static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
u64 change_attr)
{
@@ -352,15 +372,15 @@ static void nfs_readdir_page_unlock_and_put(struct page *page)
}

static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
- pgoff_t index, u64 last_cookie)
+ u64 last_cookie,
+ u64 change_attr)
{
+ pgoff_t index = nfs_readdir_page_cookie_hash(last_cookie);
struct page *page;
- u64 change_attr;

page = grab_cache_page(mapping, index);
if (!page)
return NULL;
- change_attr = inode_peek_iversion_raw(mapping->host);
if (PageUptodate(page)) {
if (nfs_readdir_page_validate(page, last_cookie, change_attr))
return page;
@@ -371,11 +391,6 @@ static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
return page;
}

-static loff_t nfs_readdir_page_offset(struct page *page)
-{
- return (loff_t)page->index * (loff_t)nfs_readdir_array_maxentries();
-}
-
static u64 nfs_readdir_page_last_cookie(struct page *page)
{
struct nfs_cache_array *array;
@@ -408,11 +423,11 @@ static void nfs_readdir_page_set_eof(struct page *page)
}

static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
- pgoff_t index, u64 cookie)
+ u64 cookie, u64 change_attr)
{
struct page *page;

- page = nfs_readdir_page_get_locked(mapping, index, cookie);
+ page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
if (page) {
if (nfs_readdir_page_last_cookie(page) == cookie)
return page;
@@ -452,6 +467,13 @@ static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
desc->last_cookie = array->array[0].cookie;
}

+static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)
+{
+ desc->current_index = 0;
+ desc->last_cookie = 0;
+ desc->page_index = 0;
+}
+
static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
struct nfs_readdir_descriptor *desc)
{
@@ -492,8 +514,7 @@ static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
struct nfs_readdir_descriptor *desc)
{
- int i;
- loff_t new_pos;
+ unsigned int i;
int status = -EAGAIN;

if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie))
@@ -501,32 +522,10 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,

for (i = 0; i < array->size; i++) {
if (array->array[i].cookie == desc->dir_cookie) {
- struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
-
- new_pos = nfs_readdir_page_offset(desc->page) + i;
- if (desc->attr_gencount != nfsi->attr_gencount) {
- desc->duped = 0;
- desc->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->prev_index) {
- if (desc->duped > 0
- && desc->dup_cookie == desc->dir_cookie) {
- if (printk_ratelimit()) {
- pr_notice("NFS: directory %pD2 contains a readdir loop."
- "Please contact your server vendor. "
- "The file: %s has duplicate cookie %llu\n",
- desc->file, array->array[i].name, desc->dir_cookie);
- }
- status = -ELOOP;
- goto out;
- }
- desc->dup_cookie = desc->dir_cookie;
- desc->duped = -1;
- }
if (nfs_readdir_use_cookie(desc->file))
desc->ctx->pos = desc->dir_cookie;
else
- desc->ctx->pos = new_pos;
- desc->prev_index = new_pos;
+ desc->ctx->pos = desc->current_index + i;
desc->cache_entry_index = i;
return 0;
}
@@ -538,7 +537,6 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
desc->eof = true;
} else
nfs_readdir_seek_next_array(array, desc);
-out:
return status;
}

@@ -783,10 +781,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
/* Perform conversion from xdr to cache array */
static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
struct nfs_entry *entry,
- struct page **xdr_pages,
- unsigned int buflen,
- struct page **arrays,
- size_t narrays)
+ struct page **xdr_pages, unsigned int buflen,
+ struct page **arrays, size_t narrays,
+ u64 change_attr)
{
struct address_space *mapping = desc->file->f_mapping;
struct xdr_stream stream;
@@ -826,18 +823,16 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
break;
arrays++;
*arrays = page = new;
- desc->page_index_max++;
} else {
- new = nfs_readdir_page_get_next(mapping,
- page->index + 1,
- entry->prev_cookie);
+ new = nfs_readdir_page_get_next(
+ mapping, entry->prev_cookie, change_attr);
if (!new)
break;
if (page != *arrays)
nfs_readdir_page_unlock_and_put(page);
page = new;
- desc->page_index_max = new->index;
}
+ desc->page_index_max++;
status = nfs_readdir_add_to_array(entry, page);
} while (!status && !entry->eof);

@@ -897,6 +892,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
__be32 *verf_arg, __be32 *verf_res,
struct page **arrays, size_t narrays)
{
+ u64 change_attr;
struct page **pages;
struct page *page = *arrays;
struct nfs_entry *entry;
@@ -921,6 +917,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
if (!pages)
goto out;

+ change_attr = inode_peek_iversion_raw(inode);
status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages,
dtsize, verf_res);
if (status < 0)
@@ -929,7 +926,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
pglen = status;
if (pglen != 0)
status = nfs_readdir_page_filler(desc, entry, pages, pglen,
- arrays, narrays);
+ arrays, narrays, change_attr);
else
nfs_readdir_page_set_eof(page);
desc->buffer_fills++;
@@ -959,9 +956,11 @@ nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
static struct page *
nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
{
- return nfs_readdir_page_get_locked(desc->file->f_mapping,
- desc->page_index,
- desc->last_cookie);
+ struct address_space *mapping = desc->file->f_mapping;
+ u64 change_attr = inode_peek_iversion_raw(mapping->host);
+
+ return nfs_readdir_page_get_locked(mapping, desc->last_cookie,
+ change_attr);
}

/*
@@ -993,7 +992,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
trace_nfs_readdir_cache_fill_done(inode, res);
if (res == -EBADCOOKIE || res == -ENOTSYNC) {
invalidate_inode_pages2(desc->file->f_mapping);
- desc->page_index = 0;
+ nfs_readdir_rewind_search(desc);
trace_nfs_readdir_invalidate_cache_range(
inode, 0, MAX_LFS_FILESIZE);
return -EAGAIN;
@@ -1007,12 +1006,10 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) {
memcpy(nfsi->cookieverf, verf,
sizeof(nfsi->cookieverf));
- invalidate_inode_pages2_range(desc->file->f_mapping,
- desc->page_index_max + 1,
+ invalidate_inode_pages2_range(desc->file->f_mapping, 1,
-1);
trace_nfs_readdir_invalidate_cache_range(
- inode, desc->page_index_max + 1,
- MAX_LFS_FILESIZE);
+ inode, 1, MAX_LFS_FILESIZE);
}
}
res = nfs_readdir_search_array(desc);
@@ -1028,11 +1025,6 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
int res;

do {
- if (desc->page_index == 0) {
- desc->current_index = 0;
- desc->prev_index = 0;
- desc->last_cookie = 0;
- }
res = find_and_lock_cache_page(desc);
} while (res == -EAGAIN);
return res;
@@ -1070,8 +1062,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos++;
- if (desc->duped != 0)
- desc->duped = 1;
}
if (array->page_is_eof)
desc->eof = !desc->eob;
@@ -1113,7 +1103,6 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
desc->page_index = 0;
desc->cache_entry_index = 0;
desc->last_cookie = desc->dir_cookie;
- desc->duped = 0;
desc->page_index_max = 0;

trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
@@ -1146,6 +1135,8 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
for (i = 0; i < sz && arrays[i]; i++)
nfs_readdir_page_array_free(arrays[i]);
out:
+ if (!nfs_readdir_use_cookie(desc->file))
+ nfs_readdir_rewind_search(desc);
desc->page_index_max = -1;
kfree(arrays);
dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
@@ -1156,17 +1147,14 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)

static void nfs_readdir_handle_cache_misses(struct inode *inode,
struct nfs_readdir_descriptor *desc,
- pgoff_t page_index,
unsigned int cache_misses)
{
if (desc->ctx->pos == 0 ||
cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD)
return;
- if (invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1) == 0)
+ if (invalidate_mapping_pages(inode->i_mapping, 0, -1) == 0)
return;
- trace_nfs_readdir_invalidate_cache_range(
- inode, (loff_t)(page_index + 1) << PAGE_SHIFT,
- MAX_LFS_FILESIZE);
+ trace_nfs_readdir_invalidate_cache_range(inode, 0, MAX_LFS_FILESIZE);
}

/* The file offset position represents the dirent entry number. A
@@ -1181,7 +1169,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
unsigned int cache_hits, cache_misses;
- pgoff_t page_index;
int res;

dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -1206,10 +1193,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)

spin_lock(&file->f_lock);
desc->dir_cookie = dir_ctx->dir_cookie;
- desc->dup_cookie = dir_ctx->dup_cookie;
- desc->duped = dir_ctx->duped;
- page_index = dir_ctx->page_index;
- desc->page_index = page_index;
+ desc->page_index = dir_ctx->page_index;
desc->last_cookie = dir_ctx->last_cookie;
desc->attr_gencount = dir_ctx->attr_gencount;
desc->eof = dir_ctx->eof;
@@ -1225,7 +1209,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
}

desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
- nfs_readdir_handle_cache_misses(inode, desc, page_index, cache_misses);
+ nfs_readdir_handle_cache_misses(inode, desc, cache_misses);

do {
res = readdir_search_pagecache(desc);
@@ -1245,7 +1229,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
}
if (res == -ETOOSMALL && desc->plus) {
nfs_zap_caches(inode);
- desc->page_index = 0;
desc->plus = false;
desc->eof = false;
continue;
@@ -1259,9 +1242,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)

spin_lock(&file->f_lock);
dir_ctx->dir_cookie = desc->dir_cookie;
- dir_ctx->dup_cookie = desc->dup_cookie;
dir_ctx->last_cookie = desc->last_cookie;
- dir_ctx->duped = desc->duped;
dir_ctx->attr_gencount = desc->attr_gencount;
dir_ctx->page_index = desc->page_index;
dir_ctx->eof = desc->eof;
@@ -1304,13 +1285,13 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
if (offset != filp->f_pos) {
filp->f_pos = offset;
dir_ctx->page_index = 0;
- if (!nfs_readdir_use_cookie(filp))
+ if (!nfs_readdir_use_cookie(filp)) {
dir_ctx->dir_cookie = 0;
- else
+ dir_ctx->last_cookie = 0;
+ } else {
dir_ctx->dir_cookie = offset;
- if (offset == 0)
- memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
- dir_ctx->duped = 0;
+ dir_ctx->last_cookie = offset;
+ }
dir_ctx->eof = false;
}
spin_unlock(&filp->f_lock);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 20a4cf0acad2..42aad886d3c0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -106,11 +106,9 @@ struct nfs_open_dir_context {
unsigned long attr_gencount;
__be32 verf[NFS_DIR_VERIFIER_SIZE];
__u64 dir_cookie;
- __u64 dup_cookie;
__u64 last_cookie;
pgoff_t page_index;
unsigned int dtsize;
- signed char duped;
bool eof;
struct rcu_head rcu_head;
};
--
2.35.1

2022-03-10 04:00:56

by Benjamin Coddington

[permalink] [raw]
Subject: Re: [PATCH v9 23/27] NFS: Convert readdir page cache to use a cookie based index

On 9 Mar 2022, at 15:01, Benjamin Coddington wrote:

> On 27 Feb 2022, at 18:12, [email protected] wrote:
>
>> From: Trond Myklebust <[email protected]>
>>
>> Instead of using a linear index to address the pages, use the cookie
>> of
>> the first entry, since that is what we use to match the page anyway.
>>
>> This allows us to avoid re-reading the entire cache on a seekdir()
>> type
>> of operation. The latter is very common when re-exporting NFS, and is
>> a
>> major performance drain.
>>
>> The change does affect our duplicate cookie detection, since we can
>> no
>> longer rely on the page index as a linear offset for detecting
>> whether
>> we looped backwards. However since we no longer do a linear search
>> through all the pages on each call to nfs_readdir(), this is less of
>> a
>> concern than it was previously.
>> The other downside is that invalidate_mapping_pages() no longer can
>> use
>> the page index to avoid clearing pages that have been read. A
>> subsequent
>> patch will restore the functionality this provides to the 'ls -l'
>> heuristic.
>
> I didn't realize the approach was to also hash out the linearly-cached
> entries. I thought we'd do something like flag the context for hashed
> page
> indexes after a seekdir event, and if there are collisions with the
> linear
> entries, they'll get fixed up when found.
>
> Doesn't that mean that with this approach seekdir() only hits the same
> pages
> when the entry offset is page-aligned? That's 1 in 127 odds.
>
> It also means we're amplifying the pagecache's usage for slightly
> changing
> directories - rather than re-using the same pages we're scattering our
> usage
> across the index. Eh, maybe not a big deal if we just expect the page
> cache's LRU to do the work.

I don't have a better idea, though.. have you tested this performance?

..

maybe.. the hash divided the u64 cookie space into 262144 buckets, each
being
a page the cookie could fall into. So cookies 1 - 70368744177663 map
into
page 1.. bah. That won't work.

I was worried that I was wrong about this, but this program shows the
problem by requiring a full READDIR for each entry if we walk the
entries
one-by-one with lseek(). I don't understand how the re-export seekdir()
case is helped by this unless you're hitting the exact same offsets all
the
time.

I think that a hash of the page index for seekdir is no better than
picking
an arbitrary offset, or just using the lowest pages in the cache.

Ben

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <string.h>

#define NFSDIR "/mnt/fedora/127_dentries"

/*
 * Reproducer: populate NFSDIR with 127 files, then walk the directory one
 * entry at a time, rewinding to offset 0 and seeking back to the entry's
 * d_off after every dirent that is read.
 *
 * Returns 0 once the walk ends (end of directory or getdents error), or 1 if
 * the directory cannot be opened.
 */
int main(int argc, char **argv)
{
	/* Record layout used to parse the buffer filled by SYS_getdents. */
	struct linux_dirent {
		long d_ino;
		off_t d_off;
		unsigned short d_reclen;
		char d_name[];
	};
	struct linux_dirent *d;
	int i, fd, dir_fd, bpos, total = 0;
	/* syscall() returns long; keep it signed so the -1 check is direct */
	long nread;
	/*
	 * Deliberately tiny dirent buffer: header plus one "file_000" name.
	 * NOTE(review): presumably sized so that each getdents call returns
	 * roughly one entry - confirm against the kernel's record padding.
	 */
	int buf_size = sizeof(struct linux_dirent) + sizeof("file_000");
	char buf[buf_size];
	/*
	 * Separate, correctly sized buffer for the file names. The dirent
	 * buffer above is one byte too short for NFSDIR "/file_NNN" plus the
	 * terminating NUL, so formatting the path into it overflowed.
	 */
	char path[sizeof(NFSDIR "/file_000")];

	(void)argc;
	(void)argv;

	/* create files (best effort; they may already exist): */
	for (i = 0; i < 127; i++) {
		snprintf(path, sizeof(path), NFSDIR "/file_%03d", i);
		/* the mode must be octal: decimal 666 is a different bit pattern */
		fd = open(path, O_CREAT, 0666);
		if (fd >= 0)
			close(fd);
	}

	dir_fd = open(NFSDIR, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC);
	if (dir_fd < 0) {
		perror("cannot open dir");
		return 1;
	}

	/* no cache pls */
	posix_fadvise(dir_fd, 0, 0, POSIX_FADV_DONTNEED);

	while (1) {
		nread = syscall(SYS_getdents, dir_fd, buf, buf_size);
		if (nread == -1) {
			perror("getdents");
			break;
		}
		if (nread == 0)
			break;
		for (bpos = 0; bpos < nread;) {
			d = (struct linux_dirent *) (buf + bpos);
			printf("%s offset %llu\n", d->d_name,
			       (unsigned long long)d->d_off);

			/* rewind, then seek straight back to this entry's offset */
			lseek(dir_fd, 0, SEEK_SET);
			lseek(dir_fd, d->d_off, SEEK_SET);
			total++;
			bpos += d->d_reclen;
		}
	}
	printf("Listing 1: %d total dirents\n", total);

	close(dir_fd);
	return 0;
}

2022-03-10 04:34:44

by Benjamin Coddington

[permalink] [raw]
Subject: Re: [PATCH v9 23/27] NFS: Convert readdir page cache to use a cookie based index

On 27 Feb 2022, at 18:12, [email protected] wrote:

> From: Trond Myklebust <[email protected]>
>
> Instead of using a linear index to address the pages, use the cookie of
> the first entry, since that is what we use to match the page anyway.
>
> This allows us to avoid re-reading the entire cache on a seekdir() type
> of operation. The latter is very common when re-exporting NFS, and is a
> major performance drain.
>
> The change does affect our duplicate cookie detection, since we can no
> longer rely on the page index as a linear offset for detecting whether
> we looped backwards. However since we no longer do a linear search
> through all the pages on each call to nfs_readdir(), this is less of a
> concern than it was previously.
> The other downside is that invalidate_mapping_pages() no longer can use
> the page index to avoid clearing pages that have been read. A subsequent
> patch will restore the functionality this provides to the 'ls -l'
> heuristic.

I didn't realize the approach was to also hash out the linearly-cached
entries. I thought we'd do something like flag the context for hashed page
indexes after a seekdir event, and if there are collisions with the linear
entries, they'll get fixed up when found.

Doesn't that mean that with this approach seekdir() only hits the same pages
when the entry offset is page-aligned? That's 1 in 127 odds.

It also means we're amplifying the pagecache's usage for slightly changing
directories - rather than re-using the same pages we're scattering our usage
across the index. Eh, maybe not a big deal if we just expect the page
cache's LRU to do the work.

Ben

2022-03-11 20:44:15

by Benjamin Coddington

[permalink] [raw]
Subject: Re: [PATCH v9 23/27] NFS: Convert readdir page cache to use a cookie based index

On 11 Mar 2022, at 9:02, Trond Myklebust wrote:

> On Fri, 2022-03-11 at 06:58 -0500, Benjamin Coddington wrote:
>> On 10 Mar 2022, at 16:07, Trond Myklebust wrote:
>>
>>> On Wed, 2022-03-09 at 15:01 -0500, Benjamin Coddington wrote:
>>>> On 27 Feb 2022, at 18:12, [email protected] wrote:
>>>>
>>>>> From: Trond Myklebust <[email protected]>
>>>>>
>>>>> Instead of using a linear index to address the pages, use the
>>>>> cookie of
>>>>> the first entry, since that is what we use to match the page
>>>>> anyway.
>>>>>
>>>>> This allows us to avoid re-reading the entire cache on a
>>>>> seekdir()
>>>>> type
>>>>> of operation. The latter is very common when re-exporting NFS,
>>>>> and
>>>>> is a
>>>>> major performance drain.
>>>>>
>>>>> The change does affect our duplicate cookie detection, since we
>>>>> can
>>>>> no
>>>>> longer rely on the page index as a linear offset for detecting
>>>>> whether
>>>>> we looped backwards. However since we no longer do a linear
>>>>> search
>>>>> through all the pages on each call to nfs_readdir(), this is
>>>>> less
>>>>> of a
>>>>> concern than it was previously.
>>>>> The other downside is that invalidate_mapping_pages() no longer
>>>>> can
>>>>> use
>>>>> the page index to avoid clearing pages that have been read. A
>>>>> subsequent
>>>>> patch will restore the functionality this provides to the 'ls -
>>>>> l'
>>>>> heuristic.
>>>>
>>>> I didn't realize the approach was to also hash out the linearly-
>>>> cached
>>>> entries.  I thought we'd do something like flag the context for
>>>> hashed page
>>>> indexes after a seekdir event, and if there are collisions with
>>>> the
>>>> linear
>>>> entries, they'll get fixed up when found.
>>>
>>> Why? What's the point of using 2 models where 1 will do?
>>
>> I don't think the hashed model is quite as simple and efficient
>> overall, and
>> may produce impacts to a system beyond NFS.
>>
>>>>
>>>> Doesn't that mean that with this approach seekdir() only hits the
>>>> same pages
>>>> when the entry offset is page-aligned?  That's 1 in 127 odds.
>>>
>>> The point is not to stomp all over the pages that contain aligned
>>> data
>>> when the application does call seekdir().
>>>
>>> IOW: we always optimise for the case where we do a linear read of
>>> the
>>> directory, but we support random seekdir() + read too.
>>
>> And that could be done just by bumping the seekdir users to some
>> constant
>> offset (index 262144 ?), or something else equally dead-nuts simple. 
>> That
>> keeps tightly clustered page indexes, so walking the cache is
>> faster.  That
>> reduces the "buckshot" effect the hashing has of eating up pagecache
>> pages
>> they'll never use again.  That doesn't cap our caching ability at 33
>> million
>> entries.
>>
>
> What you say would make sense if readdir cookies truly were offsets,
> but in general they're not. Cookies are unstructured data, and should
> be treated as unstructured data.
>
> Let's say I do cache more than 33 million entries and I have to find a
> cookie. I have to linearly read through at least 1GB of cached data
> before I can give up and start a new readdir. Either that, or I need to
> have a heuristic that tells me when to stop searching, and then another
> heuristic that tells me where to store the data in a way that doesn't
> trash the page cache.
>
> With the hashing, I seek to the page matching the hash, and I either
> immediately find what I need, or I immediately know to start a readdir.
> There is no need for any additional heuristic.

The scenario where we want to find a cookie while not doing a linear pass
through the directory will be the seekdir() case. In a linear walk, we have
the cached page index to help. So in the seekdir case, the chances of
having someone already fill a page and also having the cookie be the 1 in
127 that are page-aligned (and so match an already cached page) are small, I
think. Unless your use-case will often hit the exact same offsets over and
over.

So with the hashing and seekdir case, I think that the cache will be pretty
heavily filled with the same duplicated data at various offsets and rarely
useful. That's why I wondered if you'd tested your use-case for it and found
it to be advantageous. I think what we've got is going to work fine, but I
wonder if you've seen it to work well.

The major pain point most of our users complain about is not being able to
perform a complete walk in linear time with respect to size with
invalidations at play. This series fixes that, and is a huge bonus. Other
smaller performance improvements are pale in comparison for us, and might
just get us forever chasing one or two minor optimizations that have
trade-offs.

There's a lot of variables at play. For some client/server setups (like
some low-latency RDMA), and very large directories and cache sizes, it might
be more performant to just do the READDIR every time, walking local caches
be damned.

>> It's weird to me that we're doing exactly what XArray says not to do,
>> hash
>> the index, when we don't have to.
>>
>>>> It also means we're amplifying the pagecache's usage for
>>>> slightly
>>>> changing
>>>> directories - rather than re-using the same pages we're
>>>> scattering
>>>> our usage
>>>> across the index.  Eh, maybe not a big deal if we just expect the
>>>> page
>>>> cache's LRU to do the work.
>>>>
>>>
>>> I don't understand your point about 'not reusing'. If the user
>>> seeks to
>>> the same cookie, we reuse the page. However I don't know how you
>>> would
>>> go about setting up a schema that allows you to seek to an
>>> arbitrary
>>> cookie without doing a linear search.
>>
>> So when I was talking about 'reusing' a page, that's about re-filling
>> the
>> same pages rather than constantly conjuring new ones, which requires
>> less of
>> the pagecache's resources in total.  Maybe the pagecache can handle
>> that
>> without it negatively impacting other users of the cache that /will/
>> re-use
>> their cached pages, but I worry it might be irresponsible of us to
>> fill the
>> pagecache with pages we know we're never going to find again.
>>
>
> In the case where the processes are reading linearly through a
> directory that is not changing (or at least where the beginning of the
> directory is not changing), we will reuse the cached data, because just
> like in the linearly indexed case, each process ends up reading the
> exact same sequence of cookies, and looking up the exact same sequence
> of hashes.
>
> The sequences start to diverge only if they hit a part of the directory
> that is being modified. At that point, we're going to be invalidating
> page cache entries anyway with the last reader being more likely to be
> following the new sequence of cookies.

I don't think we clean up behind ourselves anymore. Now that we are going
to validate each page before using it, we don't invalidate the whole cache
at any point. That means that a divergence duplicates the pagecache usage
beyond the divergence.

Ben

2022-03-11 20:44:38

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v9 23/27] NFS: Convert readdir page cache to use a cookie based index

On Fri, 2022-03-11 at 11:14 -0500, Benjamin Coddington wrote:
> On 11 Mar 2022, at 9:02, Trond Myklebust wrote:
>
> > On Fri, 2022-03-11 at 06:58 -0500, Benjamin Coddington wrote:
> > > On 10 Mar 2022, at 16:07, Trond Myklebust wrote:
> > >
> > > > On Wed, 2022-03-09 at 15:01 -0500, Benjamin Coddington wrote:
> > > > > On 27 Feb 2022, at 18:12, [email protected] wrote:
> > > > >
> > > > > > From: Trond Myklebust <[email protected]>
> > > > > >
> > > > > > Instead of using a linear index to address the pages, use
> > > > > > the
> > > > > > cookie of
> > > > > > the first entry, since that is what we use to match the
> > > > > > page
> > > > > > anyway.
> > > > > >
> > > > > > This allows us to avoid re-reading the entire cache on a
> > > > > > seekdir()
> > > > > > type
> > > > > > of operation. The latter is very common when re-exporting
> > > > > > NFS,
> > > > > > and
> > > > > > is a
> > > > > > major performance drain.
> > > > > >
> > > > > > The change does affect our duplicate cookie detection,
> > > > > > since we
> > > > > > can
> > > > > > no
> > > > > > longer rely on the page index as a linear offset for
> > > > > > detecting
> > > > > > whether
> > > > > > we looped backwards. However since we no longer do a linear
> > > > > > search
> > > > > > through all the pages on each call to nfs_readdir(), this
> > > > > > is
> > > > > > less
> > > > > > of a
> > > > > > concern than it was previously.
> > > > > > The other downside is that invalidate_mapping_pages() no
> > > > > > longer
> > > > > > can
> > > > > > use
> > > > > > the page index to avoid clearing pages that have been read.
> > > > > > A
> > > > > > subsequent
> > > > > > patch will restore the functionality this provides to the
> > > > > > 'ls -
> > > > > > l'
> > > > > > heuristic.
> > > > >
> > > > > I didn't realize the approach was to also hash out the
> > > > > linearly-
> > > > > cached
> > > > > entries.  I thought we'd do something like flag the context
> > > > > for
> > > > > hashed page
> > > > > indexes after a seekdir event, and if there are collisions
> > > > > with
> > > > > the
> > > > > linear
> > > > > entries, they'll get fixed up when found.
> > > >
> > > > Why? What's the point of using 2 models where 1 will do?
> > >
> > > I don't think the hashed model is quite as simple and efficient
> > > overall, and
> > > may produce impacts to a system beyond NFS.
> > >
> > > > >
> > > > > Doesn't that mean that with this approach seekdir() only hits
> > > > > the
> > > > > same pages
> > > > > when the entry offset is page-aligned?  That's 1 in 127 odds.
> > > >
> > > > The point is not to stomp all over the pages that contain
> > > > aligned
> > > > data
> > > > when the application does call seekdir().
> > > >
> > > > IOW: we always optimise for the case where we do a linear read
> > > > of
> > > > the
> > > > directory, but we support random seekdir() + read too.
> > >
> > > And that could be done just by bumping the seekdir users to some
> > > constant
> > > offset (index 262144 ?), or something else equally dead-nuts
> > > simple. 
> > > That
> > > keeps tightly clustered page indexes, so walking the cache is
> > > faster.  That
> > > reduces the "buckshot" effect the hashing has of eating up
> > > pagecache
> > > pages
> > > they'll never use again.  That doesn't cap our caching ability at
> > > 33
> > > million
> > > entries.
> > >
> >
> > What you say would make sense if readdir cookies truly were
> > offsets,
> > but in general they're not. Cookies are unstructured data, and
> > should
> > be treated as unstructured data.
> >
> > Let's say I do cache more than 33 million entries and I have to
> > find a
> > cookie. I have to linearly read through at least 1GB of cached data
> > before I can give up and start a new readdir. Either that, or I
> > need to
> > have a heuristic that tells me when to stop searching, and then
> > another
> > heuristic that tells me where to store the data in a way that
> > doesn't
> > trash the page cache.
> >
> > With the hashing, I seek to the page matching the hash, and I
> > either
> > immediately find what I need, or I immediately know to start a
> > readdir.
> > There is no need for any additional heuristic.
>
> The scenario where we want to find a cookie while not doing a linear
> pass
> through the directory will be the seekdir() case.  In a linear walk,
> we have
> the cached page index to help.  So in the seekdir case, the chances
> of
> having someone already fill a page and also having the cookie be the
> 1 in
> 127 that are page-aligned (and so match an already cached page) are
> small, I
> think.  Unless your use-case will often hit the exact same offsets
> over and
> over.

For the use case where we are reexporting NFS, it can definitely
happen.
Firstly, the clients usually are reading the reexported directory
linearly, so they will tend to follow the same cookie request patterns.
Secondly, we're not going to replay the readdir from the duplicate
reply cache if the client resends the request. So even if you only have
one client, there can be a benefit.

>
> So with the hashing and seekdir case, I think that the cache will be
> pretty
> heavily filled with the same duplicated data at various offsets and
> rarely
> useful.  That's why I wondered if you'd tested your use-case for it
> and found
> it to be advantageous.  I think what we've got is going to work fine,
> but I
> wonder if you've seen it to work well.
>
> The major pain point most of our users complain about is not being
> able to
> perform a complete walk in linear time with respect to size with
> invalidations at play.  This series fixes that, and is a huge bonus. 
> Other
> smaller performance improvements are pale in comparison for us, and
> might
> just get us forever chasing one or two minor optimizations that have
> trade-offs.
>
> There's a lot of variables at play.  For some client/server setups
> (like
> some low-latency RDMA), and very large directories and cache sizes,
> it might
> be more performant to just do the READDIR every time, walking local
> caches
> be damned.
>

Sure, so a dedicated readdirplus() system call could help by providing
the same kind of guidance that statx() does today.

> > > It's weird to me that we're doing exactly what XArray says not to
> > > do,
> > > hash
> > > the index, when we don't have to.
> > >
> > > > > It also means we're amplifying the pagecache's usage for
> > > > > slightly
> > > > > changing
> > > > > directories - rather than re-using the same pages we're
> > > > > scattering
> > > > > our usage
> > > > > across the index.  Eh, maybe not a big deal if we just expect
> > > > > the
> > > > > page
> > > > > cache's LRU to do the work.
> > > > >
> > > >
> > > > I don't understand your point about 'not reusing'. If the user
> > > > seeks to
> > > > the same cookie, we reuse the page. However I don't know how
> > > > you
> > > > would
> > > > go about setting up a schema that allows you to seek to an
> > > > arbitrary
> > > > cookie without doing a linear search.
> > >
> > > So when I was talking about 'reusing' a page, that's about re-
> > > filling
> > > the
> > > same pages rather than constantly conjuring new ones, which
> > > requires
> > > less of
> > > the pagecache's resources in total.  Maybe the pagecache can
> > > handle
> > > that
> > > without it negatively impacting other users of the cache that
> > > /will/
> > > re-use
> > > their cached pages, but I worry it might be irresponsible of us
> > > to
> > > fill the
> > > pagecache with pages we know we're never going to find again.
> > >
> >
> > In the case where the processes are reading linearly through a
> > directory that is not changing (or at least where the beginning of
> > the
> > directory is not changing), we will reuse the cached data, because
> > just
> > like in the linearly indexed case, each process ends up reading the
> > exact same sequence of cookies, and looking up the exact same
> > sequence
> > of hashes.
> >
> > The sequences start to diverge only if they hit a part of the
> > directory
> > that is being modified. At that point, we're going to be
> > invalidating
> > page cache entries anyway with the last reader being more likely to
> > be
> > following the new sequence of cookies.
>
> I don't think we clean up behind ourselves anymore.  Now that we are
> going
> to validate each page before using it, we don't invalidate the whole
> cache
> at any point.  That means that a divergence duplicates the pagecache
> usage
> beyond the divergence.
>

No. I reinstated the call to nfs_revalidate_mapping() in the linux-next
branch after Olga demonstrated that NFSv3 is still troubled with crappy
mtime/ctime resolutions on the server causing directory changes to not
be reflected in the readdir cache.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2022-03-11 20:50:40

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v9 23/27] NFS: Convert readdir page cache to use a cookie based index

On Wed, 2022-03-09 at 15:01 -0500, Benjamin Coddington wrote:
> On 27 Feb 2022, at 18:12, [email protected] wrote:
>
> > From: Trond Myklebust <[email protected]>
> >
> > Instead of using a linear index to address the pages, use the
> > cookie of
> > the first entry, since that is what we use to match the page
> > anyway.
> >
> > This allows us to avoid re-reading the entire cache on a seekdir()
> > type
> > of operation. The latter is very common when re-exporting NFS, and
> > is a
> > major performance drain.
> >
> > The change does affect our duplicate cookie detection, since we can
> > no
> > longer rely on the page index as a linear offset for detecting
> > whether
> > we looped backwards. However since we no longer do a linear search
> > through all the pages on each call to nfs_readdir(), this is less
> > of a
> > concern than it was previously.
> > The other downside is that invalidate_mapping_pages() no longer can
> > use
> > the page index to avoid clearing pages that have been read. A
> > subsequent
> > patch will restore the functionality this provides to the 'ls -l'
> > heuristic.
>
> I didn't realize the approach was to also hash out the linearly-
> cached
> entries.  I thought we'd do something like flag the context for
> hashed page
> indexes after a seekdir event, and if there are collisions with the
> linear
> entries, they'll get fixed up when found.

Why? What's the point of using 2 models where 1 will do?

>
> Doesn't that mean that with this approach seekdir() only hits the
> same pages
> when the entry offset is page-aligned?  That's 1 in 127 odds.

The point is not to stomp all over the pages that contain aligned data
when the application does call seekdir().

IOW: we always optimise for the case where we do a linear read of the
directory, but we support random seekdir() + read too.

>
> It also means we're amplifying the pagecache's usage for slightly
> changing
> directories - rather than re-using the same pages we're scattering
> our usage
> across the index.  Eh, maybe not a big deal if we just expect the
> page
> cache's LRU to do the work.
>

I don't understand your point about 'not reusing'. If the user seeks to
the same cookie, we reuse the page. However I don't know how you would
go about setting up a schema that allows you to seek to an arbitrary
cookie without doing a linear search.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2022-03-11 21:05:40

by Benjamin Coddington

[permalink] [raw]
Subject: Re: [PATCH v9 23/27] NFS: Convert readdir page cache to use a cookie based index

On 10 Mar 2022, at 16:07, Trond Myklebust wrote:

> On Wed, 2022-03-09 at 15:01 -0500, Benjamin Coddington wrote:
>> On 27 Feb 2022, at 18:12, [email protected] wrote:
>>
>>> From: Trond Myklebust <[email protected]>
>>>
>>> Instead of using a linear index to address the pages, use the
>>> cookie of
>>> the first entry, since that is what we use to match the page
>>> anyway.
>>>
>>> This allows us to avoid re-reading the entire cache on a seekdir()
>>> type
>>> of operation. The latter is very common when re-exporting NFS, and
>>> is a
>>> major performance drain.
>>>
>>> The change does affect our duplicate cookie detection, since we can
>>> no
>>> longer rely on the page index as a linear offset for detecting
>>> whether
>>> we looped backwards. However since we no longer do a linear search
>>> through all the pages on each call to nfs_readdir(), this is less
>>> of a
>>> concern than it was previously.
>>> The other downside is that invalidate_mapping_pages() no longer can
>>> use
>>> the page index to avoid clearing pages that have been read. A
>>> subsequent
>>> patch will restore the functionality this provides to the 'ls -l'
>>> heuristic.
>>
>> I didn't realize the approach was to also hash out the linearly-
>> cached
>> entries.  I thought we'd do something like flag the context for
>> hashed page
>> indexes after a seekdir event, and if there are collisions with the
>> linear
>> entries, they'll get fixed up when found.
>
> Why? What's the point of using 2 models where 1 will do?

I don't think the hashed model is quite as simple and efficient overall, and
may produce impacts to a system beyond NFS.

>>
>> Doesn't that mean that with this approach seekdir() only hits the
>> same pages
>> when the entry offset is page-aligned?  That's 1 in 127 odds.
>
> The point is not to stomp all over the pages that contain aligned data
> when the application does call seekdir().
>
> IOW: we always optimise for the case where we do a linear read of the
> directory, but we support random seekdir() + read too.

And that could be done just by bumping the seekdir users to some constant
offset (index 262144 ?), or something else equally dead-nuts simple. That
keeps tightly clustered page indexes, so walking the cache is faster. That
reduces the "buckshot" effect the hashing has of eating up pagecache pages
they'll never use again. That doesn't cap our caching ability at 33 million
entries.

It's weird to me that we're doing exactly what XArray says not to do, hash
the index, when we don't have to.

>> It also means we're amplifying the pagecache's usage for slightly
>> changing
>> directories - rather than re-using the same pages we're scattering
>> our usage
>> across the index.  Eh, maybe not a big deal if we just expect the
>> page
>> cache's LRU to do the work.
>>
>
> I don't understand your point about 'not reusing'. If the user seeks to
> the same cookie, we reuse the page. However I don't know how you would
> go about setting up a schema that allows you to seek to an arbitrary
> cookie without doing a linear search.

So when I was talking about 'reusing' a page, that's about re-filling the
same pages rather than constantly conjuring new ones, which requires less of
the pagecache's resources in total. Maybe the pagecache can handle that
without it negatively impacting other users of the cache that /will/ re-use
their cached pages, but I worry it might be irresponsible of us to fill the
pagecache with pages we know we're never going to find again.

Ben

2022-03-11 21:50:07

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v9 23/27] NFS: Convert readdir page cache to use a cookie based index

On Fri, 2022-03-11 at 06:58 -0500, Benjamin Coddington wrote:
> On 10 Mar 2022, at 16:07, Trond Myklebust wrote:
>
> > On Wed, 2022-03-09 at 15:01 -0500, Benjamin Coddington wrote:
> > > On 27 Feb 2022, at 18:12, [email protected] wrote:
> > >
> > > > From: Trond Myklebust <[email protected]>
> > > >
> > > > Instead of using a linear index to address the pages, use the
> > > > cookie of
> > > > the first entry, since that is what we use to match the page
> > > > anyway.
> > > >
> > > > This allows us to avoid re-reading the entire cache on a
> > > > seekdir()
> > > > type
> > > > of operation. The latter is very common when re-exporting NFS,
> > > > and
> > > > is a
> > > > major performance drain.
> > > >
> > > > The change does affect our duplicate cookie detection, since we
> > > > can
> > > > no
> > > > longer rely on the page index as a linear offset for detecting
> > > > whether
> > > > we looped backwards. However since we no longer do a linear
> > > > search
> > > > through all the pages on each call to nfs_readdir(), this is
> > > > less
> > > > of a
> > > > concern than it was previously.
> > > > The other downside is that invalidate_mapping_pages() no longer
> > > > can
> > > > use
> > > > the page index to avoid clearing pages that have been read. A
> > > > subsequent
> > > > patch will restore the functionality this provides to the 'ls -
> > > > l'
> > > > heuristic.
> > >
> > > I didn't realize the approach was to also hash out the linearly-
> > > cached
> > > entries.  I thought we'd do something like flag the context for
> > > hashed page
> > > indexes after a seekdir event, and if there are collisions with
> > > the
> > > linear
> > > entries, they'll get fixed up when found.
> >
> > Why? What's the point of using 2 models where 1 will do?
>
> I don't think the hashed model is quite as simple and efficient
> overall, and
> may produce impacts to a system beyond NFS.
>
> > >
> > > Doesn't that mean that with this approach seekdir() only hits the
> > > same pages
> > > when the entry offset is page-aligned?  That's 1 in 127 odds.
> >
> > The point is not to stomp all over the pages that contain aligned
> > data
> > when the application does call seekdir().
> >
> > IOW: we always optimise for the case where we do a linear read of
> > the
> > directory, but we support random seekdir() + read too.
>
> And that could be done just by bumping the seekdir users to some
> constant
> offset (index 262144 ?), or something else equally dead-nuts simple. 
> That
> keeps tightly clustered page indexes, so walking the cache is
> faster.  That
> reduces the "buckshot" effect the hashing has of eating up pagecache
> pages
> they'll never use again.  That doesn't cap our caching ability at 33
> million
> entries.
>

What you say would make sense if readdir cookies truly were offsets,
but in general they're not. Cookies are unstructured data, and should
be treated as unstructured data.

Let's say I do cache more than 33 million entries and I have to find a
cookie. I have to linearly read through at least 1GB of cached data
before I can give up and start a new readdir. Either that, or I need to
have a heuristic that tells me when to stop searching, and then another
heuristic that tells me where to store the data in a way that doesn't
trash the page cache.

With the hashing, I seek to the page matching the hash, and I either
immediately find what I need, or I immediately know to start a readdir.
There is no need for any additional heuristic.

> It's weird to me that we're doing exactly what XArray says not to do,
> hash
> the index, when we don't have to.
>
> > > It also means we're amplifying the pagecache's usage for
> > > slightly
> > > changing
> > > directories - rather than re-using the same pages we're
> > > scattering
> > > our usage
> > > across the index.  Eh, maybe not a big deal if we just expect the
> > > page
> > > cache's LRU to do the work.
> > >
> >
> > I don't understand your point about 'not reusing'. If the user
> > seeks to
> > the same cookie, we reuse the page. However I don't know how you
> > would
> > go about setting up a schema that allows you to seek to an
> > arbitrary
> > cookie without doing a linear search.
>
> So when I was talking about 'reusing' a page, that's about re-filling
> the
> same pages rather than constantly conjuring new ones, which requires
> less of
> the pagecache's resources in total.  Maybe the pagecache can handle
> that
> without it negatively impacting other users of the cache that /will/
> re-use
> their cached pages, but I worry it might be irresponsible of us to
> fill the
> pagecache with pages we know we're never going to find again.
>

In the case where the processes are reading linearly through a
directory that is not changing (or at least where the beginning of the
directory is not changing), we will reuse the cached data, because just
like in the linearly indexed case, each process ends up reading the
exact same sequence of cookies, and looking up the exact same sequence
of hashes.

The sequences start to diverge only if they hit a part of the directory
that is being modified. At that point, we're going to be invalidating
page cache entries anyway with the last reader being more likely to be
following the new sequence of cookies.

The hashed indexing does come with a cost, thanks to XArray, but that
cost is limited to a max of 8MB with the current scheme.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]