2022-02-23 21:27:49

by Trond Myklebust

Subject: [PATCH v7 18/21] NFS: Trace effects of the readdirplus heuristic

From: Trond Myklebust <[email protected]>

Enable tracking of when the readdirplus heuristic causes a page cache
invalidation.
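
For example, an event line would render as follows (hypothetical field
values; the format string itself comes from the TP_printk() below):

  nfs_readdir_invalidate_cache_range: fileid=00:2a:1054 fhandle=0x8bb70d45 version=1774 range=[4096, 9223372036854775807]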

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 11 ++++++++++-
fs/nfs/nfstrace.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 95b18d1ad0cf..06bd612296d5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -985,6 +985,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
if (res == -EBADCOOKIE || res == -ENOTSYNC) {
invalidate_inode_pages2(desc->file->f_mapping);
desc->page_index = 0;
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, 0, MAX_LFS_FILESIZE);
return -EAGAIN;
}
return res;
@@ -999,6 +1001,9 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
invalidate_inode_pages2_range(desc->file->f_mapping,
desc->page_index_max + 1,
-1);
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, desc->page_index_max + 1,
+ MAX_LFS_FILESIZE);
}
}
res = nfs_readdir_search_array(desc);
@@ -1145,7 +1150,11 @@ static void nfs_readdir_handle_cache_misses(struct inode *inode,
if (desc->ctx->pos == 0 ||
cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD)
return;
- invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+ if (invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1) == 0)
+ return;
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, (loff_t)(page_index + 1) << PAGE_SHIFT,
+ MAX_LFS_FILESIZE);
}

/* The file offset position represents the dirent entry number. A
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 7c1102b991d0..ec2645d20abf 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -273,6 +273,56 @@ DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
DEFINE_NFS_UPDATE_SIZE_EVENT(update);
DEFINE_NFS_UPDATE_SIZE_EVENT(grow);

+DECLARE_EVENT_CLASS(nfs_inode_range_event,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t range_start,
+ loff_t range_end
+ ),
+
+ TP_ARGS(inode, range_start, range_end),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, range_start)
+ __field(loff_t, range_end)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->fileid = nfsi->fileid;
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->range_start = range_start;
+ __entry->range_end = range_end;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "range=[%lld, %lld]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->range_start, __entry->range_end
+ )
+);
+
+#define DEFINE_NFS_INODE_RANGE_EVENT(name) \
+ DEFINE_EVENT(nfs_inode_range_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ loff_t range_start, \
+ loff_t range_end \
+ ), \
+ TP_ARGS(inode, range_start, range_end))
+
+DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range);
+
DECLARE_EVENT_CLASS(nfs_readdir_event,
TP_PROTO(
const struct file *file,
--
2.35.1


2022-02-24 01:42:05

by Trond Myklebust

Subject: [PATCH v7 19/21] NFS: Convert readdir page cache to use a cookie based index

From: Trond Myklebust <[email protected]>

Instead of using a linear index to address the pages, use the cookie of
the first entry, since that is what we use to match the page anyway.

This allows us to avoid re-reading the entire cache on a seekdir() type
of operation. The latter is very common when re-exporting NFS, and is a
major performance drain.

The change does affect our duplicate cookie detection, since we can no
longer rely on the page index as a linear offset for detecting whether
we looped backwards. However since we no longer do a linear search
through all the pages on each call to nfs_readdir(), this is less of a
concern than it was previously.
The other downside is that invalidate_mapping_pages() no longer can use
the page index to avoid clearing pages that have been read. A subsequent
patch will restore the functionality this provides to the 'ls -l'
heuristic.
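
In outline (condensed from the diff below), the page cache index of a
readdir page is now derived from the cookie of its first entry:

static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
{
	if (cookie == 0)
		return 0;
	return xxhash(&cookie, sizeof(cookie), 0);
}

so a seekdir() to a cookie that is already cached costs a single page
cache lookup instead of a linear re-read of every page in the directory.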

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 99 +++++++++++++++---------------------------
include/linux/nfs_fs.h | 2 -
2 files changed, 34 insertions(+), 67 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06bd612296d5..2007eebfb5cf 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -39,6 +39,7 @@
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/xattr.h>
+#include <linux/xxhash.h>

#include "delegation.h"
#include "iostat.h"
@@ -158,9 +159,7 @@ struct nfs_readdir_descriptor {
pgoff_t page_index_max;
u64 dir_cookie;
u64 last_cookie;
- u64 dup_cookie;
loff_t current_index;
- loff_t prev_index;

__be32 verf[NFS_DIR_VERIFIER_SIZE];
unsigned long dir_verifier;
@@ -170,7 +169,6 @@ struct nfs_readdir_descriptor {
unsigned int cache_entry_index;
unsigned int buffer_fills;
unsigned int dtsize;
- signed char duped;
bool plus;
bool eob;
bool eof;
@@ -333,6 +331,13 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
return ret;
}

+static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
+{
+ if (cookie == 0)
+ return 0;
+ return xxhash(&cookie, sizeof(cookie), 0);
+}
+
static bool nfs_readdir_page_cookie_match(struct page *page, u64 last_cookie,
u64 change_attr)
{
@@ -354,8 +359,9 @@ static void nfs_readdir_page_unlock_and_put(struct page *page)
}

static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
- pgoff_t index, u64 last_cookie)
+ u64 last_cookie)
{
+ pgoff_t index = nfs_readdir_page_cookie_hash(last_cookie);
struct page *page;
u64 change_attr;

@@ -374,11 +380,6 @@ static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
return page;
}

-static loff_t nfs_readdir_page_offset(struct page *page)
-{
- return (loff_t)page->index * (loff_t)nfs_readdir_array_maxentries();
-}
-
static u64 nfs_readdir_page_last_cookie(struct page *page)
{
struct nfs_cache_array *array;
@@ -411,11 +412,11 @@ static void nfs_readdir_page_set_eof(struct page *page)
}

static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
- pgoff_t index, u64 cookie)
+ u64 cookie)
{
struct page *page;

- page = nfs_readdir_page_get_locked(mapping, index, cookie);
+ page = nfs_readdir_page_get_locked(mapping, cookie);
if (page) {
if (nfs_readdir_page_last_cookie(page) == cookie)
return page;
@@ -443,6 +444,13 @@ bool nfs_readdir_use_cookie(const struct file *filp)
return true;
}

+static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)
+{
+ desc->current_index = 0;
+ desc->last_cookie = 0;
+ desc->page_index = 0;
+}
+
static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
struct nfs_readdir_descriptor *desc)
{
@@ -491,32 +499,11 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,

for (i = 0; i < array->size; i++) {
if (array->array[i].cookie == desc->dir_cookie) {
- struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
-
- new_pos = nfs_readdir_page_offset(desc->page) + i;
- if (desc->attr_gencount != nfsi->attr_gencount) {
- desc->duped = 0;
- desc->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->prev_index) {
- if (desc->duped > 0
- && desc->dup_cookie == desc->dir_cookie) {
- if (printk_ratelimit()) {
- pr_notice("NFS: directory %pD2 contains a readdir loop."
- "Please contact your server vendor. "
- "The file: %s has duplicate cookie %llu\n",
- desc->file, array->array[i].name, desc->dir_cookie);
- }
- status = -ELOOP;
- goto out;
- }
- desc->dup_cookie = desc->dir_cookie;
- desc->duped = -1;
- }
+ new_pos = desc->current_index + i;
if (nfs_readdir_use_cookie(desc->file))
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos = new_pos;
- desc->prev_index = new_pos;
desc->cache_entry_index = i;
return 0;
}
@@ -527,7 +514,6 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
if (desc->dir_cookie == array->last_cookie)
desc->eof = true;
}
-out:
return status;
}

@@ -820,18 +806,16 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
break;
arrays++;
*arrays = page = new;
- desc->page_index_max++;
} else {
new = nfs_readdir_page_get_next(mapping,
- page->index + 1,
entry->prev_cookie);
if (!new)
break;
if (page != *arrays)
nfs_readdir_page_unlock_and_put(page);
page = new;
- desc->page_index_max = new->index;
}
+ desc->page_index_max++;
status = nfs_readdir_add_to_array(entry, page);
} while (!status && !entry->eof);

@@ -954,7 +938,6 @@ static struct page *
nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
{
return nfs_readdir_page_get_locked(desc->file->f_mapping,
- desc->page_index,
desc->last_cookie);
}

@@ -984,7 +967,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
trace_nfs_readdir_cache_fill_done(inode, res);
if (res == -EBADCOOKIE || res == -ENOTSYNC) {
invalidate_inode_pages2(desc->file->f_mapping);
- desc->page_index = 0;
+ nfs_readdir_rewind_search(desc);
trace_nfs_readdir_invalidate_cache_range(
inode, 0, MAX_LFS_FILESIZE);
return -EAGAIN;
@@ -998,12 +981,10 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) {
memcpy(nfsi->cookieverf, verf,
sizeof(nfsi->cookieverf));
- invalidate_inode_pages2_range(desc->file->f_mapping,
- desc->page_index_max + 1,
+ invalidate_inode_pages2_range(desc->file->f_mapping, 1,
-1);
trace_nfs_readdir_invalidate_cache_range(
- inode, desc->page_index_max + 1,
- MAX_LFS_FILESIZE);
+ inode, 1, MAX_LFS_FILESIZE);
}
}
res = nfs_readdir_search_array(desc);
@@ -1019,11 +1000,6 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
int res;

do {
- if (desc->page_index == 0) {
- desc->current_index = 0;
- desc->prev_index = 0;
- desc->last_cookie = 0;
- }
res = find_and_lock_cache_page(desc);
} while (res == -EAGAIN);
return res;
@@ -1058,8 +1034,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos++;
- if (desc->duped != 0)
- desc->duped = 1;
}
if (array->page_is_eof)
desc->eof = !desc->eob;
@@ -1101,7 +1075,6 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
desc->page_index = 0;
desc->cache_entry_index = 0;
desc->last_cookie = desc->dir_cookie;
- desc->duped = 0;
desc->page_index_max = 0;

trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
@@ -1134,6 +1107,8 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
for (i = 0; i < sz && arrays[i]; i++)
nfs_readdir_page_array_free(arrays[i]);
out:
+ if (!nfs_readdir_use_cookie(desc->file))
+ nfs_readdir_rewind_search(desc);
desc->page_index_max = -1;
kfree(arrays);
dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
@@ -1144,17 +1119,14 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)

static void nfs_readdir_handle_cache_misses(struct inode *inode,
struct nfs_readdir_descriptor *desc,
- pgoff_t page_index,
unsigned int cache_misses)
{
if (desc->ctx->pos == 0 ||
cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD)
return;
- if (invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1) == 0)
+ if (invalidate_mapping_pages(inode->i_mapping, 0, -1) == 0)
return;
- trace_nfs_readdir_invalidate_cache_range(
- inode, (loff_t)(page_index + 1) << PAGE_SHIFT,
- MAX_LFS_FILESIZE);
+ trace_nfs_readdir_invalidate_cache_range(inode, 0, MAX_LFS_FILESIZE);
}

/* The file offset position represents the dirent entry number. A
@@ -1194,8 +1166,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)

spin_lock(&file->f_lock);
desc->dir_cookie = dir_ctx->dir_cookie;
- desc->dup_cookie = dir_ctx->dup_cookie;
- desc->duped = dir_ctx->duped;
page_index = dir_ctx->page_index;
desc->page_index = page_index;
desc->last_cookie = dir_ctx->last_cookie;
@@ -1213,7 +1183,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
}

desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
- nfs_readdir_handle_cache_misses(inode, desc, page_index, cache_misses);
+ nfs_readdir_handle_cache_misses(inode, desc, cache_misses);

do {
res = readdir_search_pagecache(desc);
@@ -1233,7 +1203,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
}
if (res == -ETOOSMALL && desc->plus) {
nfs_zap_caches(inode);
- desc->page_index = 0;
desc->plus = false;
desc->eof = false;
continue;
@@ -1252,9 +1221,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)

spin_lock(&file->f_lock);
dir_ctx->dir_cookie = desc->dir_cookie;
- dir_ctx->dup_cookie = desc->dup_cookie;
dir_ctx->last_cookie = desc->last_cookie;
- dir_ctx->duped = desc->duped;
dir_ctx->attr_gencount = desc->attr_gencount;
dir_ctx->page_index = desc->page_index;
dir_ctx->eof = desc->eof;
@@ -1297,13 +1264,15 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
if (offset != filp->f_pos) {
filp->f_pos = offset;
dir_ctx->page_index = 0;
- if (!nfs_readdir_use_cookie(filp))
+ if (!nfs_readdir_use_cookie(filp)) {
dir_ctx->dir_cookie = 0;
- else
+ dir_ctx->last_cookie = 0;
+ } else {
dir_ctx->dir_cookie = offset;
+ dir_ctx->last_cookie = offset;
+ }
if (offset == 0)
memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
- dir_ctx->duped = 0;
dir_ctx->eof = false;
}
spin_unlock(&filp->f_lock);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 20a4cf0acad2..42aad886d3c0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -106,11 +106,9 @@ struct nfs_open_dir_context {
unsigned long attr_gencount;
__be32 verf[NFS_DIR_VERIFIER_SIZE];
__u64 dir_cookie;
- __u64 dup_cookie;
__u64 last_cookie;
pgoff_t page_index;
unsigned int dtsize;
- signed char duped;
bool eof;
struct rcu_head rcu_head;
};
--
2.35.1

2022-02-24 17:57:34

by Benjamin Coddington

Subject: Re: [PATCH v7 19/21] NFS: Convert readdir page cache to use a cookie based index

On 23 Feb 2022, at 16:13, [email protected] wrote:

> From: Trond Myklebust <[email protected]>
>
> Instead of using a linear index to address the pages, use the cookie of
> the first entry, since that is what we use to match the page anyway.
>
> This allows us to avoid re-reading the entire cache on a seekdir() type
> of operation. The latter is very common when re-exporting NFS, and is a
> major performance drain.
>
> The change does affect our duplicate cookie detection, since we can no
> longer rely on the page index as a linear offset for detecting whether
> we looped backwards. However since we no longer do a linear search
> through all the pages on each call to nfs_readdir(), this is less of a
> concern than it was previously.
> The other downside is that invalidate_mapping_pages() no longer can use
> the page index to avoid clearing pages that have been read. A subsequent
> patch will restore the functionality this provides to the 'ls -l'
> heuristic.

This is cool, but one reason I did not explore this was that the page cache
index uses XArray, which is optimized for densely clustered indexes. This
particular sentence in the documentation was enough to scare me away:

"The XArray implementation is efficient when the indices used are densely
clustered; hashing the object and using the hash as the index will not
perform well."

However, the "not perform well" may be orders of magnitude smaller than
anything like RPC. Do you have concerns about this?

Another option might be to flag the context after a seekdir, which would
trigger a shift in the page_index or "turn on" hashed indexes, however
that's really only going to improve the re-export case with v4 or cached
fds.

Or maybe the /first/ seekdir on a context sets its own offset into the
pagecache - that could be a hash, and pages are filled from there.

Hmm..

Ben

2022-02-25 04:47:48

by Trond Myklebust

Subject: Re: [PATCH v7 19/21] NFS: Convert readdir page cache to use a cookie based index

On Thu, 2022-02-24 at 12:31 -0500, Benjamin Coddington wrote:
> On 23 Feb 2022, at 16:13, [email protected] wrote:
>
> > From: Trond Myklebust <[email protected]>
> >
> > Instead of using a linear index to address the pages, use the cookie of
> > the first entry, since that is what we use to match the page anyway.
> >
> > This allows us to avoid re-reading the entire cache on a seekdir() type
> > of operation. The latter is very common when re-exporting NFS, and is a
> > major performance drain.
> >
> > The change does affect our duplicate cookie detection, since we can no
> > longer rely on the page index as a linear offset for detecting whether
> > we looped backwards. However since we no longer do a linear search
> > through all the pages on each call to nfs_readdir(), this is less of a
> > concern than it was previously.
> > The other downside is that invalidate_mapping_pages() no longer can use
> > the page index to avoid clearing pages that have been read. A subsequent
> > patch will restore the functionality this provides to the 'ls -l'
> > heuristic.
>
> This is cool, but one reason I did not explore this was that the page
> cache index uses XArray, which is optimized for densely clustered
> indexes. This particular sentence in the documentation was enough to
> scare me away:
>
> "The XArray implementation is efficient when the indices used are
> densely clustered; hashing the object and using the hash as the index
> will not perform well."
>
> However, the "not perform well" may be orders of magnitude smaller
> than anything like RPC. Do you have concerns about this?

What is the difference between this workload and a random access
database workload?

If the XArray is incapable of dealing with random access, then we
should never have chosen it for the page cache. I'm therefore assuming
that either the above comment is referring to micro-optimisations that
don't matter much with these workloads, or else that the plan is to
replace the XArray with something more appropriate for a page cache
workload.


>
> Another option might be to flag the context after a seekdir, which
> would trigger a shift in the page_index or "turn on" hashed indexes,
> however that's really only going to improve the re-export case with
> v4 or cached fds.
>
> Or maybe the /first/ seekdir on a context sets its own offset into
> the pagecache - that could be a hash, and pages are filled from there.
>

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2022-02-25 05:23:19

by Trond Myklebust

Subject: Re: [PATCH v7 19/21] NFS: Convert readdir page cache to use a cookie based index

On Fri, 2022-02-25 at 14:17 +1100, NeilBrown wrote:
> On Fri, 25 Feb 2022, Trond Myklebust wrote:
> > On Thu, 2022-02-24 at 12:31 -0500, Benjamin Coddington wrote:
> > >
> > > "The XArray implementation is efficient when the indices used are
> > > densely clustered; hashing the object and using the hash as the
> > > index will not perform well."
> > >
> > > However, the "not perform well" may be orders of magnitude smaller
> > > than anything like RPC. Do you have concerns about this?
> >
> > What is the difference between this workload and a random access
> > database workload?
>
> Probably the range of expected addresses.
> If I understand the proposal correctly, the page addresses in this
> workload could be any 64bit number.
> For a large database, it would be at most 52 bits (assuming 64bits
> worth of bytes), and very likely substantially smaller - maybe 40 bits
> for a really really big database.
>
> >
> > If the XArray is incapable of dealing with random access, then we
> > should never have chosen it for the page cache. I'm therefore
> > assuming that either the above comment is referring to
> > micro-optimisations that don't matter much with these workloads, or
> > else that the plan is to replace the XArray with something more
> > appropriate for a page cache workload.
>
> I haven't looked at the code recently so this might not be 100%
> accurate, but XArray generally assumes that pages are often adjacent.
> They don't have to be, but there is a cost.
> It uses a multi-level array with 9 bits per level. At each level there
> are a whole number of pages for indexes to the next level.
>
> If there are two entries that are 2^45 apart, that is 5 levels of
> indexing that cannot be shared. So the path to one entry is 5 pages,
> each of which contains a single pointer. The path to the other entry
> is a separate set of 5 pages.
>
> So worst case, the index would be about 64/9 or 7 times the size of
> the data. As the number of data pages increases, this would shrink
> slightly, but I suspect you wouldn't get below a factor of 3 before
> you fill up all of your memory.
>


If the problem is just the range, then that is trivial to fix: we can
just use xxhash32(), and take the hit of more collisions. However if
the problem is the access pattern, then I have serious questions about
the choice of implementation for the page cache. If the cache can't
support file random access, then we're barking up the wrong tree on the
wrong continent.

Either way, I see avoiding linear searches for cookies as a benefit
that is worth pursuing.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2022-02-25 06:27:52

by NeilBrown

Subject: Re: [PATCH v7 19/21] NFS: Convert readdir page cache to use a cookie based index

On Fri, 25 Feb 2022, Trond Myklebust wrote:
> On Thu, 2022-02-24 at 12:31 -0500, Benjamin Coddington wrote:
> >
> > "The XArray implementation is efficient when the indices used are
> > densely
> > clustered; hashing the object and using the hash as the index will
> > not
> > perform well."
> >
> > However, the "not perform well" may be orders of magnitude smaller
> > than
> > anthing like RPC.  Do you have concerns about this?
>
> What is the difference between this workload and a random access
> database workload?

Probably the range of expected addresses.
If I understand the proposal correctly, the page addresses in this
workload could be any 64bit number.
For a large database, it would be at most 52 bits (assuming 64bits worth
of bytes), and very likely substantially smaller - maybe 40 bits for a
really really big database.

>
> If the XArray is incapable of dealing with random access, then we
> should never have chosen it for the page cache. I'm therefore assuming
> that either the above comment is referring to micro-optimisations that
> don't matter much with these workloads, or else that the plan is to
> replace the XArray with something more appropriate for a page cache
> workload.

I haven't looked at the code recently so this might not be 100%
accurate, but XArray generally assumes that pages are often adjacent.
They don't have to be, but there is a cost.
It uses a multi-level array with 9 bits per level. At each level there
are a whole number of pages for indexes to the next level.

If there are two entries that are 2^45 apart, that is 5 levels of
indexing that cannot be shared. So the path to one entry is 5 pages,
each of which contains a single pointer. The path to the other entry is
a separate set of 5 pages.

So worst case, the index would be about 64/9 or 7 times the size of the
data. As the number of data pages increases, this would shrink
slightly, but I suspect you wouldn't get below a factor of 3 before you
fill up all of your memory.
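
(Spelling that arithmetic out: with 9 bits per level, a full 64-bit
index space needs ceil(64/9) = 8 levels, two indices 2^45 apart diverge
for the bottom 45/9 = 5 of them, and in the fully sparse worst case each
data page therefore drags in about 64/9, i.e. roughly 7, private index
pages.)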

NeilBrown

2022-02-25 14:03:57

by Benjamin Coddington

Subject: Re: [PATCH v7 19/21] NFS: Convert readdir page cache to use a cookie based index

On 24 Feb 2022, at 23:25, Trond Myklebust wrote:

> On Fri, 2022-02-25 at 14:17 +1100, NeilBrown wrote:

>> I haven't looked at the code recently so this might not be 100% accurate,
>> but XArray generally assumes that pages are often adjacent. They don't
>> have to be, but there is a cost. It uses a multi-level array with 9 bits
>> per level.  At each level there are a whole number of pages for indexes
>> to the next level.
>>
>> If there are two entries that are 2^45 apart, that is 5 levels of
>> indexing that cannot be shared.  So the path to one entry is 5 pages,
>> each of which contains a single pointer.  The path to the other entry is
>> a separate set of 5 pages.
>>
>> So worst case, the index would be about 64/9 or 7 times the size of the
>> data.  As the number of data pages increases, this would shrink slightly,
>> but I suspect you wouldn't get below a factor of 3 before you fill up all
>> of your memory.

Yikes!

> If the problem is just the range, then that is trivial to fix: we can
> just use xxhash32(), and take the hit of more collisions. However if
> the problem is the access pattern, then I have serious questions about
> the choice of implementation for the page cache. If the cache can't
> support file random access, then we're barking up the wrong tree on the
> wrong continent.

I'm guessing the issue might be "get next", which for an "array" is probably
the operation tested for "perform well". We're not doing any of that; we're
directly addressing pages with our hashed index.

> Either way, I see avoiding linear searches for cookies as a benefit
> that is worth pursuing.

Me too. What about just kicking the seekdir users up into the second half
of the index, to use xxhash32() up there? Everyone else can hang out in the
bottom half with dense indexes and help each other out.
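
Something like this, maybe (an untested sketch; the split constant and
helper here are made-up names for illustration, not from any posted
patch):

#include <linux/types.h>
#include <linux/xxhash.h>

/*
 * Hypothetical split of the pgoff_t space: sequential readers keep
 * dense page indexes below the split, and once a directory context
 * has seen a seekdir() its pages are addressed by cookie hash above
 * it.
 */
#define NFS_READDIR_SEEKDIR_BASE	(1UL << 31)

static pgoff_t nfs_readdir_page_index(u64 cookie, pgoff_t linear_index,
				      bool seekdir_seen)
{
	if (!seekdir_seen)
		return linear_index;	/* dense and XArray-friendly */
	/* seekdir users: hash the cookie into the upper range */
	return NFS_READDIR_SEEKDIR_BASE + xxh32(&cookie, sizeof(cookie), 0);
}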

The vast majority of readdir() use is going to be short listings traversed
in order. The memory inflation created by a process that needs to walk a
tree and requires 10 pages of indexes for every two pages of readdir data
seems pretty extreme.

Ben

2022-02-25 17:38:45

by Trond Myklebust

Subject: Re: [PATCH v7 19/21] NFS: Convert readdir page cache to use a cookie based index

On Fri, 2022-02-25 at 07:33 -0500, Benjamin Coddington wrote:
> On 24 Feb 2022, at 23:25, Trond Myklebust wrote:
>
> > On Fri, 2022-02-25 at 14:17 +1100, NeilBrown wrote:
>
> > > I haven't looked at the code recently so this might not be 100%
> > > accurate, but XArray generally assumes that pages are often
> > > adjacent. They don't have to be, but there is a cost. It uses a
> > > multi-level array with 9 bits per level. At each level there are
> > > a whole number of pages for indexes to the next level.
> > >
> > > If there are two entries that are 2^45 apart, that is 5 levels of
> > > indexing that cannot be shared. So the path to one entry is 5
> > > pages, each of which contains a single pointer. The path to the
> > > other entry is a separate set of 5 pages.
> > >
> > > So worst case, the index would be about 64/9 or 7 times the size
> > > of the data. As the number of data pages increases, this would
> > > shrink slightly, but I suspect you wouldn't get below a factor of
> > > 3 before you fill up all of your memory.
>
> Yikes!
>
> > If the problem is just the range, then that is trivial to fix: we
> > can just use xxhash32(), and take the hit of more collisions.
> > However if the problem is the access pattern, then I have serious
> > questions about the choice of implementation for the page cache. If
> > the cache can't support file random access, then we're barking up
> > the wrong tree on the wrong continent.
>
> I'm guessing the issue might be "get next", which for an "array" is
> probably the operation tested for "perform well". We're not doing any
> of that; we're directly addressing pages with our hashed index.
>
> > Either way, I see avoiding linear searches for cookies as a benefit
> > that is worth pursuing.
>
> Me too. What about just kicking the seekdir users up into the second
> half of the index, to use xxhash32() up there? Everyone else can hang
> out in the bottom half with dense indexes and help each other out.
>
> The vast majority of readdir() use is going to be short listings
> traversed in order. The memory inflation created by a process that
> needs to walk a tree and requires 10 pages of indexes for every two
> pages of readdir data seems pretty extreme.
>
> Ben
>

#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14)
/*
 * Hash algorithm allowing content addressable access to sequences
 * of directory cookies. Content is addressed by the value of the
 * cookie index of the first readdir entry in a page.
 *
 * The xxhash algorithm is chosen because it is fast, and is supposed
 * to result in a decent flat distribution of hashes.
 *
 * We then select only the first 18 bits to avoid issues with excessive
 * memory use for the page cache XArray. 18 bits should allow the caching
 * of 262144 pages of sequences of readdir entries. Since each page holds
 * 127 readdir entries for a typical 64-bit system, that works out to a
 * cache of ~ 33 million entries per directory.
 */
static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
{
	if (cookie == 0)
		return 0;
	return xxhash(&cookie, sizeof(cookie), 0) & NFS_READDIR_COOKIE_MASK;
}
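
More collisions should mostly just look like cache misses: assuming the
page validation from the earlier patches is unchanged, a page whose
stored first cookie fails nfs_readdir_page_cookie_match() simply gets
reinitialised rather than trusted.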

So no, this is not a show-stopper.


--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]