From: Trond Myklebust <[email protected]>
Instead of relying on counting the page offsets as we walk through the
page cache, switch to calculating them algorithmically.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8f17aaebcd77..f2258e926df2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -248,17 +248,20 @@ static const char *nfs_readdir_copy_name(const char *name, unsigned int len)
return ret;
}
+static size_t nfs_readdir_array_maxentries(void)
+{
+ return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
+ sizeof(struct nfs_cache_array_entry);
+}
+
/*
* Check that the next array entry lies entirely within the page bounds
*/
static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
{
- struct nfs_cache_array_entry *cache_entry;
-
if (array->page_full)
return -ENOSPC;
- cache_entry = &array->array[array->size + 1];
- if ((char *)cache_entry - (char *)array > PAGE_SIZE) {
+ if (array->size == nfs_readdir_array_maxentries()) {
array->page_full = 1;
return -ENOSPC;
}
@@ -317,6 +320,11 @@ static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
return page;
}
+static loff_t nfs_readdir_page_offset(struct page *page)
+{
+ return (loff_t)page->index * (loff_t)nfs_readdir_array_maxentries();
+}
+
static u64 nfs_readdir_page_last_cookie(struct page *page)
{
struct nfs_cache_array *array;
@@ -447,7 +455,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
if (array->array[i].cookie == desc->dir_cookie) {
struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
- new_pos = desc->current_index + i;
+ new_pos = nfs_readdir_page_offset(desc->page) + i;
if (desc->attr_gencount != nfsi->attr_gencount ||
!nfs_readdir_inode_mapping_valid(nfsi)) {
desc->duped = 0;
--
2.35.1
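For a concrete sense of the new calculation, a small standalone sketch of
the arithmetic (the struct sizes below are made-up placeholders, not the
real fs/nfs/dir.c values; the formula mirrors nfs_readdir_page_offset()
and the new_pos line in the patch above):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define ARRAY_HEADER_SIZE 24UL	/* stand-in for sizeof(struct nfs_cache_array) */
#define ARRAY_ENTRY_SIZE 40UL	/* stand-in for sizeof(struct nfs_cache_array_entry) */

static unsigned long readdir_array_maxentries(void)
{
	return (PAGE_SIZE - ARRAY_HEADER_SIZE) / ARRAY_ENTRY_SIZE;
}

int main(void)
{
	unsigned long page_index = 3;	/* page->index */
	unsigned long slot = 7;		/* entry index i within the page */

	/* equivalent of: new_pos = nfs_readdir_page_offset(desc->page) + i */
	printf("max entries per page: %lu\n", readdir_array_maxentries());
	printf("position of entry:    %lu\n",
	       page_index * readdir_array_maxentries() + slot);
	return 0;
}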
From: Trond Myklebust <[email protected]>
Use the change attribute and the first cookie in a directory page cache
entry to validate that the page is up to date.
Suggested-by: Benjamin Coddington <[email protected]>
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 68 ++++++++++++++++++++++++++++------------------------
1 file changed, 37 insertions(+), 31 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f2258e926df2..5d9367d9b651 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
};
struct nfs_cache_array {
+ u64 change_attr;
u64 last_cookie;
unsigned int size;
unsigned char page_full : 1,
@@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct nfs_cache_array *array)
memset(array, 0, sizeof(struct nfs_cache_array));
}
-static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie)
+static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
+ u64 change_attr)
{
struct nfs_cache_array *array;
@@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags)
{
struct page *page = alloc_page(gfp_flags);
if (page)
- nfs_readdir_page_init_array(page, last_cookie);
+ nfs_readdir_page_init_array(page, last_cookie, 0);
return page;
}
@@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
return ret;
}
+static bool nfs_readdir_page_cookie_match(struct page *page, u64 last_cookie,
+ u64 change_attr)
+{
+ struct nfs_cache_array *array = kmap_atomic(page);
+ int ret = true;
+
+ if (array->change_attr != change_attr)
+ ret = false;
+ if (array->size > 0 && array->array[0].cookie != last_cookie)
+ ret = false;
+ kunmap_atomic(array);
+ return ret;
+}
+
+static void nfs_readdir_page_unlock_and_put(struct page *page)
+{
+ unlock_page(page);
+ put_page(page);
+}
+
static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
pgoff_t index, u64 last_cookie)
{
struct page *page;
+ u64 change_attr;
page = grab_cache_page(mapping, index);
- if (page && !PageUptodate(page)) {
- nfs_readdir_page_init_array(page, last_cookie);
- if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0)
- nfs_zap_mapping(mapping->host, mapping);
- SetPageUptodate(page);
+ if (!page)
+ return NULL;
+ change_attr = inode_peek_iversion_raw(mapping->host);
+ if (PageUptodate(page)) {
+ if (nfs_readdir_page_cookie_match(page, last_cookie,
+ change_attr))
+ return page;
+ nfs_readdir_clear_array(page);
}
-
+ nfs_readdir_page_init_array(page, last_cookie, change_attr);
+ SetPageUptodate(page);
return page;
}
@@ -356,12 +383,6 @@ static void nfs_readdir_page_set_eof(struct page *page)
kunmap_atomic(array);
}
-static void nfs_readdir_page_unlock_and_put(struct page *page)
-{
- unlock_page(page);
- put_page(page);
-}
-
static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
pgoff_t index, u64 cookie)
{
@@ -418,16 +439,6 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
return -EBADCOOKIE;
}
-static bool
-nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
-{
- if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE |
- NFS_INO_INVALID_DATA))
- return false;
- smp_rmb();
- return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
-}
-
static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
u64 cookie)
{
@@ -456,8 +467,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
new_pos = nfs_readdir_page_offset(desc->page) + i;
- if (desc->attr_gencount != nfsi->attr_gencount ||
- !nfs_readdir_inode_mapping_valid(nfsi)) {
+ if (desc->attr_gencount != nfsi->attr_gencount) {
desc->duped = 0;
desc->attr_gencount = nfsi->attr_gencount;
} else if (new_pos < desc->prev_index) {
@@ -1094,11 +1104,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
* to either find the entry with the appropriate number or
* revalidate the cookie.
*/
- if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
- res = nfs_revalidate_mapping(inode, file->f_mapping);
- if (res < 0)
- goto out;
- }
+ nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
res = -ENOMEM;
desc = kzalloc(sizeof(*desc), GFP_KERNEL);
--
2.35.1
On 23 Feb 2022, at 16:12, [email protected] wrote:
> From: Trond Myklebust <[email protected]>
>
> Use the change attribute and the first cookie in a directory page
> cache
> entry to validate that the page is up to date.
>
> Suggested-by: Benjamin Coddington <[email protected]>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/dir.c | 68
> ++++++++++++++++++++++++++++------------------------
> 1 file changed, 37 insertions(+), 31 deletions(-)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index f2258e926df2..5d9367d9b651 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
> };
>
> struct nfs_cache_array {
> + u64 change_attr;
> u64 last_cookie;
> unsigned int size;
> unsigned char page_full : 1,
> @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct
> nfs_cache_array *array)
> memset(array, 0, sizeof(struct nfs_cache_array));
> }
>
> -static void nfs_readdir_page_init_array(struct page *page, u64
> last_cookie)
> +static void nfs_readdir_page_init_array(struct page *page, u64
> last_cookie,
> + u64 change_attr)
> {
> struct nfs_cache_array *array;
There's a hunk missing here, something like:
@@ -185,6 +185,7 @@ static void nfs_readdir_page_init_array(struct page
*page, u64 last_cookie,
nfs_readdir_array_init(array);
array->last_cookie = last_cookie;
array->cookies_are_ordered = 1;
+ array->change_attr = change_attr;
kunmap_atomic(array);
}
>
> @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie,
> gfp_t gfp_flags)
> {
> struct page *page = alloc_page(gfp_flags);
> if (page)
> - nfs_readdir_page_init_array(page, last_cookie);
> + nfs_readdir_page_init_array(page, last_cookie, 0);
> return page;
> }
>
> @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct nfs_entry
> *entry, struct page *page)
> return ret;
> }
>
> +static bool nfs_readdir_page_cookie_match(struct page *page, u64
> last_cookie,
> + u64 change_attr)
How about "nfs_readdir_page_valid()"? There's more going on than a
cookie match.
> +{
> + struct nfs_cache_array *array = kmap_atomic(page);
> + int ret = true;
> +
> + if (array->change_attr != change_attr)
> + ret = false;
Can we skip the next test if ret = false?
> + if (array->size > 0 && array->array[0].cookie != last_cookie)
> + ret = false;
> + kunmap_atomic(array);
> + return ret;
> +}
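For illustration, a minimal sketch of what those two comments suggest (a
rename plus letting && short-circuit past the cookie test when the change
attribute already mismatches; illustrative only, not a tested patch):

static bool nfs_readdir_page_valid(struct page *page, u64 last_cookie,
				   u64 change_attr)
{
	struct nfs_cache_array *array = kmap_atomic(page);
	/* && short-circuits, so the cookie check is skipped when the
	 * change attribute already differs */
	bool ret = array->change_attr == change_attr &&
		   (array->size == 0 || array->array[0].cookie == last_cookie);

	kunmap_atomic(array);
	return ret;
}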
> +
> +static void nfs_readdir_page_unlock_and_put(struct page *page)
> +{
> + unlock_page(page);
> + put_page(page);
> +}
> +
> static struct page *nfs_readdir_page_get_locked(struct address_space
> *mapping,
> pgoff_t index, u64 last_cookie)
> {
> struct page *page;
> + u64 change_attr;
>
> page = grab_cache_page(mapping, index);
> - if (page && !PageUptodate(page)) {
> - nfs_readdir_page_init_array(page, last_cookie);
> - if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0)
> - nfs_zap_mapping(mapping->host, mapping);
> - SetPageUptodate(page);
> + if (!page)
> + return NULL;
> + change_attr = inode_peek_iversion_raw(mapping->host);
> + if (PageUptodate(page)) {
> + if (nfs_readdir_page_cookie_match(page, last_cookie,
> + change_attr))
> + return page;
> + nfs_readdir_clear_array(page);
Why use i_version rather than nfs_save_change_attribute? Seems like having
a consistent value across the pagecache and dir_verifiers would help
debugging, and we already have a bunch of machinery around the change
attribute.
Don't we need to send a GETATTR with READDIR for v4? Not doing so means
that the pagecache is going to behave differently for v3 and v4, and we'll
potentially end up with totally bogus listings for cases where one reader
has cached a page of entries in the middle of the pagecache marked with
i_version A, but the entries are actually from i_version A++ on the server.
Then another reader comes along and follows earlier entries from i_version
A on the server that lead into entries from A++. I don't think we can
detect this case unless we're checking the directory on every READDIR.
Sending a GETATTR for v4 doesn't eliminate that race on the server side,
but it does remove the large window on the client created by the attribute
cache timeouts, and I think it's mostly harmless performance-wise.
Also, we don't need the local change_attr variable just to pass it to other
functions that can access it themselves.
> }
> -
> + nfs_readdir_page_init_array(page, last_cookie, change_attr);
> + SetPageUptodate(page);
> return page;
> }
>
> @@ -356,12 +383,6 @@ static void nfs_readdir_page_set_eof(struct page
> *page)
> kunmap_atomic(array);
> }
>
> -static void nfs_readdir_page_unlock_and_put(struct page *page)
> -{
> - unlock_page(page);
> - put_page(page);
> -}
> -
> static struct page *nfs_readdir_page_get_next(struct address_space
> *mapping,
> pgoff_t index, u64 cookie)
> {
> @@ -418,16 +439,6 @@ static int nfs_readdir_search_for_pos(struct
> nfs_cache_array *array,
> return -EBADCOOKIE;
> }
>
> -static bool
> -nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
> -{
> - if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE |
> - NFS_INO_INVALID_DATA))
> - return false;
> - smp_rmb();
> - return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
> -}
> -
> static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array
> *array,
> u64 cookie)
> {
> @@ -456,8 +467,7 @@ static int nfs_readdir_search_for_cookie(struct
> nfs_cache_array *array,
> struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
>
> new_pos = nfs_readdir_page_offset(desc->page) + i;
> - if (desc->attr_gencount != nfsi->attr_gencount ||
> - !nfs_readdir_inode_mapping_valid(nfsi)) {
> + if (desc->attr_gencount != nfsi->attr_gencount) {
> desc->duped = 0;
> desc->attr_gencount = nfsi->attr_gencount;
> } else if (new_pos < desc->prev_index) {
> @@ -1094,11 +1104,7 @@ static int nfs_readdir(struct file *file,
> struct dir_context *ctx)
> * to either find the entry with the appropriate number or
> * revalidate the cookie.
> */
> - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
> - res = nfs_revalidate_mapping(inode, file->f_mapping);
> - if (res < 0)
> - goto out;
> - }
> + nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
Same as above -> why not send GETATTR with READDIR instead of doing it
in a separate RPC?
Ben
On 23 Feb 2022, at 16:12, [email protected] wrote:
> From: Trond Myklebust <[email protected]>
>
> Instead of relying on counting the page offsets as we walk through the
> page cache, switch to calculating them algorithmically.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/dir.c | 18 +++++++++++++-----
> 1 file changed, 13 insertions(+), 5 deletions(-)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 8f17aaebcd77..f2258e926df2 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -248,17 +248,20 @@ static const char *nfs_readdir_copy_name(const
> char *name, unsigned int len)
> return ret;
> }
>
> +static size_t nfs_readdir_array_maxentries(void)
> +{
> + return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
> + sizeof(struct nfs_cache_array_entry);
> +}
> +
Why the choice to use a runtime function call rather than the compiler's
calculation? I suspect that the end result is the same, as the compiler
will optimize it away, but I'm curious if there's a good reason for
this.
Ben
On Thu, 2022-02-24 at 09:15 -0500, Benjamin Coddington wrote:
> On 23 Feb 2022, at 16:12, [email protected] wrote:
>
> > From: Trond Myklebust <[email protected]>
> >
> > Instead of relying on counting the page offsets as we walk through
> > the
> > page cache, switch to calculating them algorithmically.
> >
> > Signed-off-by: Trond Myklebust <[email protected]>
> > ---
> > fs/nfs/dir.c | 18 +++++++++++++-----
> > 1 file changed, 13 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > index 8f17aaebcd77..f2258e926df2 100644
> > --- a/fs/nfs/dir.c
> > +++ b/fs/nfs/dir.c
> > @@ -248,17 +248,20 @@ static const char
> > *nfs_readdir_copy_name(const
> > char *name, unsigned int len)
> > return ret;
> > }
> >
> > +static size_t nfs_readdir_array_maxentries(void)
> > +{
> > + return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
> > + sizeof(struct nfs_cache_array_entry);
> > +}
> > +
>
> Why the choice to use a runtime function call rather than the
> compiler's
> calculation? I suspect that the end result is the same, as the
> compiler
> will optimize it away, but I'm curious if there's a good reason for
> this.
>
The comparison is more efficient because no pointer arithmetic is
needed. As you said, the above function always evaluates to a constant,
and the array->size has been pre-calculated.
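To spell that out, here is an annotated before/after of the check itself,
lifted from the two versions of nfs_readdir_array_can_expand() in the patch
(the comments are added here for illustration):

	/* before: pointer arithmetic against the page boundary on every call */
	cache_entry = &array->array[array->size + 1];
	if ((char *)cache_entry - (char *)array > PAGE_SIZE) {
		array->page_full = 1;
		return -ENOSPC;
	}

	/* after: nfs_readdir_array_maxentries() folds to a compile-time
	 * constant, so this is a single integer comparison */
	if (array->size == nfs_readdir_array_maxentries()) {
		array->page_full = 1;
		return -ENOSPC;
	}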
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote:
> On 23 Feb 2022, at 16:12, [email protected] wrote:
>
> > From: Trond Myklebust <[email protected]>
> >
> > Use the change attribute and the first cookie in a directory page
> > cache
> > entry to validate that the page is up to date.
> >
> > Suggested-by: Benjamin Coddington <[email protected]>
> > Signed-off-by: Trond Myklebust <[email protected]>
> > ---
> > fs/nfs/dir.c | 68
> > ++++++++++++++++++++++++++++------------------------
> > 1 file changed, 37 insertions(+), 31 deletions(-)
> >
> > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > index f2258e926df2..5d9367d9b651 100644
> > --- a/fs/nfs/dir.c
> > +++ b/fs/nfs/dir.c
> > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
> > };
> >
> > struct nfs_cache_array {
> > + u64 change_attr;
> > u64 last_cookie;
> > unsigned int size;
> > unsigned char page_full : 1,
> > @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct
> > nfs_cache_array *array)
> > memset(array, 0, sizeof(struct nfs_cache_array));
> > }
> >
> > -static void nfs_readdir_page_init_array(struct page *page, u64
> > last_cookie)
> > +static void nfs_readdir_page_init_array(struct page *page, u64
> > last_cookie,
> > + u64 change_attr)
> > {
> > struct nfs_cache_array *array;
>
>
> There's a hunk missing here, something like:
>
> @@ -185,6 +185,7 @@ static void nfs_readdir_page_init_array(struct
> page
> *page, u64 last_cookie,
> nfs_readdir_array_init(array);
> array->last_cookie = last_cookie;
> array->cookies_are_ordered = 1;
> + array->change_attr = change_attr;
> kunmap_atomic(array);
> }
>
> >
> > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie,
> > gfp_t gfp_flags)
> > {
> > struct page *page = alloc_page(gfp_flags);
> > if (page)
> > - nfs_readdir_page_init_array(page, last_cookie);
> > + nfs_readdir_page_init_array(page, last_cookie, 0);
> > return page;
> > }
> >
> > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct nfs_entry
> > *entry, struct page *page)
> > return ret;
> > }
> >
> > +static bool nfs_readdir_page_cookie_match(struct page *page, u64
> > last_cookie,
> > + u64 change_attr)
>
> How about "nfs_readdir_page_valid()"? There's more going on than a
> cookie match.
>
>
> > +{
> > + struct nfs_cache_array *array = kmap_atomic(page);
> > + int ret = true;
> > +
> > + if (array->change_attr != change_attr)
> > + ret = false;
>
> Can we skip the next test if ret = false?
I'd expect the compiler to do that.
>
> > + if (array->size > 0 && array->array[0].cookie !=
> > last_cookie)
> > + ret = false;
> > + kunmap_atomic(array);
> > + return ret;
> > +}
> > +
> > +static void nfs_readdir_page_unlock_and_put(struct page *page)
> > +{
> > + unlock_page(page);
> > + put_page(page);
> > +}
> > +
> > static struct page *nfs_readdir_page_get_locked(struct
> > address_space
> > *mapping,
> > pgoff_t index, u64
> > last_cookie)
> > {
> > struct page *page;
> > + u64 change_attr;
> >
> > page = grab_cache_page(mapping, index);
> > - if (page && !PageUptodate(page)) {
> > - nfs_readdir_page_init_array(page, last_cookie);
> > - if (invalidate_inode_pages2_range(mapping, index +
> > 1, -1) < 0)
> > - nfs_zap_mapping(mapping->host, mapping);
> > - SetPageUptodate(page);
> > + if (!page)
> > + return NULL;
> > + change_attr = inode_peek_iversion_raw(mapping->host);
> > + if (PageUptodate(page)) {
> > + if (nfs_readdir_page_cookie_match(page,
> > last_cookie,
> > + change_attr))
> > + return page;
> > + nfs_readdir_clear_array(page);
>
>
> Why use i_version rather than nfs_save_change_attribute? Seems
> having a
> consistent value across the pachecache and dir_verifiers would help
> debugging, and we've already have a bunch of machinery around the
> change_attribute.
The directory cache_change_attribute is not reported in tracepoints
because it is a directory-specific field, so it's not as useful for
debugging.
The inode change attribute is what we have traditionally used for
determining cache consistency, and when to invalidate the cache.
>
> Don't we need to send a GETATTR with READDIR for v4? Not doing so
> means
> that the pagecache is going to behave differently for v3 and v4, and
> we'll
> potentially end up with totally bogus listings for cases where one
> reader
> has cached a page of entries in the middle of the pagecache marked
> with
> i_version A, but entries are actually from i_version A++ on the
> server.
> Then another reader comes along and follows earlier entries from
> i_version A
> on the server that lead into entries from A++. I don't think we can
> detect
> this case unless we're checking the directory on every READDIR.
The value of the change attribute is determined when the page is
allocated. That value is unaffected by the READDIR call.
That works with NFSv2 as well as NFSv3+v4.
>
> Sending a GETATTR for v4 doesn't eliminate that race on the server
> side,
> but
> does remove the large window on the client created by the attribute
> cache
> timeouts, and I think its mostly harmless performance-wise.
>
> Also, we don't need the local change_attr variable just to pass it to
> other
> functions that can access it themselves.
>
> > }
> > -
> > + nfs_readdir_page_init_array(page, last_cookie,
> > change_attr);
> > + SetPageUptodate(page);
> > return page;
> > }
> >
> > @@ -356,12 +383,6 @@ static void nfs_readdir_page_set_eof(struct
> > page
> > *page)
> > kunmap_atomic(array);
> > }
> >
> > -static void nfs_readdir_page_unlock_and_put(struct page *page)
> > -{
> > - unlock_page(page);
> > - put_page(page);
> > -}
> > -
> > static struct page *nfs_readdir_page_get_next(struct address_space
> > *mapping,
> > pgoff_t index, u64
> > cookie)
> > {
> > @@ -418,16 +439,6 @@ static int nfs_readdir_search_for_pos(struct
> > nfs_cache_array *array,
> > return -EBADCOOKIE;
> > }
> >
> > -static bool
> > -nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
> > -{
> > - if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE |
> > - NFS_INO_INVALID_DATA))
> > - return false;
> > - smp_rmb();
> > - return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
> > -}
> > -
> > static bool nfs_readdir_array_cookie_in_range(struct
> > nfs_cache_array
> > *array,
> > u64 cookie)
> > {
> > @@ -456,8 +467,7 @@ static int nfs_readdir_search_for_cookie(struct
> > nfs_cache_array *array,
> > struct nfs_inode *nfsi =
> > NFS_I(file_inode(desc->file));
> >
> > new_pos = nfs_readdir_page_offset(desc-
> > >page) + i;
> > - if (desc->attr_gencount != nfsi-
> > >attr_gencount ||
> > - !nfs_readdir_inode_mapping_valid(nfsi))
> > {
> > + if (desc->attr_gencount != nfsi-
> > >attr_gencount) {
> > desc->duped = 0;
> > desc->attr_gencount = nfsi-
> > >attr_gencount;
> > } else if (new_pos < desc->prev_index) {
> > @@ -1094,11 +1104,7 @@ static int nfs_readdir(struct file *file,
> > struct dir_context *ctx)
> > * to either find the entry with the appropriate number or
> > * revalidate the cookie.
> > */
> > - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
> > - res = nfs_revalidate_mapping(inode, file-
> > >f_mapping);
> > - if (res < 0)
> > - goto out;
> > - }
> > + nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
>
> Same as above -> why not send GETATTR with READDIR instead of doing
> it
> in a
> separate RPC?
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote:
> On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote:
> > On 23 Feb 2022, at 16:12, [email protected] wrote:
> >
> > > From: Trond Myklebust <[email protected]>
> > >
> > > Use the change attribute and the first cookie in a directory page
> > > cache
> > > entry to validate that the page is up to date.
> > >
> > > Suggested-by: Benjamin Coddington <[email protected]>
> > > Signed-off-by: Trond Myklebust <[email protected]>
> > > ---
> > > fs/nfs/dir.c | 68
> > > ++++++++++++++++++++++++++++------------------------
> > > 1 file changed, 37 insertions(+), 31 deletions(-)
> > >
> > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > index f2258e926df2..5d9367d9b651 100644
> > > --- a/fs/nfs/dir.c
> > > +++ b/fs/nfs/dir.c
> > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
> > > };
> > >
> > > struct nfs_cache_array {
> > > + u64 change_attr;
> > > u64 last_cookie;
> > > unsigned int size;
> > > unsigned char page_full : 1,
> > > @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct
> > > nfs_cache_array *array)
> > > memset(array, 0, sizeof(struct nfs_cache_array));
> > > }
> > >
> > > -static void nfs_readdir_page_init_array(struct page *page, u64
> > > last_cookie)
> > > +static void nfs_readdir_page_init_array(struct page *page, u64
> > > last_cookie,
> > > + u64 change_attr)
> > > {
> > > struct nfs_cache_array *array;
> >
> >
> > There's a hunk missing here, something like:
> >
> > @@ -185,6 +185,7 @@ static void nfs_readdir_page_init_array(struct
> > page
> > *page, u64 last_cookie,
> > nfs_readdir_array_init(array);
> > array->last_cookie = last_cookie;
> > array->cookies_are_ordered = 1;
> > + array->change_attr = change_attr;
> > kunmap_atomic(array);
> > }
> >
> > >
> > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie,
> > > gfp_t gfp_flags)
> > > {
> > > struct page *page = alloc_page(gfp_flags);
> > > if (page)
> > > - nfs_readdir_page_init_array(page, last_cookie);
> > > + nfs_readdir_page_init_array(page, last_cookie,
> > > 0);
> > > return page;
> > > }
> > >
> > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct
> > > nfs_entry
> > > *entry, struct page *page)
> > > return ret;
> > > }
> > >
> > > +static bool nfs_readdir_page_cookie_match(struct page *page, u64
> > > last_cookie,
> > > + u64 change_attr)
> >
> > How about "nfs_readdir_page_valid()"? There's more going on than a
> > cookie match.
> >
> >
> > > +{
> > > + struct nfs_cache_array *array = kmap_atomic(page);
> > > + int ret = true;
> > > +
> > > + if (array->change_attr != change_attr)
> > > + ret = false;
> >
> > Can we skip the next test if ret = false?
>
> I'd expect the compiler to do that.
>
> >
> > > + if (array->size > 0 && array->array[0].cookie !=
> > > last_cookie)
> > > + ret = false;
> > > + kunmap_atomic(array);
> > > + return ret;
> > > +}
> > > +
> > > +static void nfs_readdir_page_unlock_and_put(struct page *page)
> > > +{
> > > + unlock_page(page);
> > > + put_page(page);
> > > +}
> > > +
> > > static struct page *nfs_readdir_page_get_locked(struct
> > > address_space
> > > *mapping,
> > > pgoff_t index,
> > > u64
> > > last_cookie)
> > > {
> > > struct page *page;
> > > + u64 change_attr;
> > >
> > > page = grab_cache_page(mapping, index);
> > > - if (page && !PageUptodate(page)) {
> > > - nfs_readdir_page_init_array(page, last_cookie);
> > > - if (invalidate_inode_pages2_range(mapping, index
> > > +
> > > 1, -1) < 0)
> > > - nfs_zap_mapping(mapping->host, mapping);
> > > - SetPageUptodate(page);
> > > + if (!page)
> > > + return NULL;
> > > + change_attr = inode_peek_iversion_raw(mapping->host);
> > > + if (PageUptodate(page)) {
> > > + if (nfs_readdir_page_cookie_match(page,
> > > last_cookie,
> > > + change_attr))
> > > + return page;
> > > + nfs_readdir_clear_array(page);
> >
> >
> > Why use i_version rather than nfs_save_change_attribute? Seems
> > having a
> > consistent value across the pachecache and dir_verifiers would help
> > debugging, and we've already have a bunch of machinery around the
> > change_attribute.
>
> The directory cache_change_attribute is not reported in tracepoints
> because it is a directory-specific field, so it's not as useful for
> debugging.
>
> The inode change attribute is what we have traditionally used for
> determining cache consistency, and when to invalidate the cache.
I should probably elaborate a little further on the differences between
the inode change attribute and the cache_change_attribute.
One of the main reasons for introducing the latter was to have
something that allows us to track changes to the directory, but to
avoid forcing unnecessary revalidations of the dcache.
What this means is that when we create or remove a file, and the
pre/post-op attributes tell us that there were no third party changes
to the directory, we update the dcache, but we do _not_ update the
cache_change_attribute, because we know that the rest of the directory
contents are valid, and so we don't have to revalidate the dentries.
However in that case, we _do_ want to update the readdir cache to
reflect the fact that an entry was added or deleted. While we could
figure out how to remove an entry (at least for the case where the
filesystem is case-sensitive), we do not know where the filesystem
added the new file, or what cookie was assigned.
This is why the inode change attribute is more appropriate for indexing
the page cache pages. It reflects the cases where we want to revalidate
the readdir cache, as opposed to the dcache.
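Concretely, that is what the lookup path in the patch implements: the page
array is stamped with the inode change attribute when it is set up, and a
later lookup under a different value drops the cached readdir entries
without touching the dcache. Annotated excerpt from
nfs_readdir_page_get_locked() as posted (comments added here):

	change_attr = inode_peek_iversion_raw(mapping->host);
	if (PageUptodate(page)) {
		/* reuse only if the page was filled under the same change
		 * attribute and starts at the expected cookie */
		if (nfs_readdir_page_cookie_match(page, last_cookie,
						  change_attr))
			return page;
		/* directory changed: invalidate the cached readdir page */
		nfs_readdir_clear_array(page);
	}
	/* (re)initialise and stamp with the current change attribute */
	nfs_readdir_page_init_array(page, last_cookie, change_attr);
	SetPageUptodate(page);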
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 24 Feb 2022, at 21:11, Trond Myklebust wrote:
> On Thu, 2022-02-24 at 09:15 -0500, Benjamin Coddington wrote:
>> On 23 Feb 2022, at 16:12, [email protected] wrote:
>>
>>> From: Trond Myklebust <[email protected]>
>>>
>>> Instead of relying on counting the page offsets as we walk through
>>> the
>>> page cache, switch to calculating them algorithmically.
>>>
>>> Signed-off-by: Trond Myklebust <[email protected]>
>>> ---
>>> fs/nfs/dir.c | 18 +++++++++++++-----
>>> 1 file changed, 13 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
>>> index 8f17aaebcd77..f2258e926df2 100644
>>> --- a/fs/nfs/dir.c
>>> +++ b/fs/nfs/dir.c
>>> @@ -248,17 +248,20 @@ static const char
>>> *nfs_readdir_copy_name(const
>>> char *name, unsigned int len)
>>> return ret;
>>> }
>>>
>>> +static size_t nfs_readdir_array_maxentries(void)
>>> +{
>>> + return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
>>> + sizeof(struct nfs_cache_array_entry);
>>> +}
>>> +
>>
>> Why the choice to use a runtime function call rather than the
>> compiler's
>> calculation? I suspect that the end result is the same, as the
>> compiler
>> will optimize it away, but I'm curious if there's a good reason for
>> this.
>>
>
> The comparison is more efficient because no pointer arithmetic is
> needed. As you said, the above function always evaluates to a constant,
> and the array->size has been pre-calculated.
Comparisons are more efficient than using something like this?:
static const int nfs_readdir_array_maxentries =
(PAGE_SIZE - sizeof(struct nfs_cache_array)) /
sizeof(struct nfs_cache_array_entry);
I don't understand why, I must admit. I'm not saying it should be changed,
I'm just trying to figure out the reason for the function declaration when
the value is a constant, and I thought there was a hole in my head.
Ben
On Fri, 2022-02-25 at 09:44 -0500, Benjamin Coddington wrote:
> On 25 Feb 2022, at 8:10, Trond Myklebust wrote:
>
> > On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote:
> > > On 24 Feb 2022, at 22:51, Trond Myklebust wrote:
> > >
> > > > On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote:
> > > > > On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote:
> > > > > > On 23 Feb 2022, at 16:12, [email protected] wrote:
> > > > > >
> > > > > > > From: Trond Myklebust <[email protected]>
> > > > > > >
> > > > > > > Use the change attribute and the first cookie in a
> > > > > > > directory
> > > > > > > page
> > > > > > > cache
> > > > > > > entry to validate that the page is up to date.
> > > > > > >
> > > > > > > Suggested-by: Benjamin Coddington <[email protected]>
> > > > > > > Signed-off-by: Trond Myklebust
> > > > > > > <[email protected]>
> > > > > > > ---
> > > > > > > fs/nfs/dir.c | 68
> > > > > > > ++++++++++++++++++++++++++++------------------------
> > > > > > > 1 file changed, 37 insertions(+), 31 deletions(-)
> > > > > > >
> > > > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > > > > > index f2258e926df2..5d9367d9b651 100644
> > > > > > > --- a/fs/nfs/dir.c
> > > > > > > +++ b/fs/nfs/dir.c
> > > > > > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
> > > > > > > };
> > > > > > >
> > > > > > > struct nfs_cache_array {
> > > > > > > + u64 change_attr;
> > > > > > > u64 last_cookie;
> > > > > > > unsigned int size;
> > > > > > > unsigned char page_full : 1,
> > > > > > > @@ -175,7 +176,8 @@ static void
> > > > > > > nfs_readdir_array_init(struct
> > > > > > > nfs_cache_array *array)
> > > > > > > memset(array, 0, sizeof(struct nfs_cache_array));
> > > > > > > }
> > > > > > >
> > > > > > > -static void nfs_readdir_page_init_array(struct page
> > > > > > > *page,
> > > > > > > u64
> > > > > > > last_cookie)
> > > > > > > +static void nfs_readdir_page_init_array(struct page
> > > > > > > *page,
> > > > > > > u64
> > > > > > > last_cookie,
> > > > > > > + u64
> > > > > > > change_attr)
> > > > > > > {
> > > > > > > struct nfs_cache_array *array;
> > > > > >
> > > > > >
> > > > > > There's a hunk missing here, something like:
> > > > > >
> > > > > > @@ -185,6 +185,7 @@ static void
> > > > > > nfs_readdir_page_init_array(struct
> > > > > > page
> > > > > > *page, u64 last_cookie,
> > > > > > nfs_readdir_array_init(array);
> > > > > > array->last_cookie = last_cookie;
> > > > > > array->cookies_are_ordered = 1;
> > > > > > + array->change_attr = change_attr;
> > > > > > kunmap_atomic(array);
> > > > > > }
> > > > > >
> > > > > > >
> > > > > > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64
> > > > > > > last_cookie,
> > > > > > > gfp_t gfp_flags)
> > > > > > > {
> > > > > > > struct page *page = alloc_page(gfp_flags);
> > > > > > > if (page)
> > > > > > > - nfs_readdir_page_init_array(page,
> > > > > > > last_cookie);
> > > > > > > + nfs_readdir_page_init_array(page,
> > > > > > > last_cookie,
> > > > > > > 0);
> > > > > > > return page;
> > > > > > > }
> > > > > > >
> > > > > > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct
> > > > > > > nfs_entry
> > > > > > > *entry, struct page *page)
> > > > > > > return ret;
> > > > > > > }
> > > > > > >
> > > > > > > +static bool nfs_readdir_page_cookie_match(struct page
> > > > > > > *page,
> > > > > > > u64
> > > > > > > last_cookie,
> > > > > > > +
> > > > > > > u64 change_attr)
> > > > > >
> > > > > > How about "nfs_readdir_page_valid()"? There's more going
> > > > > > on
> > > > > > than a
> > > > > > cookie match.
> > > > > >
> > > > > >
> > > > > > > +{
> > > > > > > + struct nfs_cache_array *array =
> > > > > > > kmap_atomic(page);
> > > > > > > + int ret = true;
> > > > > > > +
> > > > > > > + if (array->change_attr != change_attr)
> > > > > > > + ret = false;
> > > > > >
> > > > > > Can we skip the next test if ret = false?
> > > > >
> > > > > I'd expect the compiler to do that.
> > > > >
> > > > > >
> > > > > > > + if (array->size > 0 && array->array[0].cookie !=
> > > > > > > last_cookie)
> > > > > > > + ret = false;
> > > > > > > + kunmap_atomic(array);
> > > > > > > + return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void nfs_readdir_page_unlock_and_put(struct page
> > > > > > > *page)
> > > > > > > +{
> > > > > > > + unlock_page(page);
> > > > > > > + put_page(page);
> > > > > > > +}
> > > > > > > +
> > > > > > > static struct page *nfs_readdir_page_get_locked(struct
> > > > > > > address_space
> > > > > > > *mapping,
> > > > > > > pgoff_t
> > > > > > > index,
> > > > > > > u64
> > > > > > > last_cookie)
> > > > > > > {
> > > > > > > struct page *page;
> > > > > > > + u64 change_attr;
> > > > > > >
> > > > > > > page = grab_cache_page(mapping, index);
> > > > > > > - if (page && !PageUptodate(page)) {
> > > > > > > - nfs_readdir_page_init_array(page,
> > > > > > > last_cookie);
> > > > > > > - if
> > > > > > > (invalidate_inode_pages2_range(mapping, index
> > > > > > > +
> > > > > > > 1, -1) < 0)
> > > > > > > - nfs_zap_mapping(mapping->host,
> > > > > > > mapping);
> > > > > > > - SetPageUptodate(page);
> > > > > > > + if (!page)
> > > > > > > + return NULL;
> > > > > > > + change_attr =
> > > > > > > inode_peek_iversion_raw(mapping->host);
> > > > > > > + if (PageUptodate(page)) {
> > > > > > > + if
> > > > > > > (nfs_readdir_page_cookie_match(page,
> > > > > > > last_cookie,
> > > > > > > +
> > > > > > > change_attr))
> > > > > > > + return page;
> > > > > > > + nfs_readdir_clear_array(page);
> > > > > >
> > > > > >
> > > > > > Why use i_version rather than nfs_save_change_attribute?
> > > > > > Seems
> > > > > > having a
> > > > > > consistent value across the pachecache and dir_verifiers
> > > > > > would
> > > > > > help
> > > > > > debugging, and we've already have a bunch of machinery
> > > > > > around
> > > > > > the
> > > > > > change_attribute.
> > > > >
> > > > > The directory cache_change_attribute is not reported in
> > > > > tracepoints
> > > > > because it is a directory-specific field, so it's not as
> > > > > useful
> > > > > for
> > > > > debugging.
> > > > >
> > > > > The inode change attribute is what we have traditionally used
> > > > > for
> > > > > determining cache consistency, and when to invalidate the
> > > > > cache.
> > > >
> > > > I should probably elaborate a little further on the differences
> > > > between
> > > > the inode change attribute and the cache_change_attribute.
> > > >
> > > > One of the main reasons for introducing the latter was to have
> > > > something that allows us to track changes to the directory, but
> > > > to
> > > > avoid forcing unnecessary revalidations of the dcache.
> > > >
> > > > What this means is that when we create or remove a file, and
> > > > the
> > > > pre/post-op attributes tell us that there were no third party
> > > > changes
> > > > to the directory, we update the dcache, but we do _not_ update
> > > > the
> > > > cache_change_attribute, because we know that the rest of the
> > > > directory
> > > > contents are valid, and so we don't have to revalidate the
> > > > dentries.
> > > > However in that case, we _do_ want to update the readdir cache
> > > > to
> > > > reflect the fact that an entry was added or deleted. While we
> > > > could
> > > > figure out how to remove an entry (at least for the case where
> > > > the
> > > > filesystem is case-sensitive), we do not know where the
> > > > filesystem
> > > > added the new file, or what cookies was assigned.
> > > >
> > > > This is why the inode change attribute is more appropriate for
> > > > indexing
> > > > the page cache pages. It reflects the cases where we want to
> > > > revalidate
> > > > the readdir cache, as opposed to the dcache.
> > >
> > > Ok, thanks for explaining this.
> > >
> > > I've noticed that you haven't responded about my concerns about
> > > not
> > > checking
> > > the directory for changes with every v4 READDIR. For v3, we have
> > > post-op
> > > updates to the directory, but with v4 the directory can change
> > > and
> > > we'll
> > > end up with entries in the cache that are marked with an old
> > > change_attr.
> > >
> >
> > Then they will be rejected by nfs_readdir_page_cookie_match() if a
> > user
> > looks up that page again after we've revalidated the change
> > attribute
> > on the directory.
> >
> > ...and note that NFSv4 does returns a struct change_info4 for all
> > operations that change the directory, so we will update the change
> > attribute in all those cases.
>
> I'm not worried about changes from the same client.
>
> > If the change is made on the server, well then we will detect it
> > through the standard revalidation process that usually decides when
> > to
> > invalidate the directory page cache.
>
> The environments I'm concerned about are setup very frequently: they
> look
> like multiple NFS clients co-ordinating on a directory with millions
> of
> files. Some clients are adding files as they do work, other clients
> are
> then looking for those files by walking the directory entries to
> validate
> their existence. The systems that do this have a "very bad time" if
> some
> of them produce listings that are _dramatically_ and transiently
> different
> from a listing they produced before.
>
> That can happen really easily with what we've got here, and it can
> create a
> huge problem for these setups. And it won't be easily reproduceable,
> and it
> will be hard to find. It will cost everyone involved a lot of time
> and
> effort to track down, and we can fix it easily.
>
> > > I'm pretty positive that not checking for changes to the
> > > directory
> > > (not
> > > sending GETATTR with READDIR) is going to create cases of double-
> > > listed
> > > and
> > > truncated-listings for dirctory listers. Not handling those
> > > cases
> > > means
> > > I'm
> > > going to have some very unhappy customers that complain about
> > > their
> > > files
> > > disappearing/reappearing on NFS.
> > >
> > > If you need me to prove that its an issue, I can take the time to
> > > write
> > > up
> > > program that shows this problem.
> > >
> >
> > If you label the page contents with an attribute that was retrieved
> > _after_ the READDIR op, then you will introduce this as a problem
> > for
> > your customers.
>
> No the problem is already here, we're not introducing it. By
> labeling
> the
> page contents with every call we're shifting the race window from the
> client
> where it's a very large window to the server where the window is
> small.
>
> Its still possible, but *much* less likely.
>
> > The reason is that there is no atomicity between operations in a
> > COMPOUND. Worse, the implementation of readdir in scalable modern
> > systems, including Linux, does not even guarantee atomicity of the
> > readdir operation itself. Instead each readdir entry is filled
> > without
> > holding any locks or preventing any changes to the directory or to
> > the
> > object itself.
>
> I understand all this, but its not a reason to make the problem
> worse.
>
> > POSIX states very explicitly that if you're making changes to the
> > directory after the call to opendir() or rewinddir(), then the
> > behaviour w.r.t. whether that file appears in the readdir() call is
> > unspecified. See
> > https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
>
> Yes, but again - just because the problem exists doesn't give us
> reason
> to
> amplify it when we can easily make a better choice for almost no
> cost.
>
> Here are my reasons for wanting the GETATTR added:
> - it makes it *much* less likely for this problem to occur, with
> the
> minor
> downside of decreased caching for unstable directories.
> - it makes v3 and v4 readdir pagecache behavior consistent WRT
> changing
> directories.
>
> I spent a non-trivial amount of time working on this problem, and saw
> this
> exact issue appear. Its definitely something that's going to come
> back
> and
> bite us if we don't fix it.
>
> How can I convince you? I've offered to produce a working example of
> this
> problem. Will you review those results? If I cannot convince you, I
> feel
> I'll have to pursue distro-specific changes for this work.
Ben, the main cause of this kind of issue in the current code is the
following line:
	/*
	 * ctx->pos points to the dirent entry number.
	 * *desc->dir_cookie has the cookie for the next entry. We have
	 * to either find the entry with the appropriate number or
	 * revalidate the cookie.
	 */
	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
		res = nfs_revalidate_mapping(inode, file->f_mapping);
		if (res < 0)
			goto out;
	}
That line protects the page cache against changes after opendir(). It
was introduced by Red Hat in commit 07b5ce8ef2d8 in order to fix a
claim of a severe performance problem.
These patches _remove_ that protection, because we're now able to cope
with more frequent revalidation without needing to restart directory
reads from scratch.
So no. Without further proof, I don't accept your claim that this
patchset introduces a regression. I don't accept your claim that we are
required to revalidate the change attribute on every readdir call. We
can't do that for NFSv2 or NFSv3 (the latter offers a post_op
attribute, not pre-op attribute) and as I already pointed out, there is
nothing in POSIX that requires this.
If you want to fork the Red Hat kernel over it, then that's your
decision.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 25 Feb 2022, at 10:18, Trond Myklebust wrote:
> On Fri, 2022-02-25 at 09:44 -0500, Benjamin Coddington wrote:
>> On 25 Feb 2022, at 8:10, Trond Myklebust wrote:
>>
>>> On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote:
>>>> On 24 Feb 2022, at 22:51, Trond Myklebust wrote:
>>>>
>>>>> On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote:
>>>>>> On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote:
>>>>>>> On 23 Feb 2022, at 16:12, [email protected] wrote:
>>>>>>>
>>>>>>>> From: Trond Myklebust <[email protected]>
>>>>>>>>
>>>>>>>> Use the change attribute and the first cookie in a
>>>>>>>> directory
>>>>>>>> page
>>>>>>>> cache
>>>>>>>> entry to validate that the page is up to date.
>>>>>>>>
>>>>>>>> Suggested-by: Benjamin Coddington <[email protected]>
>>>>>>>> Signed-off-by: Trond Myklebust
>>>>>>>> <[email protected]>
>>>>>>>> ---
>>>>>>>> fs/nfs/dir.c | 68
>>>>>>>> ++++++++++++++++++++++++++++------------------------
>>>>>>>> 1 file changed, 37 insertions(+), 31 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
>>>>>>>> index f2258e926df2..5d9367d9b651 100644
>>>>>>>> --- a/fs/nfs/dir.c
>>>>>>>> +++ b/fs/nfs/dir.c
>>>>>>>> @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
>>>>>>>> };
>>>>>>>>
>>>>>>>> struct nfs_cache_array {
>>>>>>>> + u64 change_attr;
>>>>>>>> u64 last_cookie;
>>>>>>>> unsigned int size;
>>>>>>>> unsigned char page_full : 1,
>>>>>>>> @@ -175,7 +176,8 @@ static void
>>>>>>>> nfs_readdir_array_init(struct
>>>>>>>> nfs_cache_array *array)
>>>>>>>> memset(array, 0, sizeof(struct
>>>>>>>> nfs_cache_array));
>>>>>>>> }
>>>>>>>>
>>>>>>>> -static void nfs_readdir_page_init_array(struct page
>>>>>>>> *page,
>>>>>>>> u64
>>>>>>>> last_cookie)
>>>>>>>> +static void nfs_readdir_page_init_array(struct page
>>>>>>>> *page,
>>>>>>>> u64
>>>>>>>> last_cookie,
>>>>>>>> + u64
>>>>>>>> change_attr)
>>>>>>>> {
>>>>>>>> struct nfs_cache_array *array;
>>>>>>>
>>>>>>>
>>>>>>> There's a hunk missing here, something like:
>>>>>>>
>>>>>>> @@ -185,6 +185,7 @@ static void
>>>>>>> nfs_readdir_page_init_array(struct
>>>>>>> page
>>>>>>> *page, u64 last_cookie,
>>>>>>> nfs_readdir_array_init(array);
>>>>>>> array->last_cookie = last_cookie;
>>>>>>> array->cookies_are_ordered = 1;
>>>>>>> + array->change_attr = change_attr;
>>>>>>> kunmap_atomic(array);
>>>>>>> }
>>>>>>>
>>>>>>>>
>>>>>>>> @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64
>>>>>>>> last_cookie,
>>>>>>>> gfp_t gfp_flags)
>>>>>>>> {
>>>>>>>> struct page *page = alloc_page(gfp_flags);
>>>>>>>> if (page)
>>>>>>>> - nfs_readdir_page_init_array(page,
>>>>>>>> last_cookie);
>>>>>>>> + nfs_readdir_page_init_array(page,
>>>>>>>> last_cookie,
>>>>>>>> 0);
>>>>>>>> return page;
>>>>>>>> }
>>>>>>>>
>>>>>>>> @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct
>>>>>>>> nfs_entry
>>>>>>>> *entry, struct page *page)
>>>>>>>> return ret;
>>>>>>>> }
>>>>>>>>
>>>>>>>> +static bool nfs_readdir_page_cookie_match(struct page
>>>>>>>> *page,
>>>>>>>> u64
>>>>>>>> last_cookie,
>>>>>>>> +
>>>>>>>> u64 change_attr)
>>>>>>>
>>>>>>> How about "nfs_readdir_page_valid()"? There's more going
>>>>>>> on
>>>>>>> than a
>>>>>>> cookie match.
>>>>>>>
>>>>>>>
>>>>>>>> +{
>>>>>>>> + struct nfs_cache_array *array =
>>>>>>>> kmap_atomic(page);
>>>>>>>> + int ret = true;
>>>>>>>> +
>>>>>>>> + if (array->change_attr != change_attr)
>>>>>>>> + ret = false;
>>>>>>>
>>>>>>> Can we skip the next test if ret = false?
>>>>>>
>>>>>> I'd expect the compiler to do that.
>>>>>>
>>>>>>>
>>>>>>>> + if (array->size > 0 && array->array[0].cookie !=
>>>>>>>> last_cookie)
>>>>>>>> + ret = false;
>>>>>>>> + kunmap_atomic(array);
>>>>>>>> + return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void nfs_readdir_page_unlock_and_put(struct page
>>>>>>>> *page)
>>>>>>>> +{
>>>>>>>> + unlock_page(page);
>>>>>>>> + put_page(page);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> static struct page *nfs_readdir_page_get_locked(struct
>>>>>>>> address_space
>>>>>>>> *mapping,
>>>>>>>> pgoff_t
>>>>>>>> index,
>>>>>>>> u64
>>>>>>>> last_cookie)
>>>>>>>> {
>>>>>>>> struct page *page;
>>>>>>>> + u64 change_attr;
>>>>>>>>
>>>>>>>> page = grab_cache_page(mapping, index);
>>>>>>>> - if (page && !PageUptodate(page)) {
>>>>>>>> - nfs_readdir_page_init_array(page,
>>>>>>>> last_cookie);
>>>>>>>> - if
>>>>>>>> (invalidate_inode_pages2_range(mapping, index
>>>>>>>> +
>>>>>>>> 1, -1) < 0)
>>>>>>>> - nfs_zap_mapping(mapping->host,
>>>>>>>> mapping);
>>>>>>>> - SetPageUptodate(page);
>>>>>>>> + if (!page)
>>>>>>>> + return NULL;
>>>>>>>> + change_attr =
>>>>>>>> inode_peek_iversion_raw(mapping->host);
>>>>>>>> + if (PageUptodate(page)) {
>>>>>>>> + if
>>>>>>>> (nfs_readdir_page_cookie_match(page,
>>>>>>>> last_cookie,
>>>>>>>> +
>>>>>>>> change_attr))
>>>>>>>> + return page;
>>>>>>>> + nfs_readdir_clear_array(page);
>>>>>>>
>>>>>>>
>>>>>>> Why use i_version rather than nfs_save_change_attribute?
>>>>>>> Seems
>>>>>>> having a
>>>>>>> consistent value across the pachecache and dir_verifiers
>>>>>>> would
>>>>>>> help
>>>>>>> debugging, and we've already have a bunch of machinery
>>>>>>> around
>>>>>>> the
>>>>>>> change_attribute.
>>>>>>
>>>>>> The directory cache_change_attribute is not reported in
>>>>>> tracepoints
>>>>>> because it is a directory-specific field, so it's not as
>>>>>> useful
>>>>>> for
>>>>>> debugging.
>>>>>>
>>>>>> The inode change attribute is what we have traditionally used
>>>>>> for
>>>>>> determining cache consistency, and when to invalidate the
>>>>>> cache.
>>>>>
>>>>> I should probably elaborate a little further on the differences
>>>>> between
>>>>> the inode change attribute and the cache_change_attribute.
>>>>>
>>>>> One of the main reasons for introducing the latter was to have
>>>>> something that allows us to track changes to the directory, but
>>>>> to
>>>>> avoid forcing unnecessary revalidations of the dcache.
>>>>>
>>>>> What this means is that when we create or remove a file, and
>>>>> the
>>>>> pre/post-op attributes tell us that there were no third party
>>>>> changes
>>>>> to the directory, we update the dcache, but we do _not_ update
>>>>> the
>>>>> cache_change_attribute, because we know that the rest of the
>>>>> directory
>>>>> contents are valid, and so we don't have to revalidate the
>>>>> dentries.
>>>>> However in that case, we _do_ want to update the readdir cache
>>>>> to
>>>>> reflect the fact that an entry was added or deleted. While we
>>>>> could
>>>>> figure out how to remove an entry (at least for the case where
>>>>> the
>>>>> filesystem is case-sensitive), we do not know where the
>>>>> filesystem
>>>>> added the new file, or what cookies was assigned.
>>>>>
>>>>> This is why the inode change attribute is more appropriate for
>>>>> indexing
>>>>> the page cache pages. It reflects the cases where we want to
>>>>> revalidate
>>>>> the readdir cache, as opposed to the dcache.
>>>>
>>>> Ok, thanks for explaining this.
>>>>
>>>> I've noticed that you haven't responded about my concerns about
>>>> not
>>>> checking
>>>> the directory for changes with every v4 READDIR. For v3, we have
>>>> post-op
>>>> updates to the directory, but with v4 the directory can change
>>>> and
>>>> we'll
>>>> end up with entries in the cache that are marked with an old
>>>> change_attr.
>>>>
>>>
>>> Then they will be rejected by nfs_readdir_page_cookie_match() if a
>>> user
>>> looks up that page again after we've revalidated the change
>>> attribute
>>> on the directory.
>>>
>>> ...and note that NFSv4 does returns a struct change_info4 for all
>>> operations that change the directory, so we will update the change
>>> attribute in all those cases.
>>
>> I'm not worried about changes from the same client.
>>
>>> If the change is made on the server, well then we will detect it
>>> through the standard revalidation process that usually decides when
>>> to
>>> invalidate the directory page cache.
>>
>> The environments I'm concerned about are setup very frequently: they
>> look
>> like multiple NFS clients co-ordinating on a directory with millions
>> of
>> files. Some clients are adding files as they do work, other clients
>> are
>> then looking for those files by walking the directory entries to
>> validate
>> their existence. The systems that do this have a "very bad time" if
>> some
>> of them produce listings that are _dramatically_ and transiently
>> different
>> from a listing they produced before.
>>
>> That can happen really easily with what we've got here, and it can
>> create a
>> huge problem for these setups. And it won't be easily
>> reproduceable,
>> and it
>> will be hard to find. It will cost everyone involved a lot of time
>> and
>> effort to track down, and we can fix it easily.
>>
>>>> I'm pretty positive that not checking for changes to the
>>>> directory
>>>> (not
>>>> sending GETATTR with READDIR) is going to create cases of double-
>>>> listed
>>>> and
>>>> truncated-listings for dirctory listers. Not handling those
>>>> cases
>>>> means
>>>> I'm
>>>> going to have some very unhappy customers that complain about
>>>> their
>>>> files
>>>> disappearing/reappearing on NFS.
>>>>
>>>> If you need me to prove that its an issue, I can take the time to
>>>> write
>>>> up
>>>> program that shows this problem.
>>>>
>>>
>>> If you label the page contents with an attribute that was retrieved
>>> _after_ the READDIR op, then you will introduce this as a problem
>>> for
>>> your customers.
>>
>> No the problem is already here, we're not introducing it. By
>> labeling
>> the
>> page contents with every call we're shifting the race window from the
>> client
>> where it's a very large window to the server where the window is
>> small.
>>
>> Its still possible, but *much* less likely.
>>
>>> The reason is that there is no atomicity between operations in a
>>> COMPOUND. Worse, the implementation of readdir in scalable modern
>>> systems, including Linux, does not even guarantee atomicity of the
>>> readdir operation itself. Instead each readdir entry is filled
>>> without
>>> holding any locks or preventing any changes to the directory or to
>>> the
>>> object itself.
>>
>> I understand all this, but its not a reason to make the problem
>> worse.
>>
>>> POSIX states very explicitly that if you're making changes to the
>>> directory after the call to opendir() or rewinddir(), then the
>>> behaviour w.r.t. whether that file appears in the readdir() call is
>>> unspecified. See
>>> https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
>>
>> Yes, but again - just because the problem exists doesn't give us
>> reason
>> to
>> amplify it when we can easily make a better choice for almost no
>> cost.
>>
>> Here are my reasons for wanting the GETATTR added:
>> - it makes it *much* less likely for this problem to occur, with
>> the
>> minor
>> downside of decreased caching for unstable directories.
>> - it makes v3 and v4 readdir pagecache behavior consistent WRT
>> changing
>> directories.
>>
>> I spent a non-trivial amount of time working on this problem, and saw
>> this
>> exact issue appear. Its definitely something that's going to come
>> back
>> and
>> bite us if we don't fix it.
>>
>> How can I convince you? I've offered to produce a working example
>> of
>> this
>> problem. Will you review those results? If I cannot convince you,
>> I
>> feel
>> I'll have to pursue distro-specific changes for this work.
>
> Ben, the main cause of this kind of issue in the current code is the
> following line:
>
> /*
> * ctx->pos points to the dirent entry number.
> * *desc->dir_cookie has the cookie for the next entry. We
> have
> * to either find the entry with the appropriate number or
> * revalidate the cookie.
> */
> if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
> res = nfs_revalidate_mapping(inode, file->f_mapping);
> if (res < 0)
> goto out;
> }
>
>
> That line protects the page cache against changes aften opendir(). It
> was introduced by Red Hat in commmit 07b5ce8ef2d8 in order to fix a
> claim of a severe performance problem.
>
> These patches _remove_ that protection, because we're now able to cope
> with more frequent revalidation without needing to restart directory
> reads from scratch.
Yes, I know. But the big change is that now we're heavily relying on
page validation to produce sane listing results, and proper page
validation relies on up-to-date change info.
> So no. Without further proof, I don't accept your claim that this
> patchset introduces a regression. I don't accept your claim that we
> are
> required to revalidate the change attribute on every readdir call. We
> can't do that for NFSv2 or NFSv3 (the latter offers a post_op
> attribute, not pre-op attribute) and as I already pointed out, there
> is
> nothing in POSIX that requires this.
You don't need a pre-op attribute. You just need to detect the case where
you're walking into pages that contain entries that don't match the ones
you're currently using, and post-op is as good as we can get it.
Ok, so I'm reading that further proof is required, and I'm happy to do the
work. Thanks for the replies here and elsewhere.
Ben
On Fri, 2022-02-25 at 13:10 +0000, Trond Myklebust wrote:
> On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote:
> > On 24 Feb 2022, at 22:51, Trond Myklebust wrote:
> >
> > > On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote:
> > > > On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote:
> > > > > On 23 Feb 2022, at 16:12, [email protected] wrote:
> > > > >
> > > > > > From: Trond Myklebust <[email protected]>
> > > > > >
> > > > > > Use the change attribute and the first cookie in a
> > > > > > directory
> > > > > > page
> > > > > > cache
> > > > > > entry to validate that the page is up to date.
> > > > > >
> > > > > > Suggested-by: Benjamin Coddington <[email protected]>
> > > > > > Signed-off-by: Trond Myklebust
> > > > > > <[email protected]>
> > > > > > ---
> > > > > > fs/nfs/dir.c | 68
> > > > > > ++++++++++++++++++++++++++++------------------------
> > > > > > 1 file changed, 37 insertions(+), 31 deletions(-)
> > > > > >
> > > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > > > > index f2258e926df2..5d9367d9b651 100644
> > > > > > --- a/fs/nfs/dir.c
> > > > > > +++ b/fs/nfs/dir.c
> > > > > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
> > > > > > };
> > > > > >
> > > > > > struct nfs_cache_array {
> > > > > > + u64 change_attr;
> > > > > > u64 last_cookie;
> > > > > > unsigned int size;
> > > > > > unsigned char page_full : 1,
> > > > > > @@ -175,7 +176,8 @@ static void
> > > > > > nfs_readdir_array_init(struct
> > > > > > nfs_cache_array *array)
> > > > > > memset(array, 0, sizeof(struct nfs_cache_array));
> > > > > > }
> > > > > >
> > > > > > -static void nfs_readdir_page_init_array(struct page *page,
> > > > > > u64
> > > > > > last_cookie)
> > > > > > +static void nfs_readdir_page_init_array(struct page *page,
> > > > > > u64
> > > > > > last_cookie,
> > > > > > + u64
> > > > > > change_attr)
> > > > > > {
> > > > > > struct nfs_cache_array *array;
> > > > >
> > > > >
> > > > > There's a hunk missing here, something like:
> > > > >
> > > > > @@ -185,6 +185,7 @@ static void
> > > > > nfs_readdir_page_init_array(struct
> > > > > page
> > > > > *page, u64 last_cookie,
> > > > > nfs_readdir_array_init(array);
> > > > > array->last_cookie = last_cookie;
> > > > > array->cookies_are_ordered = 1;
> > > > > + array->change_attr = change_attr;
> > > > > kunmap_atomic(array);
> > > > > }
> > > > >
> > > > > >
> > > > > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64
> > > > > > last_cookie,
> > > > > > gfp_t gfp_flags)
> > > > > > {
> > > > > > struct page *page = alloc_page(gfp_flags);
> > > > > > if (page)
> > > > > > - nfs_readdir_page_init_array(page,
> > > > > > last_cookie);
> > > > > > + nfs_readdir_page_init_array(page,
> > > > > > last_cookie,
> > > > > > 0);
> > > > > > return page;
> > > > > > }
> > > > > >
> > > > > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct
> > > > > > nfs_entry
> > > > > > *entry, struct page *page)
> > > > > > return ret;
> > > > > > }
> > > > > >
> > > > > > +static bool nfs_readdir_page_cookie_match(struct page
> > > > > > *page,
> > > > > > u64
> > > > > > last_cookie,
> > > > > > +
> > > > > > u64 change_attr)
> > > > >
> > > > > How about "nfs_readdir_page_valid()"? There's more going on
> > > > > than a
> > > > > cookie match.
> > > > >
> > > > >
> > > > > > +{
> > > > > > + struct nfs_cache_array *array = kmap_atomic(page);
> > > > > > + int ret = true;
> > > > > > +
> > > > > > + if (array->change_attr != change_attr)
> > > > > > + ret = false;
> > > > >
> > > > > Can we skip the next test if ret = false?
> > > >
> > > > I'd expect the compiler to do that.
> > > >
> > > > >
> > > > > > + if (array->size > 0 && array->array[0].cookie !=
> > > > > > last_cookie)
> > > > > > + ret = false;
> > > > > > + kunmap_atomic(array);
> > > > > > + return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static void nfs_readdir_page_unlock_and_put(struct page
> > > > > > *page)
> > > > > > +{
> > > > > > + unlock_page(page);
> > > > > > + put_page(page);
> > > > > > +}
> > > > > > +
> > > > > > static struct page *nfs_readdir_page_get_locked(struct
> > > > > > address_space
> > > > > > *mapping,
> > > > > > pgoff_t
> > > > > > index,
> > > > > > u64
> > > > > > last_cookie)
> > > > > > {
> > > > > > struct page *page;
> > > > > > + u64 change_attr;
> > > > > >
> > > > > > page = grab_cache_page(mapping, index);
> > > > > > - if (page && !PageUptodate(page)) {
> > > > > > - nfs_readdir_page_init_array(page,
> > > > > > last_cookie);
> > > > > > - if
> > > > > > (invalidate_inode_pages2_range(mapping, index
> > > > > > +
> > > > > > 1, -1) < 0)
> > > > > > - nfs_zap_mapping(mapping->host,
> > > > > > mapping);
> > > > > > - SetPageUptodate(page);
> > > > > > + if (!page)
> > > > > > + return NULL;
> > > > > > + change_attr =
> > > > > > inode_peek_iversion_raw(mapping->host);
> > > > > > + if (PageUptodate(page)) {
> > > > > > + if
> > > > > > (nfs_readdir_page_cookie_match(page,
> > > > > > last_cookie,
> > > > > > +
> > > > > > change_attr))
> > > > > > + return page;
> > > > > > + nfs_readdir_clear_array(page);
> > > > >
> > > > >
> > > > > Why use i_version rather than nfs_save_change_attribute?
> > > > > Seems having a consistent value across the pagecache and
> > > > > dir_verifiers would help debugging, and we already have a
> > > > > bunch of machinery around the change_attribute.
> > > >
> > > > The directory cache_change_attribute is not reported in
> > > > tracepoints
> > > > because it is a directory-specific field, so it's not as useful
> > > > for
> > > > debugging.
> > > >
> > > > The inode change attribute is what we have traditionally used
> > > > for
> > > > determining cache consistency, and when to invalidate the
> > > > cache.
> > >
> > > I should probably elaborate a little further on the differences
> > > between
> > > the inode change attribute and the cache_change_attribute.
> > >
> > > One of the main reasons for introducing the latter was to have
> > > something that allows us to track changes to the directory, but
> > > to
> > > avoid forcing unnecessary revalidations of the dcache.
> > >
> > > What this means is that when we create or remove a file, and the
> > > pre/post-op attributes tell us that there were no third party
> > > changes
> > > to the directory, we update the dcache, but we do _not_ update
> > > the
> > > cache_change_attribute, because we know that the rest of the
> > > directory
> > > contents are valid, and so we don't have to revalidate the
> > > dentries.
> > > However in that case, we _do_ want to update the readdir cache to
> > > reflect the fact that an entry was added or deleted. While we
> > > could
> > > figure out how to remove an entry (at least for the case where
> > > the
> > > filesystem is case-sensitive), we do not know where the
> > > filesystem
> > > added the new file, or what cookies were assigned.
> > >
> > > This is why the inode change attribute is more appropriate for
> > > indexing
> > > the page cache pages. It reflects the cases where we want to
> > > revalidate
> > > the readdir cache, as opposed to the dcache.
> >
> > Ok, thanks for explaining this.
> >
> > I've noticed that you haven't responded about my concerns about not
> > checking
> > the directory for changes with every v4 READDIR. For v3, we have
> > post-op
> > updates to the directory, but with v4 the directory can change and
> > we'll
> > end up with entries in the cache that are marked with an old
> > change_attr.
> >
>
> Then they will be rejected by nfs_readdir_page_cookie_match() if a
> user
> looks up that page again after we've revalidated the change attribute
> on the directory.
>
> ...and note that NFSv4 does return a struct change_info4 for all
> operations that change the directory, so we will update the change
> attribute in all those cases.
>
> If the change is made on the server, well then we will detect it
> through the standard revalidation process that usually decides when
> to
> invalidate the directory page cache.
>
> > I'm pretty positive that not checking for changes to the directory
> > (not sending GETATTR with READDIR) is going to create cases of
> > double-listed and truncated listings for directory listers. Not
> > handling those cases means I'm going to have some very unhappy
> > customers that complain about their files disappearing/reappearing
> > on NFS.
> >
> > If you need me to prove that it's an issue, I can take the time to
> > write up a program that shows this problem.
> >
>
> If you label the page contents with an attribute that was retrieved
> _after_ the READDIR op, then you will introduce this as a problem for
> your customers.
>
> The reason is that there is no atomicity between operations in a
> COMPOUND. Worse, the implementation of readdir in scalable modern
> systems, including Linux, does not even guarantee atomicity of the
> readdir operation itself. Instead each readdir entry is filled
> without
> holding any locks or preventing any changes to the directory or to
> the
> object itself.
>
> POSIX states very explicitly that if you're making changes to the
> directory after the call to opendir() or rewinddir(), then the
> behaviour w.r.t. whether that file appears in the readdir() call is
> unspecified. See
> https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
>
> This is also consistent with how glibc caches the results of a
> getdents() call.
>
Ah, wait a minute...
There is a problem with the call to nfs_readdir_page_get_next(). It
will allocate the page _after_ the readdir call itself, and so might
label it with a newer change attribute... I'll fix that so we can pass
in the change attribute associated with the readdir call.
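Roughly, a sketch of the idea (illustrative only, not the actual fix;
the extra change_attr argument is an assumption on top of the quoted
series):

/*
 * Sketch: thread the change attribute that was sampled before the
 * READDIR call down to the page allocation, instead of peeking at the
 * inode after the reply has arrived.
 */
static struct page *
nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags,
			     u64 change_attr)
{
	struct page *page = alloc_page(gfp_flags);

	if (page)
		nfs_readdir_page_init_array(page, last_cookie, change_attr);
	return page;
}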
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 24 Feb 2022, at 22:51, Trond Myklebust wrote:
> On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote:
>> On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote:
>>> On 23 Feb 2022, at 16:12, [email protected] wrote:
>>>
>>>> From: Trond Myklebust <[email protected]>
>>>>
>>>> Use the change attribute and the first cookie in a directory page
>>>> cache
>>>> entry to validate that the page is up to date.
>>>>
>>>> Suggested-by: Benjamin Coddington <[email protected]>
>>>> Signed-off-by: Trond Myklebust <[email protected]>
>>>> ---
>>>> fs/nfs/dir.c | 68
>>>> ++++++++++++++++++++++++++++------------------------
>>>> 1 file changed, 37 insertions(+), 31 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
>>>> index f2258e926df2..5d9367d9b651 100644
>>>> --- a/fs/nfs/dir.c
>>>> +++ b/fs/nfs/dir.c
>>>> @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
>>>> };
>>>>
>>>> struct nfs_cache_array {
>>>> + u64 change_attr;
>>>> u64 last_cookie;
>>>> unsigned int size;
>>>> unsigned char page_full : 1,
>>>> @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct
>>>> nfs_cache_array *array)
>>>> memset(array, 0, sizeof(struct nfs_cache_array));
>>>> }
>>>>
>>>> -static void nfs_readdir_page_init_array(struct page *page, u64
>>>> last_cookie)
>>>> +static void nfs_readdir_page_init_array(struct page *page, u64
>>>> last_cookie,
>>>> + u64
>>>> change_attr)
>>>> {
>>>> struct nfs_cache_array *array;
>>>
>>>
>>> There's a hunk missing here, something like:
>>>
>>> @@ -185,6 +185,7 @@ static void nfs_readdir_page_init_array(struct
>>> page
>>> *page, u64 last_cookie,
>>> nfs_readdir_array_init(array);
>>> array->last_cookie = last_cookie;
>>> array->cookies_are_ordered = 1;
>>> + array->change_attr = change_attr;
>>> kunmap_atomic(array);
>>> }
>>>
>>>>
>>>> @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie,
>>>> gfp_t gfp_flags)
>>>> {
>>>> struct page *page = alloc_page(gfp_flags);
>>>> if (page)
>>>> - nfs_readdir_page_init_array(page,
>>>> last_cookie);
>>>> + nfs_readdir_page_init_array(page,
>>>> last_cookie,
>>>> 0);
>>>> return page;
>>>> }
>>>>
>>>> @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct
>>>> nfs_entry
>>>> *entry, struct page *page)
>>>> return ret;
>>>> }
>>>>
>>>> +static bool nfs_readdir_page_cookie_match(struct page *page, u64
>>>> last_cookie,
>>>> +
>>>> u64 change_attr)
>>>
>>> How about "nfs_readdir_page_valid()"? There's more going on than a
>>> cookie match.
>>>
>>>
>>>> +{
>>>> + struct nfs_cache_array *array = kmap_atomic(page);
>>>> + int ret = true;
>>>> +
>>>> + if (array->change_attr != change_attr)
>>>> + ret = false;
>>>
>>> Can we skip the next test if ret = false?
>>
>> I'd expect the compiler to do that.
>>
>>>
>>>> + if (array->size > 0 && array->array[0].cookie !=
>>>> last_cookie)
>>>> + ret = false;
>>>> + kunmap_atomic(array);
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static void nfs_readdir_page_unlock_and_put(struct page *page)
>>>> +{
>>>> + unlock_page(page);
>>>> + put_page(page);
>>>> +}
>>>> +
>>>> static struct page *nfs_readdir_page_get_locked(struct
>>>> address_space
>>>> *mapping,
>>>> pgoff_t
>>>> index,
>>>> u64
>>>> last_cookie)
>>>> {
>>>> struct page *page;
>>>> + u64 change_attr;
>>>>
>>>> page = grab_cache_page(mapping, index);
>>>> - if (page && !PageUptodate(page)) {
>>>> - nfs_readdir_page_init_array(page,
>>>> last_cookie);
>>>> - if
>>>> (invalidate_inode_pages2_range(mapping, index
>>>> +
>>>> 1, -1) < 0)
>>>> - nfs_zap_mapping(mapping->host,
>>>> mapping);
>>>> - SetPageUptodate(page);
>>>> + if (!page)
>>>> + return NULL;
>>>> + change_attr =
>>>> inode_peek_iversion_raw(mapping->host);
>>>> + if (PageUptodate(page)) {
>>>> + if
>>>> (nfs_readdir_page_cookie_match(page,
>>>> last_cookie,
>>>> +
>>>> change_attr))
>>>> + return page;
>>>> + nfs_readdir_clear_array(page);
>>>
>>>
>>> Why use i_version rather than nfs_save_change_attribute? Seems
>>> having a consistent value across the pagecache and dir_verifiers
>>> would help debugging, and we already have a bunch of machinery
>>> around the change_attribute.
>>
>> The directory cache_change_attribute is not reported in tracepoints
>> because it is a directory-specific field, so it's not as useful for
>> debugging.
>>
>> The inode change attribute is what we have traditionally used for
>> determining cache consistency, and when to invalidate the cache.
>
> I should probably elaborate a little further on the differences
> between
> the inode change attribute and the cache_change_attribute.
>
> One of the main reasons for introducing the latter was to have
> something that allows us to track changes to the directory, but to
> avoid forcing unnecessary revalidations of the dcache.
>
> What this means is that when we create or remove a file, and the
> pre/post-op attributes tell us that there were no third party changes
> to the directory, we update the dcache, but we do _not_ update the
> cache_change_attribute, because we know that the rest of the directory
> contents are valid, and so we don't have to revalidate the dentries.
> However in that case, we _do_ want to update the readdir cache to
> reflect the fact that an entry was added or deleted. While we could
> figure out how to remove an entry (at least for the case where the
> filesystem is case-sensitive), we do not know where the filesystem
> added the new file, or what cookies were assigned.
>
> This is why the inode change attribute is more appropriate for
> indexing
> the page cache pages. It reflects the cases where we want to
> revalidate
> the readdir cache, as opposed to the dcache.
Ok, thanks for explaining this.
I've noticed that you haven't responded about my concerns about not
checking the directory for changes with every v4 READDIR. For v3, we
have post-op updates to the directory, but with v4 the directory can
change and we'll end up with entries in the cache that are marked with
an old change_attr.
I'm pretty positive that not checking for changes to the directory (not
sending GETATTR with READDIR) is going to create cases of double-listed
and truncated listings for directory listers. Not handling those cases
means I'm going to have some very unhappy customers that complain about
their files disappearing/reappearing on NFS.
If you need me to prove that it's an issue, I can take the time to write
up a program that shows this problem.
Ben
On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote:
> On 24 Feb 2022, at 22:51, Trond Myklebust wrote:
>
> > On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote:
> > > On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote:
> > > > On 23 Feb 2022, at 16:12, [email protected] wrote:
> > > >
> > > > > From: Trond Myklebust <[email protected]>
> > > > >
> > > > > Use the change attribute and the first cookie in a directory
> > > > > page
> > > > > cache
> > > > > entry to validate that the page is up to date.
> > > > >
> > > > > Suggested-by: Benjamin Coddington <[email protected]>
> > > > > Signed-off-by: Trond Myklebust
> > > > > <[email protected]>
> > > > > ---
> > > > > fs/nfs/dir.c | 68
> > > > > ++++++++++++++++++++++++++++------------------------
> > > > > 1 file changed, 37 insertions(+), 31 deletions(-)
> > > > >
> > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > > > index f2258e926df2..5d9367d9b651 100644
> > > > > --- a/fs/nfs/dir.c
> > > > > +++ b/fs/nfs/dir.c
> > > > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
> > > > > };
> > > > >
> > > > > struct nfs_cache_array {
> > > > > + u64 change_attr;
> > > > > u64 last_cookie;
> > > > > unsigned int size;
> > > > > unsigned char page_full : 1,
> > > > > @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct
> > > > > nfs_cache_array *array)
> > > > > memset(array, 0, sizeof(struct nfs_cache_array));
> > > > > }
> > > > >
> > > > > -static void nfs_readdir_page_init_array(struct page *page,
> > > > > u64
> > > > > last_cookie)
> > > > > +static void nfs_readdir_page_init_array(struct page *page,
> > > > > u64
> > > > > last_cookie,
> > > > > + u64
> > > > > change_attr)
> > > > > {
> > > > > struct nfs_cache_array *array;
> > > >
> > > >
> > > > There's a hunk missing here, something like:
> > > >
> > > > @@ -185,6 +185,7 @@ static void
> > > > nfs_readdir_page_init_array(struct
> > > > page
> > > > *page, u64 last_cookie,
> > > > nfs_readdir_array_init(array);
> > > > array->last_cookie = last_cookie;
> > > > array->cookies_are_ordered = 1;
> > > > + array->change_attr = change_attr;
> > > > kunmap_atomic(array);
> > > > }
> > > >
> > > > >
> > > > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64
> > > > > last_cookie,
> > > > > gfp_t gfp_flags)
> > > > > {
> > > > > struct page *page = alloc_page(gfp_flags);
> > > > > if (page)
> > > > > - nfs_readdir_page_init_array(page,
> > > > > last_cookie);
> > > > > + nfs_readdir_page_init_array(page,
> > > > > last_cookie,
> > > > > 0);
> > > > > return page;
> > > > > }
> > > > >
> > > > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct
> > > > > nfs_entry
> > > > > *entry, struct page *page)
> > > > > return ret;
> > > > > }
> > > > >
> > > > > +static bool nfs_readdir_page_cookie_match(struct page *page,
> > > > > u64
> > > > > last_cookie,
> > > > > +
> > > > > u64 change_attr)
> > > >
> > > > How about "nfs_readdir_page_valid()"? There's more going on
> > > > than a
> > > > cookie match.
> > > >
> > > >
> > > > > +{
> > > > > + struct nfs_cache_array *array = kmap_atomic(page);
> > > > > + int ret = true;
> > > > > +
> > > > > + if (array->change_attr != change_attr)
> > > > > + ret = false;
> > > >
> > > > Can we skip the next test if ret = false?
> > >
> > > I'd expect the compiler to do that.
> > >
> > > >
> > > > > + if (array->size > 0 && array->array[0].cookie !=
> > > > > last_cookie)
> > > > > + ret = false;
> > > > > + kunmap_atomic(array);
> > > > > + return ret;
> > > > > +}
> > > > > +
> > > > > +static void nfs_readdir_page_unlock_and_put(struct page
> > > > > *page)
> > > > > +{
> > > > > + unlock_page(page);
> > > > > + put_page(page);
> > > > > +}
> > > > > +
> > > > > static struct page *nfs_readdir_page_get_locked(struct
> > > > > address_space
> > > > > *mapping,
> > > > > pgoff_t
> > > > > index,
> > > > > u64
> > > > > last_cookie)
> > > > > {
> > > > > struct page *page;
> > > > > + u64 change_attr;
> > > > >
> > > > > page = grab_cache_page(mapping, index);
> > > > > - if (page && !PageUptodate(page)) {
> > > > > - nfs_readdir_page_init_array(page,
> > > > > last_cookie);
> > > > > - if
> > > > > (invalidate_inode_pages2_range(mapping, index
> > > > > +
> > > > > 1, -1) < 0)
> > > > > - nfs_zap_mapping(mapping->host,
> > > > > mapping);
> > > > > - SetPageUptodate(page);
> > > > > + if (!page)
> > > > > + return NULL;
> > > > > + change_attr =
> > > > > inode_peek_iversion_raw(mapping->host);
> > > > > + if (PageUptodate(page)) {
> > > > > + if
> > > > > (nfs_readdir_page_cookie_match(page,
> > > > > last_cookie,
> > > > > +
> > > > > change_attr))
> > > > > + return page;
> > > > > + nfs_readdir_clear_array(page);
> > > >
> > > >
> > > > Why use i_version rather than nfs_save_change_attribute? Seems
> > > > having a consistent value across the pagecache and dir_verifiers
> > > > would help debugging, and we already have a bunch of machinery
> > > > around the change_attribute.
> > >
> > > The directory cache_change_attribute is not reported in
> > > tracepoints
> > > because it is a directory-specific field, so it's not as useful
> > > for
> > > debugging.
> > >
> > > The inode change attribute is what we have traditionally used for
> > > determining cache consistency, and when to invalidate the cache.
> >
> > I should probably elaborate a little further on the differences
> > between
> > the inode change attribute and the cache_change_attribute.
> >
> > One of the main reasons for introducing the latter was to have
> > something that allows us to track changes to the directory, but to
> > avoid forcing unnecessary revalidations of the dcache.
> >
> > What this means is that when we create or remove a file, and the
> > pre/post-op attributes tell us that there were no third party
> > changes
> > to the directory, we update the dcache, but we do _not_ update the
> > cache_change_attribute, because we know that the rest of the
> > directory
> > contents are valid, and so we don't have to revalidate the
> > dentries.
> > However in that case, we _do_ want to update the readdir cache to
> > reflect the fact that an entry was added or deleted. While we could
> > figure out how to remove an entry (at least for the case where the
> > filesystem is case-sensitive), we do not know where the filesystem
> > added the new file, or what cookies were assigned.
> >
> > This is why the inode change attribute is more appropriate for
> > indexing
> > the page cache pages. It reflects the cases where we want to
> > revalidate
> > the readdir cache, as opposed to the dcache.
>
> Ok, thanks for explaining this.
>
> I've noticed that you haven't responded about my concerns about not
> checking
> the directory for changes with every v4 READDIR. For v3, we have
> post-op
> updates to the directory, but with v4 the directory can change and
> we'll
> end up with entries in the cache that are marked with an old
> change_attr.
>
Then they will be rejected by nfs_readdir_page_cookie_match() if a user
looks up that page again after we've revalidated the change attribute
on the directory.
...and note that NFSv4 does return a struct change_info4 for all
operations that change the directory, so we will update the change
attribute in all those cases.
If the change is made on the server, well then we will detect it
through the standard revalidation process that usually decides when to
invalidate the directory page cache.
> I'm pretty positive that not checking for changes to the directory
> (not sending GETATTR with READDIR) is going to create cases of
> double-listed and truncated listings for directory listers. Not
> handling those cases means I'm going to have some very unhappy
> customers that complain about their files disappearing/reappearing
> on NFS.
>
> If you need me to prove that it's an issue, I can take the time to
> write up a program that shows this problem.
>
If you label the page contents with an attribute that was retrieved
_after_ the READDIR op, then you will introduce this as a problem for
your customers.
The reason is that there is no atomicity between operations in a
COMPOUND. Worse, the implementation of readdir in scalable modern
systems, including Linux, does not even guarantee atomicity of the
readdir operation itself. Instead each readdir entry is filled without
holding any locks or preventing any changes to the directory or to the
object itself.
POSIX states very explicitly that if you're making changes to the
directory after the call to opendir() or rewinddir(), then the
behaviour w.r.t. whether that file appears in the readdir() call is
unspecified. See
https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
This is also consistent with how glibc caches the results of a
getdents() call.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 25 Feb 2022, at 10:34, Benjamin Coddington wrote:
> Ok, so I'm reading that further proof is required, and I'm happy to do
> the
> work. Thanks for the replies here and elsewhere.
Here's an example of this problem on a tmpfs export using v8 of your
patchset with the fix to set the change_attr in
nfs_readdir_page_init_array().
I'm using tmpfs, because it reliably orders cookies in reverse order of
creation (or perhaps sorted by name).
The program drives both the client-side and server-side - so on this one
system, /exports/tmpfs is:
tmpfs /exports/tmpfs tmpfs rw,seclabel,relatime,size=102400k 0 0
and /mnt/localhost is:
localhost:/exports/tmpfs /mnt/localhost/tmpfs nfs4
rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=none,addr=127.0.0.1
0 0
The program creates 256 files on the server, walks through them once on
the client, deletes the last 127 on the server, drops the first page
from the pagecache, and walks through them again on the client.
The second listing produces 124 duplicate entries.
I just have to say again: this behavior is _new_ (but not new to me),
and it is absolutely going to crop up on our customer's systems that
are walking through millions of directory entries on loaded servers
under memory pressure. The directory listings as a whole become very
likely to be nonsense at random times. I realize they are not
/supposed/ to be coherent, but what we're getting here is going to be
far far less coherent, and it's going to be a mess.
There are other scenarios that are worse when the cookies aren't
ordered: you can end up with EOF, or get into repeating patterns.
Please compare this with v3, and before this patchset, and tell me if
I'm not justified playing chicken little.
Here's what I do to run this:
mount -t tmpfs -osize=100M tmpfs /exports/tmpfs/
exportfs -ofsid=0 *:/exports
exportfs -ofsid=1 *:/exports/tmpfs
mount -t nfs -ov4.1,sec=sys localhost:/exports /mnt/localhost
./getdents2
Compare "Listing 1" with "Listing 2".
I would also do a "rm -f /export/tmpfs/*" between each run.
Thanks again for your time and work.
Ben
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <string.h>
#define NFSDIR "/mnt/localhost/tmpfs"
#define LOCDIR "/exports/tmpfs"
#define BUF_SIZE 4096
int main(int argc, char **argv)
{
	int i, dir_fd, bpos, total = 0;
	size_t nread;
	struct linux_dirent {
		long d_ino;
		off_t d_off;
		unsigned short d_reclen;
		char d_name[];
	};
	struct linux_dirent *d;
	char buf[BUF_SIZE];

	/* create files: */
	for (i = 0; i < 256; i++) {
		sprintf(buf, LOCDIR "/file_%03d", i);
		close(open(buf, O_CREAT, 666));
	}

	dir_fd = open(NFSDIR, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC);
	if (dir_fd < 0) {
		perror("cannot open dir");
		return 1;
	}

	while (1) {
		nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE);
		if (nread == 0 || nread == -1)
			break;
		for (bpos = 0; bpos < nread;) {
			d = (struct linux_dirent *) (buf + bpos);
			printf("%s\n", d->d_name);
			total++;
			bpos += d->d_reclen;
		}
	}
	printf("Listing 1: %d total dirents\n", total);

	/* rewind */
	lseek(dir_fd, 0, SEEK_SET);

	/* drop the first page */
	posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED);

	/* delete the last 127 files: */
	for (i = 127; i < 256; i++) {
		sprintf(buf, LOCDIR "/file_%03d", i);
		unlink(buf);
	}

	total = 0;
	while (1) {
		nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE);
		if (nread == 0 || nread == -1)
			break;
		for (bpos = 0; bpos < nread;) {
			d = (struct linux_dirent *) (buf + bpos);
			printf("%s\n", d->d_name);
			total++;
			bpos += d->d_reclen;
		}
	}
	printf("Listing 2: %d total dirents\n", total);

	close(dir_fd);
	return 0;
}
On 25 Feb 2022, at 15:23, Benjamin Coddington wrote:
> int main(int argc, char **argv)
> {
> int i, dir_fd, bpos, total = 0;
> size_t nread;
> struct linux_dirent {
Ugh.. and sorry about the whitespace mess.
On Fri, 2022-02-25 at 06:28 -0500, Benjamin Coddington wrote:
> On 24 Feb 2022, at 21:11, Trond Myklebust wrote:
>
> > On Thu, 2022-02-24 at 09:15 -0500, Benjamin Coddington wrote:
> > > On 23 Feb 2022, at 16:12, [email protected] wrote:
> > >
> > > > From: Trond Myklebust <[email protected]>
> > > >
> > > > Instead of relying on counting the page offsets as we walk
> > > > through
> > > > the
> > > > page cache, switch to calculating them algorithmically.
> > > >
> > > > Signed-off-by: Trond Myklebust
> > > > <[email protected]>
> > > > ---
> > > > fs/nfs/dir.c | 18 +++++++++++++-----
> > > > 1 file changed, 13 insertions(+), 5 deletions(-)
> > > >
> > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > > index 8f17aaebcd77..f2258e926df2 100644
> > > > --- a/fs/nfs/dir.c
> > > > +++ b/fs/nfs/dir.c
> > > > @@ -248,17 +248,20 @@ static const char
> > > > *nfs_readdir_copy_name(const
> > > > char *name, unsigned int len)
> > > > return ret;
> > > > }
> > > >
> > > > +static size_t nfs_readdir_array_maxentries(void)
> > > > +{
> > > > + return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
> > > > + sizeof(struct nfs_cache_array_entry);
> > > > +}
> > > > +
> > >
> > > Why the choice to use a runtime function call rather than the
> > > compiler's
> > > calculation? I suspect that the end result is the same, as the
> > > compiler
> > > will optimize it away, but I'm curious if there's a good reason
> > > for
> > > this.
> > >
> >
> > The comparison is more efficient because no pointer arithmetic is
> > needed. As you said, the above function always evaluates to a
> > constant,
> > and the array->size has been pre-calculated.
>
> Comparisons are more efficient than using something like this?:
>
> static const int nfs_readdir_array_maxentries =
> (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
> sizeof(struct nfs_cache_array_entry);
>
> I don't understand why, I must admit. I'm not saying it should be
> changed,
> I'm just trying to figure out the reason for the function declaration
> when
> the value is a constant, and I thought there was a hole in my head.
>
Unless we're talking about a compiler from the 1960s, there is little
difference between the two proposals. Any modern C compiler worth its
salt will know to inline the numeric value into the comparison (and
will optimise away the storage of your variable).
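For instance, a tiny stand-alone check along these lines (with
simplified stand-in struct layouts, not the real fs/nfs/dir.c
definitions) shows that both spellings reduce to the same constant:

#include <assert.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

/* Simplified stand-ins for the real structures */
struct nfs_cache_array_entry {
	unsigned long cookie;
	unsigned long ino;
	const char *name;
	unsigned int name_len;
	unsigned char d_type;
};

struct nfs_cache_array {
	unsigned long last_cookie;
	unsigned int size;
	unsigned char page_full;
	struct nfs_cache_array_entry array[];
};

static size_t nfs_readdir_array_maxentries(void)
{
	return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
	       sizeof(struct nfs_cache_array_entry);
}

static const size_t nfs_readdir_array_maxentries_const =
	(PAGE_SIZE - sizeof(struct nfs_cache_array)) /
	sizeof(struct nfs_cache_array_entry);

int main(void)
{
	/* Either form folds to the same immediate in the comparison. */
	assert(nfs_readdir_array_maxentries() ==
	       nfs_readdir_array_maxentries_const);
	return 0;
}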
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 25 Feb 2022, at 8:10, Trond Myklebust wrote:
> On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote:
>> On 24 Feb 2022, at 22:51, Trond Myklebust wrote:
>>
>>> On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote:
>>>> On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote:
>>>>> On 23 Feb 2022, at 16:12, [email protected] wrote:
>>>>>
>>>>>> From: Trond Myklebust <[email protected]>
>>>>>>
>>>>>> Use the change attribute and the first cookie in a directory
>>>>>> page
>>>>>> cache
>>>>>> entry to validate that the page is up to date.
>>>>>>
>>>>>> Suggested-by: Benjamin Coddington <[email protected]>
>>>>>> Signed-off-by: Trond Myklebust
>>>>>> <[email protected]>
>>>>>> ---
>>>>>> fs/nfs/dir.c | 68
>>>>>> ++++++++++++++++++++++++++++------------------------
>>>>>> 1 file changed, 37 insertions(+), 31 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
>>>>>> index f2258e926df2..5d9367d9b651 100644
>>>>>> --- a/fs/nfs/dir.c
>>>>>> +++ b/fs/nfs/dir.c
>>>>>> @@ -139,6 +139,7 @@ struct nfs_cache_array_entry {
>>>>>> };
>>>>>>
>>>>>> struct nfs_cache_array {
>>>>>> + u64 change_attr;
>>>>>> u64 last_cookie;
>>>>>> unsigned int size;
>>>>>> unsigned char page_full : 1,
>>>>>> @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct
>>>>>> nfs_cache_array *array)
>>>>>> memset(array, 0, sizeof(struct nfs_cache_array));
>>>>>> }
>>>>>>
>>>>>> -static void nfs_readdir_page_init_array(struct page *page,
>>>>>> u64
>>>>>> last_cookie)
>>>>>> +static void nfs_readdir_page_init_array(struct page *page,
>>>>>> u64
>>>>>> last_cookie,
>>>>>> + u64
>>>>>> change_attr)
>>>>>> {
>>>>>> struct nfs_cache_array *array;
>>>>>
>>>>>
>>>>> There's a hunk missing here, something like:
>>>>>
>>>>> @@ -185,6 +185,7 @@ static void
>>>>> nfs_readdir_page_init_array(struct
>>>>> page
>>>>> *page, u64 last_cookie,
>>>>> nfs_readdir_array_init(array);
>>>>> array->last_cookie = last_cookie;
>>>>> array->cookies_are_ordered = 1;
>>>>> + array->change_attr = change_attr;
>>>>> kunmap_atomic(array);
>>>>> }
>>>>>
>>>>>>
>>>>>> @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64
>>>>>> last_cookie,
>>>>>> gfp_t gfp_flags)
>>>>>> {
>>>>>> struct page *page = alloc_page(gfp_flags);
>>>>>> if (page)
>>>>>> - nfs_readdir_page_init_array(page,
>>>>>> last_cookie);
>>>>>> + nfs_readdir_page_init_array(page,
>>>>>> last_cookie,
>>>>>> 0);
>>>>>> return page;
>>>>>> }
>>>>>>
>>>>>> @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct
>>>>>> nfs_entry
>>>>>> *entry, struct page *page)
>>>>>> return ret;
>>>>>> }
>>>>>>
>>>>>> +static bool nfs_readdir_page_cookie_match(struct page *page,
>>>>>> u64
>>>>>> last_cookie,
>>>>>> +
>>>>>> u64 change_attr)
>>>>>
>>>>> How about "nfs_readdir_page_valid()"? There's more going on
>>>>> than a
>>>>> cookie match.
>>>>>
>>>>>
>>>>>> +{
>>>>>> + struct nfs_cache_array *array = kmap_atomic(page);
>>>>>> + int ret = true;
>>>>>> +
>>>>>> + if (array->change_attr != change_attr)
>>>>>> + ret = false;
>>>>>
>>>>> Can we skip the next test if ret = false?
>>>>
>>>> I'd expect the compiler to do that.
>>>>
>>>>>
>>>>>> + if (array->size > 0 && array->array[0].cookie !=
>>>>>> last_cookie)
>>>>>> + ret = false;
>>>>>> + kunmap_atomic(array);
>>>>>> + return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static void nfs_readdir_page_unlock_and_put(struct page
>>>>>> *page)
>>>>>> +{
>>>>>> + unlock_page(page);
>>>>>> + put_page(page);
>>>>>> +}
>>>>>> +
>>>>>> static struct page *nfs_readdir_page_get_locked(struct
>>>>>> address_space
>>>>>> *mapping,
>>>>>> pgoff_t
>>>>>> index,
>>>>>> u64
>>>>>> last_cookie)
>>>>>> {
>>>>>> struct page *page;
>>>>>> + u64 change_attr;
>>>>>>
>>>>>> page = grab_cache_page(mapping, index);
>>>>>> - if (page && !PageUptodate(page)) {
>>>>>> - nfs_readdir_page_init_array(page,
>>>>>> last_cookie);
>>>>>> - if
>>>>>> (invalidate_inode_pages2_range(mapping, index
>>>>>> +
>>>>>> 1, -1) < 0)
>>>>>> - nfs_zap_mapping(mapping->host,
>>>>>> mapping);
>>>>>> - SetPageUptodate(page);
>>>>>> + if (!page)
>>>>>> + return NULL;
>>>>>> + change_attr =
>>>>>> inode_peek_iversion_raw(mapping->host);
>>>>>> + if (PageUptodate(page)) {
>>>>>> + if
>>>>>> (nfs_readdir_page_cookie_match(page,
>>>>>> last_cookie,
>>>>>> +
>>>>>> change_attr))
>>>>>> + return page;
>>>>>> + nfs_readdir_clear_array(page);
>>>>>
>>>>>
>>>>> Why use i_version rather than nfs_save_change_attribute? Seems
>>>>> having a consistent value across the pagecache and dir_verifiers
>>>>> would help debugging, and we already have a bunch of machinery
>>>>> around the change_attribute.
>>>>
>>>> The directory cache_change_attribute is not reported in
>>>> tracepoints
>>>> because it is a directory-specific field, so it's not as useful
>>>> for
>>>> debugging.
>>>>
>>>> The inode change attribute is what we have traditionally used for
>>>> determining cache consistency, and when to invalidate the cache.
>>>
>>> I should probably elaborate a little further on the differences
>>> between
>>> the inode change attribute and the cache_change_attribute.
>>>
>>> One of the main reasons for introducing the latter was to have
>>> something that allows us to track changes to the directory, but to
>>> avoid forcing unnecessary revalidations of the dcache.
>>>
>>> What this means is that when we create or remove a file, and the
>>> pre/post-op attributes tell us that there were no third party
>>> changes
>>> to the directory, we update the dcache, but we do _not_ update the
>>> cache_change_attribute, because we know that the rest of the
>>> directory
>>> contents are valid, and so we don't have to revalidate the
>>> dentries.
>>> However in that case, we _do_ want to update the readdir cache to
>>> reflect the fact that an entry was added or deleted. While we could
>>> figure out how to remove an entry (at least for the case where the
>>> filesystem is case-sensitive), we do not know where the filesystem
>>> added the new file, or what cookies were assigned.
>>>
>>> This is why the inode change attribute is more appropriate for
>>> indexing
>>> the page cache pages. It reflects the cases where we want to
>>> revalidate
>>> the readdir cache, as opposed to the dcache.
>>
>> Ok, thanks for explaining this.
>>
>> I've noticed that you haven't responded about my concerns about not
>> checking
>> the directory for changes with every v4 READDIR. For v3, we have
>> post-op
>> updates to the directory, but with v4 the directory can change and
>> we'll
>> end up with entries in the cache that are marked with an old
>> change_attr.
>>
>
> Then they will be rejected by nfs_readdir_page_cookie_match() if a
> user
> looks up that page again after we've revalidated the change attribute
> on the directory.
>
> ...and note that NFSv4 does return a struct change_info4 for all
> operations that change the directory, so we will update the change
> attribute in all those cases.
I'm not worried about changes from the same client.
> If the change is made on the server, well then we will detect it
> through the standard revalidation process that usually decides when to
> invalidate the directory page cache.
The environments I'm concerned about are set up very frequently: they
look like multiple NFS clients co-ordinating on a directory with
millions of files. Some clients are adding files as they do work, other
clients are then looking for those files by walking the directory
entries to validate their existence. The systems that do this have a
"very bad time" if some of them produce listings that are
_dramatically_ and transiently different from a listing they produced
before.
That can happen really easily with what we've got here, and it can
create a huge problem for these setups. And it won't be easily
reproducible, and it will be hard to find. It will cost everyone
involved a lot of time and effort to track down, and we can fix it
easily.
>> I'm pretty positive that not checking for changes to the directory
>> (not sending GETATTR with READDIR) is going to create cases of
>> double-listed and truncated listings for directory listers. Not
>> handling those cases means I'm going to have some very unhappy
>> customers that complain about their files disappearing/reappearing
>> on NFS.
>>
>> If you need me to prove that it's an issue, I can take the time to
>> write up a program that shows this problem.
>>
>
> If you label the page contents with an attribute that was retrieved
> _after_ the READDIR op, then you will introduce this as a problem for
> your customers.
No, the problem is already here, we're not introducing it. By labeling
the page contents with every call we're shifting the race window from
the client, where it's a very large window, to the server, where the
window is small. It's still possible, but *much* less likely.
> The reason is that there is no atomicity between operations in a
> COMPOUND. Worse, the implementation of readdir in scalable modern
> systems, including Linux, does not even guarantee atomicity of the
> readdir operation itself. Instead each readdir entry is filled without
> holding any locks or preventing any changes to the directory or to the
> object itself.
I understand all this, but it's not a reason to make the problem worse.
> POSIX states very explicitly that if you're making changes to the
> directory after the call to opendir() or rewinddir(), then the
> behaviour w.r.t. whether that file appears in the readdir() call is
> unspecified. See
> https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
Yes, but again - just because the problem exists doesn't give us reason
to amplify it when we can easily make a better choice for almost no
cost.
Here are my reasons for wanting the GETATTR added:
 - it makes it *much* less likely for this problem to occur, with the
   minor downside of decreased caching for unstable directories.
 - it makes v3 and v4 readdir pagecache behavior consistent WRT
   changing directories.
I spent a non-trivial amount of time working on this problem, and saw
this exact issue appear. It's definitely something that's going to come
back and bite us if we don't fix it.
How can I convince you? I've offered to produce a working example of
this problem. Will you review those results? If I cannot convince you,
I feel I'll have to pursue distro-specific changes for this work.
Ben
On Fri, 2022-02-25 at 15:41 -0500, Trond Myklebust wrote:
> On Fri, 2022-02-25 at 15:23 -0500, Benjamin Coddington wrote:
> > On 25 Feb 2022, at 10:34, Benjamin Coddington wrote:
> > > Ok, so I'm reading that further proof is required, and I'm happy
> > > to
> > > do
> > > the
> > > work. Thanks for the replies here and elsewhere.
> >
> > Here's an example of this problem on a tmpfs export using v8 of
> > your
> > patchset with the fix to set the change_attr in
> > nfs_readdir_page_init_array().
> >
> > I'm using tmpfs, because it reliably orders cookies in reverse
> > order
> > of
> > creation (or perhaps sorted by name).
> >
> > The program drives both the client-side and server-side - so on
> > this
> > one
> > system, /exports/tmpfs is:
> > tmpfs /exports/tmpfs tmpfs rw,seclabel,relatime,size=102400k 0 0
> >
> > and /mnt/localhost is:
> > localhost:/exports/tmpfs /mnt/localhost/tmpfs nfs4
> > rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,hard,pr
> > ot
> > o=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=n
> > on
> > e,addr=127.0.0.1
> > 0 0
> >
> > The program creates 256 files on the server, walks through them
> > once
> > on
> > the
> > client, deletes the last 127 on the server, drops the first page
> > from
> > the
> > pagecache, and walks through them again on the client.
> >
> > The second listing produces 124 duplicate entries.
> >
> > I just have to say again: this behavior is _new_ (but not new to
> > me),
> > and it
> > is absolutely going to crop up on our customer's systems that are
> > walking
> > through millions of directory entries on loaded servers under
> > memory
> > pressure. The directory listings as a whole become very likely to
> > be
> > nonsense at random times. I realize they are not /supposed/ to be
> > coherent,
> > but what we're getting here is going to be far far less coherent,
> > and it's going to be a mess.
> >
> > There are other scenarios that are worse when the cookies aren't
> > ordered,
> > you can end up with EOF, or get into repeating patterns.
> >
> > Please compare this with v3, and before this patchset, and tell me
> > if
> > I'm
> > not justified playing chicken little.
> >
> > Here's what I do to run this:
> >
> > mount -t tmpfs -osize=100M tmpfs /exports/tmpfs/
> > exportfs -ofsid=0 *:/exports
> > exportfs -ofsid=1 *:/exports/tmpfs
> > mount -t nfs -ov4.1,sec=sys localhost:/exports /mnt/localhost
> > ./getdents2
> >
> > Compare "Listing 1" with "Listing 2".
> >
> > I would also do a "rm -f /export/tmpfs/*" between each run.
> >
> > Thanks again for your time and work.
> >
> > Ben
> >
> > #define _GNU_SOURCE
> > #include <stdio.h>
> > #include <unistd.h>
> > #include <fcntl.h>
> > #include <sched.h>
> > #include <sys/types.h>
> > #include <sys/stat.h>
> > #include <sys/syscall.h>
> > #include <string.h>
> >
> > #define NFSDIR "/mnt/localhost/tmpfs"
> > #define LOCDIR "/exports/tmpfs"
> > #define BUF_SIZE 4096
> >
> > int main(int argc, char **argv)
> > {
> > int i, dir_fd, bpos, total = 0;
> > size_t nread;
> > struct linux_dirent {
> > long d_ino;
> > off_t d_off;
> > unsigned short d_reclen;
> > char d_name[];
> > };
> > struct linux_dirent *d;
> > char buf[BUF_SIZE];
> >
> > /* create files: */
> > for (i = 0; i < 256; i++) {
> > sprintf(buf, LOCDIR "/file_%03d", i);
> > close(open(buf, O_CREAT, 666));
> > }
> >
> > dir_fd = open(NFSDIR,
> > O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC);
> > if (dir_fd < 0) {
> > perror("cannot open dir");
> > return 1;
> > }
> >
> > while (1) {
> > nread = syscall(SYS_getdents, dir_fd, buf,
> > BUF_SIZE);
> > if (nread == 0 || nread == -1)
> > break;
> > for (bpos = 0; bpos < nread;) {
> > d = (struct linux_dirent *) (buf + bpos);
> > printf("%s\n", d->d_name);
> > total++;
> > bpos += d->d_reclen;
> > }
> > }
> > printf("Listing 1: %d total dirents\n", total);
> >
> > /* rewind */
> > lseek(dir_fd, 0, SEEK_SET);
> >
> > /* drop the first page */
> > posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED);
> >
> > /* delete the last 127 files: */
> > for (i = 127; i < 256; i++) {
> > sprintf(buf, LOCDIR "/file_%03d", i);
> > unlink(buf);
> > }
> >
> > total = 0;
> > while (1) {
> > nread = syscall(SYS_getdents, dir_fd, buf,
> > BUF_SIZE);
> > if (nread == 0 || nread == -1)
> > break;
> > for (bpos = 0; bpos < nread;) {
> > d = (struct linux_dirent *) (buf + bpos);
> > printf("%s\n", d->d_name);
> > total++;
> > bpos += d->d_reclen;
> > }
> > }
> > printf("Listing 2: %d total dirents\n", total);
> >
> > close(dir_fd);
> > return 0;
> > }
>
>
> tmpfs is broken on the server. It doesn't provide stable cookies, and
> knfsd doesn't use the verifier to tell you that the cookie assignment
> has changed.
>
>
> Re-export of tmpfs has never worked reliably.
What I mean is that tmpfs is always a poor choice for NFS because
seekdir()/telldir() don't work reliably, and so READDIR cannot work
reliably, since it relies on open()+seekdir() to continue reading the
directory in successive RPC calls.
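(For reference, the userspace analogue of what the server has to do
between RPC calls - resuming at a position saved with telldir() is only
meaningful if the filesystem keeps its cookies stable; the path here is
just the export used earlier:)

#include <dirent.h>
#include <stdio.h>

int main(void)
{
	DIR *dir = opendir("/exports/tmpfs");
	struct dirent *ent;
	long pos;

	if (!dir) {
		perror("opendir");
		return 1;
	}
	readdir(dir);			/* consume one entry... */
	pos = telldir(dir);		/* ...and remember where we are */
	closedir(dir);

	/* A new stream, analogous to the next READDIR RPC */
	dir = opendir("/exports/tmpfs");
	if (!dir) {
		perror("opendir");
		return 1;
	}
	seekdir(dir, pos);		/* only safe with stable cookies */
	while ((ent = readdir(dir)) != NULL)
		printf("%s\n", ent->d_name);
	closedir(dir);
	return 0;
}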
Anyhow, to get back to your question about whether we should or should
not be detecting that the directory changed when you delete the files
on the server: the answer is no... Nothing in the above guarantees that
the cache is revalidated.
NFS close-to-open cache consistency means that we guarantee to
revalidate the cached data on open(), and only then. That guarantee
does not extend to lseek() or to the rewinddir()/seekdir() wrappers.
If your application wants stronger cache consistency, then there are
tricks to enable that. Now that statx() has the AT_STATX_FORCE_SYNC
flag, you could use that to force a revalidation of the directory
attributes on the client. You might also use the posix_fadvise() trick
to try to clear the cache. However note that none of these tricks are
guaranteed to work. They're not reliable now, and that situation is
unlikely to change in the future barring a deliberate (and documented!)
change in kernel policy.
So as of now, the only way to reliably introduce a revalidation point
in your testcase above is to close() and then open().
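For illustration, a minimal sketch of the statx() trick, reusing the
/mnt/localhost/tmpfs mount point from your reproducer (and again, this
is only a hint to the client, not a guarantee):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	/*
	 * AT_STATX_FORCE_SYNC asks the filesystem to synchronise the
	 * attributes with the server instead of answering from cache,
	 * which on NFS forces a revalidation of the directory.
	 */
	if (statx(AT_FDCWD, "/mnt/localhost/tmpfs", AT_STATX_FORCE_SYNC,
		  STATX_BASIC_STATS, &stx) != 0) {
		perror("statx");
		return 1;
	}
	printf("size=%llu\n", (unsigned long long)stx.stx_size);
	return 0;
}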
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Fri, 2022-02-25 at 15:23 -0500, Benjamin Coddington wrote:
> On 25 Feb 2022, at 10:34, Benjamin Coddington wrote:
> > Ok, so I'm reading that further proof is required, and I'm happy to
> > do
> > the
> > work. Thanks for the replies here and elsewhere.
>
> Here's an example of this problem on a tmpfs export using v8 of your
> patchset with the fix to set the change_attr in
> nfs_readdir_page_init_array().
>
> I'm using tmpfs, because it reliably orders cookies in reverse order
> of
> creation (or perhaps sorted by name).
>
> The program drives both the client-side and server-side - so on this
> one
> system, /exports/tmpfs is:
> tmpfs /exports/tmpfs tmpfs rw,seclabel,relatime,size=102400k 0 0
>
> and /mnt/localhost is:
> localhost:/exports/tmpfs /mnt/localhost/tmpfs nfs4
> rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,hard,prot
> o=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=non
> e,addr=127.0.0.1
> 0 0
>
> The program creates 256 files on the server, walks through them once
> on
> the
> client, deletes the last 127 on the server, drops the first page from
> the
> pagecache, and walks through them again on the client.
>
> The second listing produces 124 duplicate entries.
>
> I just have to say again: this behavior is _new_ (but not new to me),
> and it
> is absolutely going to crop up on our customer's systems that are
> walking
> through millions of directory entries on loaded servers under memory
> pressure. The directory listings as a whole become very likely to be
> nonsense at random times. I realize they are not /supposed/ to be
> coherent,
> but what we're getting here is going to be far far less coherent, and
> it's going to be a mess.
>
> There are other scenarios that are worse when the cookies aren't
> ordered,
> you can end up with EOF, or get into repeating patterns.
>
> Please compare this with v3, and before this patchset, and tell me if
> I'm
> not justified playing chicken little.
>
> Here's what I do to run this:
>
> mount -t tmpfs -osize=100M tmpfs /exports/tmpfs/
> exportfs -ofsid=0 *:/exports
> exportfs -ofsid=1 *:/exports/tmpfs
> mount -t nfs -ov4.1,sec=sys localhost:/exports /mnt/localhost
> ./getdents2
>
> Compare "Listing 1" with "Listing 2".
>
> I would also do a "rm -f /export/tmpfs/*" between each run.
>
> Thanks again for your time and work.
>
> Ben
>
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <sched.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <sys/syscall.h>
> #include <string.h>
>
> #define NFSDIR "/mnt/localhost/tmpfs"
> #define LOCDIR "/exports/tmpfs"
> #define BUF_SIZE 4096
>
> int main(int argc, char **argv)
> {
> int i, dir_fd, bpos, total = 0;
> size_t nread;
> struct linux_dirent {
> long d_ino;
> off_t d_off;
> unsigned short d_reclen;
> char d_name[];
> };
> struct linux_dirent *d;
> char buf[BUF_SIZE];
>
> /* create files: */
> for (i = 0; i < 256; i++) {
> sprintf(buf, LOCDIR "/file_%03d", i);
> close(open(buf, O_CREAT, 666));
> }
>
> dir_fd = open(NFSDIR,
> O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC);
> if (dir_fd < 0) {
> perror("cannot open dir");
> return 1;
> }
>
> while (1) {
> nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE);
> if (nread == 0 || nread == -1)
> break;
> for (bpos = 0; bpos < nread;) {
> d = (struct linux_dirent *) (buf + bpos);
> printf("%s\n", d->d_name);
> total++;
> bpos += d->d_reclen;
> }
> }
> printf("Listing 1: %d total dirents\n", total);
>
> /* rewind */
> lseek(dir_fd, 0, SEEK_SET);
>
> /* drop the first page */
> posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED);
>
> /* delete the last 127 files: */
> for (i = 127; i < 256; i++) {
> sprintf(buf, LOCDIR "/file_%03d", i);
> unlink(buf);
> }
>
> total = 0;
> while (1) {
> nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE);
> if (nread == 0 || nread == -1)
> break;
> for (bpos = 0; bpos < nread;) {
> d = (struct linux_dirent *) (buf + bpos);
> printf("%s\n", d->d_name);
> total++;
> bpos += d->d_reclen;
> }
> }
> printf("Listing 2: %d total dirents\n", total);
>
> close(dir_fd);
> return 0;
> }
tmpfs is broken on the server. It doesn't provide stable cookies, and
knfsd doesn't use the verifier to tell you that the cookie assignment
has changed.
Re-export of tmpfs has never worked reliably.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On 25 Feb 2022, at 15:41, Trond Myklebust wrote:
> On Fri, 2022-02-25 at 15:23 -0500, Benjamin Coddington wrote:
>> On 25 Feb 2022, at 10:34, Benjamin Coddington wrote:
>>> Ok, so I'm reading that further proof is required, and I'm happy to
>>> do
>>> the
>>> work. Thanks for the replies here and elsewhere.
>>
>> Here's an example of this problem on a tmpfs export using v8 of your
>> patchset with the fix to set the change_attr in
>> nfs_readdir_page_init_array().
>>
>> I'm using tmpfs, because it reliably orders cookies in reverse order
>> of
>> creation (or perhaps sorted by name).
>>
>> The program drives both the client-side and server-side - so on this
>> one
>> system, /exports/tmpfs is:
>> tmpfs /exports/tmpfs tmpfs rw,seclabel,relatime,size=102400k 0 0
>>
>> and /mnt/localhost is:
>> localhost:/exports/tmpfs /mnt/localhost/tmpfs nfs4
>> rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,hard,prot
>> o=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=non
>> e,addr=127.0.0.1
>> 0 0
>>
>> The program creates 256 files on the server, walks through them once
>> on
>> the
>> client, deletes the last 127 on the server, drops the first page from
>> the
>> pagecache, and walks through them again on the client.
>>
>> The second listing produces 124 duplicate entries.
>>
>> I just have to say again: this behavior is _new_ (but not new to me),
>> and it
>> is absolutely going to crop up on our customer's systems that are
>> walking
>> through millions of directory entries on loaded servers under memory
>> pressure. The directory listings as a whole become very likely to
>> be
>> nonsense at random times. I realize they are not /supposed/ to be
>> coherent,
>> but what we're getting here is going to be far far less coherent, and
>> it's going to be a mess.
>>
>> There are other scenarios that are worse when the cookies aren't
>> ordered,
>> you can end up with EOF, or get into repeating patterns.
>>
>> Please compare this with v3, and before this patchset, and tell me if
>> I'm
>> not justified playing chicken little.
>>
>> Here's what I do to run this:
>>
>> mount -t tmpfs -osize=100M tmpfs /exports/tmpfs/
>> exportfs -ofsid=0 *:/exports
>> exportfs -ofsid=1 *:/exports/tmpfs
>> mount -t nfs -ov4.1,sec=sys localhost:/exports /mnt/localhost
>> ./getdents2
>>
>> Compare "Listing 1" with "Listing 2".
>>
>> I would also do a "rm -f /export/tmpfs/*" between each run.
>>
>> Thanks again for your time and work.
>>
>> Ben
>>
>> #define _GNU_SOURCE
>> #include <stdio.h>
>> #include <unistd.h>
>> #include <fcntl.h>
>> #include <sched.h>
>> #include <sys/types.h>
>> #include <sys/stat.h>
>> #include <sys/syscall.h>
>> #include <string.h>
>>
>> #define NFSDIR "/mnt/localhost/tmpfs"
>> #define LOCDIR "/exports/tmpfs"
>> #define BUF_SIZE 4096
>>
>> int main(int argc, char **argv)
>> {
>> int i, dir_fd, bpos, total = 0;
>> size_t nread;
>> struct linux_dirent {
>> long d_ino;
>> off_t d_off;
>> unsigned short d_reclen;
>> char d_name[];
>> };
>> struct linux_dirent *d;
>> char buf[BUF_SIZE];
>>
>> /* create files: */
>> for (i = 0; i < 256; i++) {
>> sprintf(buf, LOCDIR "/file_%03d", i);
>> close(open(buf, O_CREAT, 666));
>> }
>>
>> dir_fd = open(NFSDIR,
>> O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC);
>> if (dir_fd < 0) {
>> perror("cannot open dir");
>> return 1;
>> }
>>
>> while (1) {
>> nread = syscall(SYS_getdents, dir_fd,
>> buf, BUF_SIZE);
>> if (nread == 0 || nread == -1)
>> break;
>> for (bpos = 0; bpos < nread;) {
>> d = (struct linux_dirent *) (buf + bpos);
>> printf("%s\n", d->d_name);
>> total++;
>> bpos += d->d_reclen;
>> }
>> }
>> printf("Listing 1: %d total dirents\n", total);
>>
>> /* rewind */
>> lseek(dir_fd, 0, SEEK_SET);
>>
>> /* drop the first page */
>> posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED);
>>
>> /* delete the last 127 files: */
>> for (i = 127; i < 256; i++) {
>> sprintf(buf, LOCDIR "/file_%03d", i);
>> unlink(buf);
>> }
>>
>> total = 0;
>> while (1) {
>> nread = syscall(SYS_getdents, dir_fd,
>> buf, BUF_SIZE);
>> if (nread == 0 || nread == -1)
>> break;
>> for (bpos = 0; bpos < nread;) {
>> d = (struct linux_dirent *) (buf + bpos);
>> printf("%s\n", d->d_name);
>> total++;
>> bpos += d->d_reclen;
>> }
>> }
>> printf("Listing 2: %d total dirents\n", total);
>>
>> close(dir_fd);
>> return 0;
>> }
>
>
> tmpfs is broken on the server. It doesn't provide stable cookies, and
> knfsd doesn't use the verifier to tell you that the cookie assignment
> has changed.
>
>
> Re-export of tmpfs has never worked reliably.
In this case, the cookies are stable; they can be verified with a wire
capture.
I've just adapted the program slightly to ext4 below. In this case
Listing 2 shows files "file_125 file_126" that don't exist on the server
and leaves out files "elif_125 elif_126". It would take me more time to
produce more dramatic results, but I'm sure I could produce them.
And it's not hard to understand either, so I'm not sure why so much
proof is needed.
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <string.h>
#define NFSDIR "/mnt/localhost/ext4"
#define LOCDIR "/exports/ext4"
#define BUF_SIZE 4096
int main(int argc, char **argv)
{
	int i, dir_fd, bpos, total = 0;
	size_t nread;
	struct linux_dirent {
		long d_ino;
		off_t d_off;
		unsigned short d_reclen;
		char d_name[];
	};
	struct linux_dirent *d;
	char buf[BUF_SIZE];

	/* create files: */
	for (i = 0; i < 256; i++) {
		sprintf(buf, LOCDIR "/file_%03d", i);
		close(open(buf, O_CREAT, 666));
	}

	dir_fd = open(NFSDIR, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC);
	if (dir_fd < 0) {
		perror("cannot open dir");
		return 1;
	}

	while (1) {
		nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE);
		if (nread == 0 || nread == -1)
			break;
		for (bpos = 0; bpos < nread;) {
			d = (struct linux_dirent *) (buf + bpos);
			printf("%s\n", d->d_name);
			total++;
			bpos += d->d_reclen;
		}
	}
	printf("Listing 1: %d total dirents\n", total);

	/* rewind */
	lseek(dir_fd, 0, SEEK_SET);

	/* drop the first page */
	posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED);

	/* delete the first 127 files: */
	for (i = 0; i < 127; i++) {
		sprintf(buf, LOCDIR "/file_%03d", i);
		unlink(buf);
	}

	/* create 127 more: */
	for (i = 0; i < 127; i++) {
		sprintf(buf, LOCDIR "/elif_%03d", i);
		close(open(buf, O_CREAT, 666));
	}

	total = 0;
	while (1) {
		nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE);
		if (nread == 0 || nread == -1)
			break;
		for (bpos = 0; bpos < nread;) {
			d = (struct linux_dirent *) (buf + bpos);
			printf("%s\n", d->d_name);
			total++;
			bpos += d->d_reclen;
		}
	}
	printf("Listing 2: %d total dirents\n", total);

	close(dir_fd);
	return 0;
}