NFS current abort and attempt at filename lookup in RCU mode.
This can have serious performance impact on a highly parallel load.
The "Makefile" below generates just such a load. On a 40-core
machine "make -j 40" is about 6 times as fast at "make -j 5"
when a local filesystem is used (e.g. XFS), but as much as half
as fast when NFS is used.
With this patch set, "make -j 40" is about 3 times as fast as
"make -j 5" on NFS, and "perf" data doesn't show spinlocks to be a big
problem any more.
This is a re-submission with a few small improvements of a patch set
posted in March. Since then I have recieved confirmation that it
definitely fixes the problem, when combined with a patch set which
enhances autofs4 in a similar way. So it has had quite a bit of
testing.
NeilBrown
The makefile. Run
make -f this-makefile NFS_PATH=/some/nfs/mounted/empty/directory
and an 'objs' directory will be created with lots of objects.
Use "make clean" before trying again.
---
# Options available from the command line
NUM_NFS_INCLUDES := 50
NUM_COMPILES := 200
NFS_PATH:=/import/dummySrc
# When adding includes, they will be of the form $(NFS_PATH)/x where
# x ranges from 1 to NUM_NFS_INCLUDES. Those directories are created
# automatically by the 'dirs' target.
SHELL:=bash
OBJ_DIR:=objs
ifeq ($(VERBOSE),1)
Q =
else
Q = @
endif
MY_INCLUDES := $(addprefix -I$(NFS_PATH)/, $(shell echo {1..$(NUM_NFS_INCLUDES)}))
INCLUDE_DIRS := $(addprefix $(NFS_PATH)/, $(shell echo {1..$(NUM_NFS_INCLUDES)}))
MY_OBJS := $(addsuffix _test.o, $(addprefix $(OBJ_DIR)/, $(shell echo {1..$(NUM_COMPILES)})))
build: dirs $(MY_OBJS)
dirs:
$(Q)mkdir -p $(INCLUDE_DIRS)
clean:
$(Q)rm -rf $(OBJ_DIR)
$(OBJ_DIR)/:
$(Q)/bin/mkdir $(OBJ_DIR)
$(MY_OBJS): test.cpp | $(OBJ_DIR)/
$(Q)gcc $(MY_INCLUDES) -c test.cpp -o $@
test.cpp:
$(Q)echo "#include <iostream>" > test.cpp
.PHONY: all build clean
---
NeilBrown (7):
NFS: nfs4_lookup_revalidate: only evaluate parent if it will be used.
NFS: prepare for RCU-walk support but pushing tests later in code.
sunrpc/auth: allow lockless (rcu) lookup of credential cache.
NFS: support RCU_WALK in nfs_permission()
NFS: teach nfs_neg_need_reval to understand LOOKUP_RCU
NFS: teach nfs_lookup_verify_inode to handle LOOKUP_RCU
NFS: allow lockless access to access_cache
fs/nfs/dir.c | 166 ++++++++++++++++++++++++++++++++++---------
fs/nfs/inode.c | 9 ++
include/linux/nfs_fs.h | 2 +
include/linux/sunrpc/auth.h | 2 +
net/sunrpc/auth.c | 17 ++++
net/sunrpc/auth_generic.c | 6 ++
net/sunrpc/auth_null.c | 2 +
7 files changed, 168 insertions(+), 36 deletions(-)
--
Signature
On Mon, Jul 14, 2014 at 11:28:20AM +1000, NeilBrown wrote:
> nfs4_lookup_revalidate only uses 'parent' to get 'dir', and only
> uses 'dir' if 'inode == NULL'.
>
> So we don't need to find out what 'parent' or 'dir' is until we
> know that 'inode' is NULL.
>
> By moving 'dget_parent' inside the 'if', we can reduce the number of
> call sites for 'dput(parent)'.
>
> Signed-off-by: NeilBrown <[email protected]>
Looks good,
Reviewed-by: Christoph Hellwig <[email protected]>
>
> /* We can't create new files in nfs_open_revalidate(), so we
> * optimize away revalidation of negative dentries.
> */
> if (inode == NULL) {
> + struct dentry *parent;
> + struct inode *dir;
> +
> + parent = dget_parent(dentry);
> + dir = parent->d_inode;
> if (!nfs_neg_need_reval(dir, dentry, flags))
> ret = 1;
> + dput(parent);
> goto out;
Seems like this could be further condensed to:
struct dentry *parent = dget_parent(dentry);
if (!nfs_neg_need_reval(parent->d_inode, dentry, flags))
ret = 1;
dput(parent);
goto out;
or maybe even kill the goto out now that it's just a simple return
without additional work.
nfs_permission makes two calls which are not always safe in RCU_WALK,
rpc_lookup_cred and nfs_do_access.
The second can easily be made rcu-safe by aborting with -ECHILD before
making the RPC call.
The former can be made rcu-safe by calling rpc_lookup_cred_nonblock()
instead.
As this will almost always succeed, we use it even when RCU_WALK
isn't being used as it still saves some spinlocks in a common case.
We only fall back to rpc_lookup_cred() if rpc_lookup_cred_nonblock()
fails and MAY_NOT_BLOCK isn't set.
This optimisation (always trying rpc_lookup_cred_nonblock()) is
particularly important when a security module is active.
In that case inode_permission() may return -ECHILD from
security_inode_permission() even though ->permission() succeeded in
RCU_WALK mode.
This leads to may_lookup() retrying inode_permission after performing
unlazy_walk(). The spinlock that rpc_lookup_cred() takes is often
more expensive than anything security_inode_permission() does, so that
spinlock becomes the main bottleneck.
Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/dir.c | 28 ++++++++++++++++++++--------
1 file changed, 20 insertions(+), 8 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3c2b36acf291..ac958f29e558 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2288,6 +2288,10 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
if (status == 0)
goto out_cached;
+ status = -ECHILD;
+ if (mask & MAY_NOT_BLOCK)
+ goto out;
+
/* Be clever: ask server to check for all possible rights */
cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
cache.cred = cred;
@@ -2364,15 +2368,23 @@ force_lookup:
if (!NFS_PROTO(inode)->access)
goto out_notsup;
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
-
- cred = rpc_lookup_cred();
- if (!IS_ERR(cred)) {
- res = nfs_do_access(inode, cred, mask);
- put_rpccred(cred);
- } else
+ /* Always try fast lookups first */
+ rcu_read_lock();
+ cred = rpc_lookup_cred_nonblock();
+ if (!IS_ERR(cred))
+ res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
+ else
res = PTR_ERR(cred);
+ rcu_read_unlock();
+ if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
+ /* Fast lookup failed, try the slow way */
+ cred = rpc_lookup_cred();
+ if (!IS_ERR(cred)) {
+ res = nfs_do_access(inode, cred, mask);
+ put_rpccred(cred);
+ } else
+ res = PTR_ERR(cred);
+ }
out:
if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
res = -EACCES;
It fails with -ECHILD rather than make an RPC call.
This allows nfs_lookup_revalidate to call it in RCU-walk mode.
Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/dir.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index de9bc809c7ea..af8c7464d29b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1050,6 +1050,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
out:
return (inode->i_nlink == 0) ? -ENOENT : 0;
out_force:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
ret = __nfs_revalidate_inode(server, inode);
if (ret != 0)
return ret;
@@ -1135,11 +1137,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (!nfs_is_exclusive_create(dir, flags) &&
nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- if (nfs_lookup_verify_inode(inode, flags))
+ if (nfs_lookup_verify_inode(inode, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
goto out_zap_parent;
+ }
goto out_valid;
}
On Sun, Jul 13, 2014 at 9:28 PM, NeilBrown <[email protected]> wrote:
> NFS current abort and attempt at filename lookup in RCU mode.
> This can have serious performance impact on a highly parallel load.
>
> The "Makefile" below generates just such a load. On a 40-core
> machine "make -j 40" is about 6 times as fast at "make -j 5"
> when a local filesystem is used (e.g. XFS), but as much as half
> as fast when NFS is used.
> With this patch set, "make -j 40" is about 3 times as fast as
> "make -j 5" on NFS, and "perf" data doesn't show spinlocks to be a big
> problem any more.
>
> This is a re-submission with a few small improvements of a patch set
> posted in March. Since then I have recieved confirmation that it
> definitely fixes the problem, when combined with a patch set which
> enhances autofs4 in a similar way. So it has had quite a bit of
> testing.
Hi Neil,
What kind of tests have you personally (or SuSE if relevant) performed?
Have you run this under NFSometer in order to check for regressions,
and if so on what workloads?
The above are not requirements in order to get the patches into
mainline, I'm just curious.
Cheers
Trond
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Sun, 13 Jul 2014 22:00:12 -0400 Trond Myklebust
<[email protected]> wrote:
> On Sun, Jul 13, 2014 at 9:28 PM, NeilBrown <[email protected]> wrote:
> > NFS current abort and attempt at filename lookup in RCU mode.
> > This can have serious performance impact on a highly parallel load.
> >
> > The "Makefile" below generates just such a load. On a 40-core
> > machine "make -j 40" is about 6 times as fast at "make -j 5"
> > when a local filesystem is used (e.g. XFS), but as much as half
> > as fast when NFS is used.
> > With this patch set, "make -j 40" is about 3 times as fast as
> > "make -j 5" on NFS, and "perf" data doesn't show spinlocks to be a big
> > problem any more.
> >
> > This is a re-submission with a few small improvements of a patch set
> > posted in March. Since then I have recieved confirmation that it
> > definitely fixes the problem, when combined with a patch set which
> > enhances autofs4 in a similar way. So it has had quite a bit of
> > testing.
>
> Hi Neil,
>
> What kind of tests have you personally (or SuSE if relevant) performed?
> Have you run this under NFSometer in order to check for regressions,
> and if so on what workloads?
>
> The above are not requirements in order to get the patches into
> mainline, I'm just curious.
I hadn't come across NFSometer before, looks useful!
The only testing is that Makefile, and that was done mostly by the customer.
Further, that testing was a version of the patchset for Linux 3.0.
This particular patchset I've only tested lightly on my little 2-core test
machine.
I'm certainly happy to try beating on it a bit hard, and can see if I can get
access to an 80-core machine that I had a brief play with a while back and do
some more testing there.
NeilBrown
The new flag RPCAUTH_LOOKUP_RCU to credential lookup avoids locking,
does not take a reference on the returned credential, and returns
-ECHILD if a simple lookup was not possible.
The returned value can only be used within an rcu_read_lock protected
region.
The main user of this is the new rpc_lookup_cred_nonblock() which
returns a pointer to the current credential which is only rcu-safe (no
ref-count held), and might return -ECHILD if allocation was required.
Signed-off-by: NeilBrown <[email protected]>
---
include/linux/sunrpc/auth.h | 2 ++
net/sunrpc/auth.c | 17 +++++++++++++++--
net/sunrpc/auth_generic.c | 6 ++++++
net/sunrpc/auth_null.c | 2 ++
4 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 790be1472792..28ffc8476875 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -103,6 +103,7 @@ struct rpc_auth_create_args {
/* Flags for rpcauth_lookupcred() */
#define RPCAUTH_LOOKUP_NEW 0x01 /* Accept an uninitialised cred */
+#define RPCAUTH_LOOKUP_RCU 0x02 /* lock-less lookup */
/*
* Client authentication ops
@@ -153,6 +154,7 @@ void rpc_destroy_generic_auth(void);
void rpc_destroy_authunix(void);
struct rpc_cred * rpc_lookup_cred(void);
+struct rpc_cred * rpc_lookup_cred_nonblock(void);
struct rpc_cred * rpc_lookup_machine_cred(const char *service_name);
int rpcauth_register(const struct rpc_authops *);
int rpcauth_unregister(const struct rpc_authops *);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index f77366717420..794fc0f4fc4c 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -523,6 +523,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
if (!entry->cr_ops->crmatch(acred, entry, flags))
continue;
+ if (flags & RPCAUTH_LOOKUP_RCU) {
+ if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
+ !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
+ cred = entry;
+ break;
+ }
spin_lock(&cache->lock);
if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
spin_unlock(&cache->lock);
@@ -537,6 +543,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
if (cred != NULL)
goto found;
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ return ERR_PTR(-ECHILD);
+
new = auth->au_ops->crcreate(auth, acred, flags);
if (IS_ERR(new)) {
cred = new;
@@ -586,10 +595,14 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
memset(&acred, 0, sizeof(acred));
acred.uid = cred->fsuid;
acred.gid = cred->fsgid;
- acred.group_info = get_group_info(((struct cred *)cred)->group_info);
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ acred.group_info = rcu_dereference(cred->group_info);
+ else
+ acred.group_info = get_group_info(((struct cred *)cred)->group_info);
ret = auth->au_ops->lookup_cred(auth, &acred, flags);
- put_group_info(acred.group_info);
+ if (!(flags & RPCAUTH_LOOKUP_RCU))
+ put_group_info(acred.group_info);
return ret;
}
EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index ed04869b2d4f..6f6b829c9e8e 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void)
}
EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+struct rpc_cred *rpc_lookup_cred_nonblock(void)
+{
+ return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
+
/*
* Public call interface for looking up machine creds.
*/
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index f0ebe07978a2..712c123e04e9 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth)
static struct rpc_cred *
nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ return &null_cred;
return get_rpccred(&null_cred);
}
On Sun, Jul 13, 2014 at 10:25 PM, NeilBrown <[email protected]> wrote:
> On Sun, 13 Jul 2014 22:00:12 -0400 Trond Myklebust
> <[email protected]> wrote:
>
>> On Sun, Jul 13, 2014 at 9:28 PM, NeilBrown <[email protected]> wrote:
>> > NFS current abort and attempt at filename lookup in RCU mode.
>> > This can have serious performance impact on a highly parallel load.
>> >
>> > The "Makefile" below generates just such a load. On a 40-core
>> > machine "make -j 40" is about 6 times as fast at "make -j 5"
>> > when a local filesystem is used (e.g. XFS), but as much as half
>> > as fast when NFS is used.
>> > With this patch set, "make -j 40" is about 3 times as fast as
>> > "make -j 5" on NFS, and "perf" data doesn't show spinlocks to be a big
>> > problem any more.
>> >
>> > This is a re-submission with a few small improvements of a patch set
>> > posted in March. Since then I have recieved confirmation that it
>> > definitely fixes the problem, when combined with a patch set which
>> > enhances autofs4 in a similar way. So it has had quite a bit of
>> > testing.
>>
>> Hi Neil,
>>
>> What kind of tests have you personally (or SuSE if relevant) performed?
>> Have you run this under NFSometer in order to check for regressions,
>> and if so on what workloads?
>>
>> The above are not requirements in order to get the patches into
>> mainline, I'm just curious.
>
> I hadn't come across NFSometer before, looks useful!
Dros Adamson is the main developer. He'd be happy to take any feedback
you may have.
> The only testing is that Makefile, and that was done mostly by the customer.
>
> Further, that testing was a version of the patchset for Linux 3.0.
> This particular patchset I've only tested lightly on my little 2-core test
> machine.
> I'm certainly happy to try beating on it a bit hard, and can see if I can get
> access to an 80-core machine that I had a brief play with a while back and do
> some more testing there.
As I said, it's not required for merging, but would definitely be an
interesting test in order to see how it affects NFS performance in
general. If you have access to a good test setup with an interesting
workload, then I'd love to see the results of a "before vs after"
comparison using nfsometer.
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Sun, 13 Jul 2014 22:39:34 -0400 Trond Myklebust
<[email protected]> wrote:
> > I'm certainly happy to try beating on it a bit hard, and can see if I can get
> > access to an 80-core machine that I had a brief play with a while back and do
> > some more testing there.
>
> As I said, it's not required for merging, but would definitely be an
> interesting test in order to see how it affects NFS performance in
> general. If you have access to a good test setup with an interesting
> workload, then I'd love to see the results of a "before vs after"
> comparison using nfsometer.
>
I managed to get access to a big machine for a while so I ran nfsometer.
Seemed to run quite well but there are a couple of little issues I might
mention separately.
I didn't let iozone_direct complete because it was taking *forever*.
The results are at http://neil.brown.name/nfsometer_results/
I cannot find a good summary page...
I tested three kernels on the same 80-core (160 if you count hyperthreads)
machine:
A v3.12 that was installed,
a vanilla 3.16-rc5,
and the 3.16-rc5 with my patches.
Some numbers look good, some look strange.
Can you (or someone) have a look?
I also ran my compile-test and graphed some results, which I attach.
This is just 3 kernels, 3.16-rc5 with and without my patches.
I tested both v3 and v4.
I ran the "make" command with "-j" values from 4 to 80 in steps of 4.
The runs lots of compiles of a trivial file but has a very long list of
include directories which are all on NFS and all empty. So there are lots of
attempts to open non-existent files, and the non-existence is cached early.
As you see, with the "-rcu" very performance is enormously better. Without
my patches, more concurrency means worse performance.
Thanks,
NeilBrown
nfs4_lookup_revalidate only uses 'parent' to get 'dir', and only
uses 'dir' if 'inode == NULL'.
So we don't need to find out what 'parent' or 'dir' is until we
know that 'inode' is NULL.
By moving 'dget_parent' inside the 'if', we can reduce the number of
call sites for 'dput(parent)'.
Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/dir.c | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a3d4ef76127..c5a4a89637fb 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1529,9 +1529,7 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct dentry *parent = NULL;
struct inode *inode;
- struct inode *dir;
int ret = 0;
if (flags & LOOKUP_RCU)
@@ -1545,34 +1543,35 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto no_open;
inode = dentry->d_inode;
- parent = dget_parent(dentry);
- dir = parent->d_inode;
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
*/
if (inode == NULL) {
+ struct dentry *parent;
+ struct inode *dir;
+
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
if (!nfs_neg_need_reval(dir, dentry, flags))
ret = 1;
+ dput(parent);
goto out;
}
/* NFS only supports OPEN on regular files */
if (!S_ISREG(inode->i_mode))
- goto no_open_dput;
+ goto no_open;
/* We cannot do exclusive creation on a positive dentry */
if (flags & LOOKUP_EXCL)
- goto no_open_dput;
+ goto no_open;
/* Let f_op->open() actually open (and revalidate) the file */
ret = 1;
out:
- dput(parent);
return ret;
-no_open_dput:
- dput(parent);
no_open:
return nfs_lookup_revalidate(dentry, flags);
}
This requires nfs_check_verifier to take an rcu_walk flag, and requires
an rcu version of nfs_revalidate_inode which returns -ECHILD rather
than making an RPC call.
With this, nfs_lookup_revalidate can call nfs_neg_need_reval in
RCU-walk mode.
We can also move the LOOKUP_RCU check past the nfs_check_verifier()
call in nfs_lookup_revalidate.
If RCU_WALK prevents nfs_check_verifier or nfs_neg_need_reval from
doing a full check, they return a status indicating that a revalidation
is required. As this revalidation will not be possible in RCU_WALK
mode, -ECHILD will ultimately be returned, which is the desired result.
Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/dir.c | 59 ++++++++++++++++++++++++++++++++++--------------
fs/nfs/inode.c | 9 +++++++
include/linux/nfs_fs.h | 1 +
3 files changed, 52 insertions(+), 17 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ac958f29e558..de9bc809c7ea 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -988,9 +988,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
* A check for whether or not the parent directory has changed.
* In the case it has, we assume that the dentries are untrustworthy
* and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
*/
-static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+ int rcu_walk)
{
+ int ret;
+
if (IS_ROOT(dentry))
return 1;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1002,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
/* Revalidate nfsi->cache_change_attribute before we declare a match */
- if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+ if (rcu_walk)
+ ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+ else
+ ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+ if (ret < 0)
return 0;
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
@@ -1054,6 +1062,9 @@ out_force:
*
* If parent mtime has changed, we revalidate, else we wait for a
* period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
*/
static inline
int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1075,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
return 0;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
return 1;
- return !nfs_check_verifier(dir, dentry);
+ return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
}
/*
@@ -1101,11 +1112,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
inode = dentry->d_inode;
if (!inode) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- if (nfs_neg_need_reval(dir, dentry, flags))
+ if (nfs_neg_need_reval(dir, dentry, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
goto out_bad;
+ }
goto out_valid_noent;
}
@@ -1120,16 +1131,21 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
goto out_set_verifier;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
/* Force a full look up iff the parent directory has changed */
- if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
+ if (!nfs_is_exclusive_create(dir, flags) &&
+ nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
if (nfs_lookup_verify_inode(inode, flags))
goto out_zap_parent;
goto out_valid;
}
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
if (NFS_STALE(inode))
goto out_bad;
@@ -1566,14 +1582,23 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *dir;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- parent = dget_parent(dentry);
- dir = parent->d_inode;
+ if (flags & LOOKUP_RCU) {
+ parent = rcu_dereference(dentry);
+ dir = ACCESS_ONCE(parent->d_inode);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+ }
if (!nfs_neg_need_reval(dir, dentry, flags))
ret = 1;
- dput(parent);
+ else if (flags & LOOKUP_RCU)
+ ret = -ECHILD;
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
+ else if (parent != rcu_dereference(dentry))
+ return -ECHILD;
goto out;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index abd37a380535..432b6c4bea47 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1002,6 +1002,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
}
EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
+int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
+{
+ if (!(NFS_I(inode)->cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+ && !nfs_attribute_cache_expired(inode))
+ return NFS_STALE(inode) ? -ESTALE : 0;
+ return -ECHILD;
+}
+
static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index e30f6059ecd6..60cd9e377926 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -352,6 +352,7 @@ extern int nfs_release(struct inode *, struct file *);
extern int nfs_attribute_timeout(struct inode *inode);
extern int nfs_attribute_cache_expired(struct inode *inode);
extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode);
extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
extern int nfs_setattr(struct dentry *, struct iattr *);
The access cache is used during RCU-walk path lookups, so it is best
to avoid locking if possible as taking a lock kills concurrency.
The rbtree is not rcu-safe and cannot easily be made so.
Instead we simply check the last (i.e. most recent) entry on the LRU
list. If this doesn't match, then we return -ECHILD and retry in
lock/refcount mode.
This requires freeing the nfs_access_entry struct with rcu, and
requires using rcu access primatives when adding entries to the lru, and
when examining the last entry.
Calling put_rpccred before kfree_rcu looks a bit odd, but as
put_rpccred already provides rcu protection, we know that the cred will
not actually be freed until the next grace period, so any concurrent
access will be safe.
This patch provides about 5% performance improvement on a stat-heavy
synthetic work load with 4 threads on a 2-core CPU.
Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/dir.c | 43 +++++++++++++++++++++++++++++++++++++++++--
include/linux/nfs_fs.h | 1 +
2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index af8c7464d29b..7e3b5b8751d8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2075,7 +2075,7 @@ static atomic_long_t nfs_access_nr_entries;
static void nfs_access_free_entry(struct nfs_access_entry *entry)
{
put_rpccred(entry->cred);
- kfree(entry);
+ kfree_rcu(entry, rcu_head);
smp_mb__before_atomic();
atomic_long_dec(&nfs_access_nr_entries);
smp_mb__after_atomic();
@@ -2230,6 +2230,38 @@ out_zap:
return -ENOENT;
}
+static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+ /* Only check the most recently returned cache entry,
+ * but do it without locking.
+ */
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_access_entry *cache;
+ int err = -ECHILD;
+ struct list_head *lh;
+
+ rcu_read_lock();
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out;
+ lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+ cache = list_entry(lh, struct nfs_access_entry, lru);
+ if (lh == &nfsi->access_cache_entry_lru ||
+ cred != cache->cred)
+ cache = NULL;
+ if (cache == NULL)
+ goto out;
+ if (!nfs_have_delegated_attributes(inode) &&
+ !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+ goto out;
+ res->jiffies = cache->jiffies;
+ res->cred = cache->cred;
+ res->mask = cache->mask;
+ err = 0;
+out:
+ rcu_read_unlock();
+ return err;
+}
+
static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -2273,6 +2305,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
cache->cred = get_rpccred(set->cred);
cache->mask = set->mask;
+ /* The above field assignments must be visible
+ * before this item appears on the lru. We cannot easily
+ * use rcu_assign_pointer, so just force the memory barrier.
+ */
+ smp_wmb();
nfs_access_add_rbtree(inode, cache);
/* Update accounting */
@@ -2311,7 +2348,9 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
trace_nfs_access_enter(inode);
- status = nfs_access_get_cached(inode, cred, &cache);
+ status = nfs_access_get_cached_rcu(inode, cred, &cache);
+ if (status != 0)
+ status = nfs_access_get_cached(inode, cred, &cache);
if (status == 0)
goto out_cached;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 60cd9e377926..5180a7ededec 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -52,6 +52,7 @@ struct nfs_access_entry {
unsigned long jiffies;
struct rpc_cred * cred;
int mask;
+ struct rcu_head rcu_head;
};
struct nfs_lockowner {
nfs_lookup_revalidate, nfs4_lookup_revalidate, and nfs_permission
all need to understand and handle RCU-walk for NFS to gain the
benefits of RCU-walk for cached information.
Currently these functions all immediately return -ECHILD
if the relevant flag (LOOKUP_RCU or MAY_NOT_BLOCK) is set.
This patch pushes those tests later in the code so that we only abort
immediately before we enter rcu-unsafe code. As subsequent patches
make that rcu-unsafe code rcu-safe, several of these new tests will
disappear.
With this patch there are several paths through the code which will no
longer return -ECHILD during an RCU-walk. However these are mostly
error paths or other uninteresting cases.
A noteworthy change in nfs_lookup_revalidate is that we don't take
(or put) the reference to ->d_parent when LOOKUP_RCU is set.
Rather we rcu_dereference ->d_parent, and check that ->d_inode
is not NULL. We also check that ->d_parent hasn't changed after
all the tests.
In nfs4_lookup_revalidate we simply avoid testing LOOKUP_RCU on the
path that only calls nfs_lookup_revalidate() as that function
already performs the required test.
Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/dir.c | 45 +++++++++++++++++++++++++++++++++------------
1 file changed, 33 insertions(+), 12 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c5a4a89637fb..3c2b36acf291 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1088,21 +1088,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct nfs4_label *label = NULL;
int error;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- parent = dget_parent(dentry);
- dir = parent->d_inode;
+ if (flags & LOOKUP_RCU) {
+ parent = rcu_dereference(dentry->d_parent);
+ dir = ACCESS_ONCE(parent->d_inode);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+ }
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = dentry->d_inode;
if (!inode) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
if (nfs_neg_need_reval(dir, dentry, flags))
goto out_bad;
goto out_valid_noent;
}
if (is_bad_inode(inode)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
__func__, dentry);
goto out_bad;
@@ -1111,6 +1120,9 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
goto out_set_verifier;
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
/* Force a full look up iff the parent directory has changed */
if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
if (nfs_lookup_verify_inode(inode, flags))
@@ -1153,13 +1165,18 @@ out_set_verifier:
/* Success: notify readdir to use READDIRPLUS */
nfs_advise_use_readdirplus(dir);
out_valid_noent:
- dput(parent);
+ if (flags & LOOKUP_RCU) {
+ if (parent != rcu_dereference(dentry->d_parent))
+ return -ECHILD;
+ } else
+ dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
__func__, dentry);
return 1;
out_zap_parent:
nfs_zap_caches(dir);
out_bad:
+ WARN_ON(flags & LOOKUP_RCU);
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
@@ -1185,6 +1202,7 @@ out_zap_parent:
__func__, dentry);
return 0;
out_error:
+ WARN_ON(flags & LOOKUP_RCU);
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
@@ -1532,9 +1550,6 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct inode *inode;
int ret = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
goto no_open;
if (d_mountpoint(dentry))
@@ -1551,6 +1566,9 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *dir;
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
parent = dget_parent(dentry);
dir = parent->d_inode;
if (!nfs_neg_need_reval(dir, dentry, flags))
@@ -2320,9 +2338,6 @@ int nfs_permission(struct inode *inode, int mask)
struct rpc_cred *cred;
int res = 0;
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
-
nfs_inc_stats(inode, NFSIOS_VFSACCESS);
if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2349,6 +2364,9 @@ force_lookup:
if (!NFS_PROTO(inode)->access)
goto out_notsup;
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
cred = rpc_lookup_cred();
if (!IS_ERR(cred)) {
res = nfs_do_access(inode, cred, mask);
@@ -2363,6 +2381,9 @@ out:
inode->i_sb->s_id, inode->i_ino, mask, res);
return res;
out_notsup:
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (res == 0)
res = generic_permission(inode, mask);