I'm proposing this series for v6.2 (for-next).
For quite some time, we've been encouraged to disable filecache
garbage collection for NFSv4 files, and I think I found a surgical
way to do just that. That is presented in "NFSD: Add an NFSD_FILE_GC
flag to enable nfsd_file garbage collection".
The other major change in this set is reworking the file_hashtbl to
resize itself dynamically. This reduces the average size of its
bucket chains, greatly speeding up hash insertion, which is
performed while holding the state_lock.
The current rhashtable changes are failing some tests, so expect
another revision of this series soon.
Comments and opinions are welcome.
Changes since v4:
- Addressed some comments in the GC patch; more to come
- Split clean-ups out of the rhashtable patch, reordered the series
- Removed state_lock from the rhashtable helpers
Changes since v3:
- the new filehandle alias check was still not right
Changes since v2:
- Converted nfs4_file_rhashtbl to nfs4_file_rhltable
- Addressed most or all other review comments
Changes since RFC:
- checking nfs4_files for inode aliases is now done only on hash
insertion
- the nfs4_file reference count is now bumped only while the RCU
read lock is held
- comments and function names have been revised and clarified
---
Chuck Lever (13):
NFSD: Pass the target nfsd_file to nfsd_commit()
NFSD: Revert "NFSD: NFSv4 CLOSE should release an nfsd_file immediately"
NFSD: Add an NFSD_FILE_GC flag to enable nfsd_file garbage collection
NFSD: Clean up nfs4_preprocess_stateid_op() call sites
NFSD: Trace delegation revocations
NFSD: Use const pointers as parameters to fh_ helpers
NFSD: Update file_hashtbl() helpers
NFSD: Clean up nfsd4_init_file()
NFSD: Add a remove_nfs4_file() helper
NFSD: Clean up find_or_add_file()
NFSD: Refactor find_file()
NFSD: Allocate an rhashtable for nfs4_file objects
NFSD: Use rhashtable for managing nfs4_file objects
fs/nfsd/filecache.c | 81 +++++++++++++++--------
fs/nfsd/filecache.h | 4 +-
fs/nfsd/nfs3proc.c | 10 ++-
fs/nfsd/nfs4proc.c | 42 +++++-------
fs/nfsd/nfs4state.c | 157 +++++++++++++++++++++++---------------------
fs/nfsd/nfsfh.h | 10 +--
fs/nfsd/state.h | 5 +-
fs/nfsd/trace.h | 58 +++++++++++++++-
fs/nfsd/vfs.c | 19 ++----
fs/nfsd/vfs.h | 3 +-
10 files changed, 239 insertions(+), 150 deletions(-)
--
Chuck Lever
NFSv4 operations manage the lifetime of nfsd_file items they use by
means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
garbage collected.
Introduce a mechanism to enable garbage collection for nfsd_file
items used only by NFSv2/3 callers.
Note that the change in nfsd_file_put() ensures that both CLOSE and
DELEGRETURN will actually close out and free an nfsd_file on last
reference of a non-garbage-collected file.
Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
Suggested-by: Trond Myklebust <[email protected]>
Tested-by: Jeff Layton <[email protected]>
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/filecache.c | 63 +++++++++++++++++++++++++++++++++++++++++++++------
fs/nfsd/filecache.h | 3 ++
fs/nfsd/nfs3proc.c | 4 ++-
fs/nfsd/trace.h | 3 ++
fs/nfsd/vfs.c | 4 ++-
5 files changed, 64 insertions(+), 13 deletions(-)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index beb41a507623..965c7b13fddc 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -63,6 +63,7 @@ struct nfsd_file_lookup_key {
struct net *net;
const struct cred *cred;
unsigned char need;
+ bool gc;
enum nfsd_file_lookup_type type;
};
@@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
return 1;
if (!nfsd_match_cred(nf->nf_cred, key->cred))
return 1;
+ if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
+ return 1;
if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
return 1;
break;
@@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
nf->nf_flags = 0;
__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ if (key->gc)
+ __set_bit(NFSD_FILE_GC, &nf->nf_flags);
nf->nf_inode = key->inode;
/* nf_ref is pre-incremented for hash table */
refcount_set(&nf->nf_ref, 2);
@@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf)
}
}
+static void
+nfsd_file_unhash_and_put(struct nfsd_file *nf)
+{
+ if (nfsd_file_unhash(nf))
+ nfsd_file_put_noref(nf);
+}
+
void
nfsd_file_put(struct nfsd_file *nf)
{
might_sleep();
- nfsd_file_lru_add(nf);
- if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
+ nfsd_file_lru_add(nf);
+ else if (refcount_read(&nf->nf_ref) == 2)
+ nfsd_file_unhash_and_put(nf);
+
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
- } else if (nf->nf_file) {
+ } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
nfsd_file_put_noref(nf);
nfsd_file_schedule_laundrette();
} else
@@ -1017,12 +1033,14 @@ nfsd_file_is_cached(struct inode *inode)
static __be32
nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf, bool open)
+ unsigned int may_flags, struct nfsd_file **pnf,
+ bool open, bool want_gc)
{
struct nfsd_file_lookup_key key = {
.type = NFSD_FILE_KEY_FULL,
.need = may_flags & NFSD_FILE_MAY_MASK,
.net = SVC_NET(rqstp),
+ .gc = want_gc,
};
bool open_retry = true;
struct nfsd_file *nf;
@@ -1117,14 +1135,35 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* then unhash.
*/
if (status != nfs_ok || key.inode->i_nlink == 0)
- if (nfsd_file_unhash(nf))
- nfsd_file_put_noref(nf);
+ nfsd_file_unhash_and_put(nf);
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
goto out;
}
+/**
+ * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * and garbage-collected. The object is retained for a few
+ * seconds after the final nfsd_file_put() in case the caller
+ * wants to re-use it.
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, true);
+}
+
/**
* nfsd_file_acquire - Get a struct nfsd_file with an open file
* @rqstp: the RPC transaction being executed
@@ -1132,6 +1171,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* @may_flags: NFSD_MAY_ settings for the file
* @pnf: OUT: new or found "struct nfsd_file" object
*
+ * The nfsd_file object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put().
+ *
* Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
* network byte order is returned.
*/
@@ -1139,7 +1182,7 @@ __be32
nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, false);
}
/**
@@ -1149,6 +1192,10 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* @may_flags: NFSD_MAY_ settings for the file
* @pnf: OUT: new or found "struct nfsd_file" object
*
+ * The nfsd_file object returned by this API is reference-counted
+ * but not garbage-collected. The object is released one RCU grace
+ * period after the final nfsd_file_put().
+ *
* Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
* network byte order is returned.
*/
@@ -1156,7 +1203,7 @@ __be32
nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, false);
}
/*
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 6b012ea4bd9d..b7efb2c3ddb1 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -38,6 +38,7 @@ struct nfsd_file {
#define NFSD_FILE_HASHED (0)
#define NFSD_FILE_PENDING (1)
#define NFSD_FILE_REFERENCED (2)
+#define NFSD_FILE_GC (3)
unsigned long nf_flags;
struct inode *nf_inode; /* don't deref */
refcount_t nf_ref;
@@ -55,6 +56,8 @@ void nfsd_file_put(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
bool nfsd_file_is_cached(struct inode *inode);
+__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **nfp);
__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index ff2920546333..d01b29aba662 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -772,8 +772,8 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
(unsigned long long) argp->offset);
fh_copy(&resp->fh, &argp->fh);
- resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE |
- NFSD_MAY_NOT_BREAK_LEASE, &nf);
+ resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
+ NFSD_MAY_NOT_BREAK_LEASE, &nf);
if (resp->status)
goto out;
resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 06a96e955bd0..b065a4b1e0dc 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -814,7 +814,8 @@ DEFINE_CLID_EVENT(confirmed_r);
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
{ 1 << NFSD_FILE_PENDING, "PENDING" }, \
- { 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
+ { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}, \
+ { 1 << NFSD_FILE_GC, "GC"})
DECLARE_EVENT_CLASS(nfsd_file_class,
TP_PROTO(struct nfsd_file *nf),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 91c34cef11d8..df1830a1a295 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1084,7 +1084,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 err;
trace_nfsd_read_start(rqstp, fhp, offset, *count);
- err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+ err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf);
if (err)
return err;
@@ -1116,7 +1116,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
- err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
+ err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
if (err)
goto out;
In a moment I'm going to introduce separate nfsd_file types, one of
which is garbage-collected; the other, not. The garbage-collected
variety is to be used by NFSv2 and v3, and the non-garbage-collected
variety is to be used by NFSv4.
nfsd_commit() is invoked by both NFSv3 and NFSv4 consumers. We want
nfsd_commit() to find and use the correct variety of cached
nfsd_file object for the NFS version that is in use.
This is a refactoring change. No behavior change is expected.
Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Jeff Layton <[email protected]>
Reviewed-by: Jeff Layton <[email protected]>
---
fs/nfsd/nfs3proc.c | 10 +++++++++-
fs/nfsd/nfs4proc.c | 11 ++++++++++-
fs/nfsd/vfs.c | 15 ++++-----------
fs/nfsd/vfs.h | 3 ++-
4 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 923d9a80df92..ff2920546333 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -13,6 +13,7 @@
#include "cache.h"
#include "xdr3.h"
#include "vfs.h"
+#include "filecache.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -763,6 +764,7 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
{
struct nfsd3_commitargs *argp = rqstp->rq_argp;
struct nfsd3_commitres *resp = rqstp->rq_resp;
+ struct nfsd_file *nf;
dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
SVCFH_fmt(&argp->fh),
@@ -770,8 +772,14 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
(unsigned long long) argp->offset);
fh_copy(&resp->fh, &argp->fh);
- resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset,
+ resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE |
+ NFSD_MAY_NOT_BREAK_LEASE, &nf);
+ if (resp->status)
+ goto out;
+ resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
argp->count, resp->verf);
+ nfsd_file_put(nf);
+out:
return rpc_success;
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8beb2bc4c328..42db62413890 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -731,10 +731,19 @@ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_commit *commit = &u->commit;
+ struct nfsd_file *nf;
+ __be32 status;
- return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ status = nfsd_file_acquire(rqstp, &cstate->current_fh, NFSD_MAY_WRITE |
+ NFSD_MAY_NOT_BREAK_LEASE, &nf);
+ if (status != nfs_ok)
+ return status;
+
+ status = nfsd_commit(rqstp, &cstate->current_fh, nf, commit->co_offset,
commit->co_count,
(__be32 *)commit->co_verf.data);
+ nfsd_file_put(nf);
+ return status;
}
static __be32
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f650afedd67f..91c34cef11d8 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1132,6 +1132,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
* nfsd_commit - Commit pending writes to stable storage
* @rqstp: RPC request being processed
* @fhp: NFS filehandle
+ * @nf: target file
* @offset: raw offset from beginning of file
* @count: raw count of bytes to sync
* @verf: filled in with the server's current write verifier
@@ -1148,19 +1149,13 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
* An nfsstat value in network byte order.
*/
__be32
-nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
- u32 count, __be32 *verf)
+nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+ u64 offset, u32 count, __be32 *verf)
{
+ __be32 err = nfs_ok;
u64 maxbytes;
loff_t start, end;
struct nfsd_net *nn;
- struct nfsd_file *nf;
- __be32 err;
-
- err = nfsd_file_acquire(rqstp, fhp,
- NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
- if (err)
- goto out;
/*
* Convert the client-provided (offset, count) range to a
@@ -1201,8 +1196,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
} else
nfsd_copy_write_verifier(verf, nn);
- nfsd_file_put(nf);
-out:
return err;
}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 120521bc7b24..9744b041105b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -88,7 +88,8 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct svc_fh *resfhp, struct nfsd_attrs *iap);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
- u64 offset, u32 count, __be32 *verf);
+ struct nfsd_file *nf, u64 offset, u32 count,
+ __be32 *verf);
#ifdef CONFIG_NFSD_V4
__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *name, void **bufp, int *lenp);
Delegation revocation is an exceptional event that is not otherwise
visible externally (eg, no network traffic is emitted). Generate a
trace record when it occurs so that revocation can be observed or
other activity can be triggered. Example:
nfsd-1104 [005] 1912.002544: nfsd_stid_revoke: client 633c9343:4e82788d stateid 00000003:00000001 ref=2 type=DELEG
Trace infrastructure is provided for subsequent additional tracing
related to nfs4_stid activity.
Reviewed-by: Jeff Layton <[email protected]>
Tested-by: Jeff Layton <[email protected]>
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4state.c | 2 ++
fs/nfsd/trace.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c829b828b6fd..db58d5c0bba8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1355,6 +1355,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ trace_nfsd_stid_revoke(&dp->dl_stid);
+
if (clp->cl_minorversion) {
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
refcount_inc(&dp->dl_stid.sc_count);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index b065a4b1e0dc..2809c2053acc 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -633,6 +633,61 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
DEFINE_STATESEQID_EVENT(preprocess);
DEFINE_STATESEQID_EVENT(open_confirm);
+TRACE_DEFINE_ENUM(NFS4_OPEN_STID);
+TRACE_DEFINE_ENUM(NFS4_LOCK_STID);
+TRACE_DEFINE_ENUM(NFS4_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_CLOSED_STID);
+TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID);
+
+#define show_stid_type(x) \
+ __print_flags(x, "|", \
+ { NFS4_OPEN_STID, "OPEN" }, \
+ { NFS4_LOCK_STID, "LOCK" }, \
+ { NFS4_DELEG_STID, "DELEG" }, \
+ { NFS4_CLOSED_STID, "CLOSED" }, \
+ { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \
+ { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \
+ { NFS4_LAYOUT_STID, "LAYOUT" })
+
+DECLARE_EVENT_CLASS(nfsd_stid_class,
+ TP_PROTO(
+ const struct nfs4_stid *stid
+ ),
+ TP_ARGS(stid),
+ TP_STRUCT__entry(
+ __field(unsigned long, sc_type)
+ __field(int, sc_count)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ ),
+ TP_fast_assign(
+ const stateid_t *stp = &stid->sc_stateid;
+
+ __entry->sc_type = stid->sc_type;
+ __entry->sc_count = refcount_read(&stid->sc_count);
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ ),
+ TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s",
+ __entry->cl_boot, __entry->cl_id,
+ __entry->si_id, __entry->si_generation,
+ __entry->sc_count, show_stid_type(__entry->sc_type)
+ )
+);
+
+#define DEFINE_STID_EVENT(name) \
+DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \
+ TP_PROTO(const struct nfs4_stid *stid), \
+ TP_ARGS(stid))
+
+DEFINE_STID_EVENT(revoke);
+
DECLARE_EVENT_CLASS(nfsd_clientid_class,
TP_PROTO(const clientid_t *clid),
TP_ARGS(clid),
Name nfs4_file-related helpers consistently. There are already
nfs4_file_yada functions, so let's go with the same convention used
by put_nfs4_file(): init_nfs4_file().
Change the @fh parameter to be const pointer for better type safety.
Finally, move the hash insertion operation to the caller. This is
typical for most other "init_object" type helpers, and it is where
most of the other nfs4_file hash table operations are located.
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4state.c | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 90c991e500a8..644ff4310fa9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4262,11 +4262,8 @@ static struct nfs4_file *nfsd4_alloc_file(void)
}
/* OPEN Share state helper functions */
-static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
- struct nfs4_file *fp)
+static void init_nfs4_file(const struct svc_fh *fh, struct nfs4_file *fp)
{
- lockdep_assert_held(&state_lock);
-
refcount_set(&fp->fi_ref, 1);
spin_lock_init(&fp->fi_lock);
INIT_LIST_HEAD(&fp->fi_stateids);
@@ -4284,7 +4281,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
INIT_LIST_HEAD(&fp->fi_lo_states);
atomic_set(&fp->fi_lo_recalls, 0);
#endif
- hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
}
void
@@ -4702,7 +4698,8 @@ static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
fp->fi_aliased = alias_found = true;
}
if (likely(ret == NULL)) {
- nfsd4_init_file(fh, hashval, new);
+ init_nfs4_file(fh, new);
+ hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]);
new->fi_aliased = alias_found;
ret = new;
}
Remove the call to find_file_locked() in insert_nfs4_file(). Tracing
shows that over 99% of these calls return NULL. Thus it is not worth
the expense of the extra bucket list traversal. insert_file() already
deals correctly with the case where the item is already in the hash
bucket.
Since insert_nfs4_file() is now just a wrapper around insert_file(),
move the meat of insert_file() into insert_nfs4_file() and get rid
of it.
Name nfs4_file-related helpers consistently. There are already
nfs4_file_yada functions, so let's go with the same convention used
by put_nfs4_file(): insert_nfs4_file().
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4state.c | 64 ++++++++++++++++++++++-----------------------------
1 file changed, 28 insertions(+), 36 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 373b1d52fcd7..529995a9e916 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4682,24 +4682,42 @@ find_file_locked(const struct svc_fh *fh, unsigned int hashval)
return NULL;
}
-static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
- unsigned int hashval)
+static struct nfs4_file * find_file(struct svc_fh *fh)
{
struct nfs4_file *fp;
+ unsigned int hashval = file_hashval(fh);
+
+ rcu_read_lock();
+ fp = find_file_locked(fh, hashval);
+ rcu_read_unlock();
+ return fp;
+}
+
+/*
+ * On hash insertion, identify entries with the same inode but
+ * distinct filehandles. They will all be in the same hash bucket
+ * because nfs4_file's are hashed by the address in the fi_inode
+ * field.
+ */
+static noinline_for_stack struct nfs4_file *
+insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
+{
+ unsigned int hashval = file_hashval(fhp);
struct nfs4_file *ret = NULL;
bool alias_found = false;
+ struct nfs4_file *fi;
spin_lock(&state_lock);
- hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
+ hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
lockdep_is_held(&state_lock)) {
- if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
- if (refcount_inc_not_zero(&fp->fi_ref))
- ret = fp;
- } else if (d_inode(fh->fh_dentry) == fp->fi_inode)
- fp->fi_aliased = alias_found = true;
+ if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
+ if (refcount_inc_not_zero(&fi->fi_ref))
+ ret = fi;
+ } else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
+ fi->fi_aliased = alias_found = true;
}
if (likely(ret == NULL)) {
- init_nfs4_file(fh, new);
+ init_nfs4_file(fhp, new);
hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]);
new->fi_aliased = alias_found;
ret = new;
@@ -4708,32 +4726,6 @@ static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
return ret;
}
-static struct nfs4_file * find_file(struct svc_fh *fh)
-{
- struct nfs4_file *fp;
- unsigned int hashval = file_hashval(fh);
-
- rcu_read_lock();
- fp = find_file_locked(fh, hashval);
- rcu_read_unlock();
- return fp;
-}
-
-static struct nfs4_file *
-find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
-{
- struct nfs4_file *fp;
- unsigned int hashval = file_hashval(fh);
-
- rcu_read_lock();
- fp = find_file_locked(fh, hashval);
- rcu_read_unlock();
- if (fp)
- return fp;
-
- return insert_file(new, fh, hashval);
-}
-
static noinline_for_stack void remove_nfs4_file_locked(struct nfs4_file *fi)
{
hlist_del_rcu(&fi->fi_hash);
@@ -5624,7 +5616,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* and check for delegations in the process of being recalled.
* If not found, create the nfs4_file struct
*/
- fp = find_or_add_file(open->op_file, current_fh);
+ fp = insert_nfs4_file(open->op_file, current_fh);
if (fp != open->op_file) {
status = nfs4_check_deleg(cl, open, &dp);
if (status)
Refactor to relocate hash deletion operation to a helper function
that is close to most other nfs4_file data structure operations.
The "noinline" annotation will become useful in a moment when the
hlist_del_rcu() is replaced with a more complex rhash remove
operation. It also guarantees that hash remove operations can be
traced with "-p function -l remove_nfs4_file_locked".
This also simplifies the organization of forward declarations: the
to-be-added rhashtable and its param structure will be defined
/after/ put_nfs4_file().
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4state.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 644ff4310fa9..373b1d52fcd7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -84,6 +84,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
void nfsd4_end_grace(struct nfsd_net *nn);
static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
+static void remove_nfs4_file_locked(struct nfs4_file *fi);
/* Locking: */
@@ -591,7 +592,7 @@ put_nfs4_file(struct nfs4_file *fi)
might_lock(&state_lock);
if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
- hlist_del_rcu(&fi->fi_hash);
+ remove_nfs4_file_locked(fi);
spin_unlock(&state_lock);
WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
@@ -4733,6 +4734,11 @@ find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
return insert_file(new, fh, hashval);
}
+static noinline_for_stack void remove_nfs4_file_locked(struct nfs4_file *fi)
+{
+ hlist_del_rcu(&fi->fi_hash);
+}
+
/*
* Called to check deny when READ with all zero stateid or
* WRITE with all zero or all one stateid
find_file() is now the only caller of find_file_locked(), so just
fold these two together.
Name nfs4_file-related helpers consistently. There are already
nfs4_file_yada functions, so let's go with the same convention used
by put_nfs4_file(): find_nfs4_file().
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4state.c | 35 ++++++++++++++---------------------
1 file changed, 14 insertions(+), 21 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 529995a9e916..abed795bb4ec 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4666,31 +4666,23 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
nfs4_put_stid(&last->st_stid);
}
-/* search file_hashtbl[] for file */
-static struct nfs4_file *
-find_file_locked(const struct svc_fh *fh, unsigned int hashval)
+static noinline_for_stack struct nfs4_file *
+find_nfs4_file(const struct svc_fh *fhp)
{
- struct nfs4_file *fp;
+ unsigned int hashval = file_hashval(fhp);
+ struct nfs4_file *fi = NULL;
- hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
- lockdep_is_held(&state_lock)) {
- if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
- if (refcount_inc_not_zero(&fp->fi_ref))
- return fp;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
+ lockdep_is_held(&state_lock)) {
+ if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
+ if (!refcount_inc_not_zero(&fi->fi_ref))
+ fi = NULL;
+ break;
}
}
- return NULL;
-}
-
-static struct nfs4_file * find_file(struct svc_fh *fh)
-{
- struct nfs4_file *fp;
- unsigned int hashval = file_hashval(fh);
-
- rcu_read_lock();
- fp = find_file_locked(fh, hashval);
rcu_read_unlock();
- return fp;
+ return fi;
}
/*
@@ -4741,9 +4733,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
struct nfs4_file *fp;
__be32 ret = nfs_ok;
- fp = find_file(current_fh);
+ fp = find_nfs4_file(current_fh);
if (!fp)
return ret;
+
/* Check for conflicting share reservations */
spin_lock(&fp->fi_lock);
if (fp->fi_share_deny & deny_type)
Introduce the infrastructure for managing nfs4_file objects in an
rhashtable. This infrastructure will be used by the next patch.
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4state.c | 23 ++++++++++++++++++++++-
fs/nfsd/state.h | 1 +
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index abed795bb4ec..681cb2daa843 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,7 +44,9 @@
#include <linux/jhash.h>
#include <linux/string_helpers.h>
#include <linux/fsnotify.h>
+#include <linux/rhashtable.h>
#include <linux/nfs_ssc.h>
+
#include "xdr4.h"
#include "xdr4cb.h"
#include "vfs.h"
@@ -721,6 +723,18 @@ static unsigned int file_hashval(const struct svc_fh *fh)
static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
+
+static const struct rhashtable_params nfs4_file_rhash_params = {
+ .key_len = sizeof_field(struct nfs4_file, fi_inode),
+ .key_offset = offsetof(struct nfs4_file, fi_inode),
+ .head_offset = offsetof(struct nfs4_file, fi_rlist),
+
+ /* Reduce resizing churn on light workloads */
+ .min_size = 256, /* buckets */
+ .automatic_shrinking = true,
+};
+
/*
* Check if courtesy clients have conflicting access and resolve it if possible
*
@@ -8023,10 +8037,16 @@ nfs4_state_start(void)
{
int ret;
- ret = nfsd4_create_callback_queue();
+ ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
if (ret)
return ret;
+ ret = nfsd4_create_callback_queue();
+ if (ret) {
+ rhltable_destroy(&nfs4_file_rhltable);
+ return ret;
+ }
+
set_max_delegations();
return 0;
}
@@ -8057,6 +8077,7 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
+ rhltable_destroy(&nfs4_file_rhltable);
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
nfsd4_ssc_shutdown_umount(nn);
#endif
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e2daef3cc003..190fc7e418a4 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -546,6 +546,7 @@ struct nfs4_file {
bool fi_aliased;
spinlock_t fi_lock;
struct hlist_node fi_hash; /* hash on fi_fhandle */
+ struct rhlist_head fi_rlist;
struct list_head fi_stateids;
union {
struct list_head fi_delegations;
fh_match() is costly, especially when filehandles are large (as is
the case for NFSv4). It needs to be used sparingly when searching
data structures. Unfortunately, with common workloads, I see
multiple thousands of objects stored in file_hashtbl[], which always
has only 256 buckets, which makes the hash chains quite lengthy.
Walking long hash chains with the state_lock held blocks other
activity that needs that lock.
To help mitigate the cost of searching with fh_match(), replace the
nfs4_file hash table with an rhashtable, which can dynamically
resize its bucket array to minimize hash chain length. The ideal for
this use case is one bucket per inode.
The result is an improvement in the latency of NFSv4 operations
and the reduction of nfsd CPU utilization due to the cost of
fh_match() and the CPU cache misses incurred while walking long
hash chains in the nfs4_file hash table.
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4state.c | 64 +++++++++++++++++++++++++--------------------------
fs/nfsd/state.h | 4 ---
2 files changed, 31 insertions(+), 37 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 681cb2daa843..5b90e5a6a04f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
void
put_nfs4_file(struct nfs4_file *fi)
{
- might_lock(&state_lock);
-
- if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
+ if (refcount_dec_and_test(&fi->fi_ref)) {
remove_nfs4_file_locked(fi);
- spin_unlock(&state_lock);
WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
@@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
return ret & OWNER_HASH_MASK;
}
-/* hash table for nfs4_file */
-#define FILE_HASH_BITS 8
-#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
-
-static unsigned int file_hashval(const struct svc_fh *fh)
-{
- struct inode *inode = d_inode(fh->fh_dentry);
-
- /* XXX: why not (here & in file cache) use inode? */
- return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
-}
-
-static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
-
static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
static const struct rhashtable_params nfs4_file_rhash_params = {
@@ -4683,12 +4666,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
static noinline_for_stack struct nfs4_file *
find_nfs4_file(const struct svc_fh *fhp)
{
- unsigned int hashval = file_hashval(fhp);
+ struct rhlist_head *tmp, *list;
struct nfs4_file *fi = NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
- lockdep_is_held(&state_lock)) {
+ list = rhltable_lookup(&nfs4_file_rhltable, fhp,
+ nfs4_file_rhash_params);
+ rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
if (!refcount_inc_not_zero(&fi->fi_ref))
fi = NULL;
@@ -4708,33 +4692,45 @@ find_nfs4_file(const struct svc_fh *fhp)
static noinline_for_stack struct nfs4_file *
insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
{
- unsigned int hashval = file_hashval(fhp);
+ struct rhlist_head *tmp, *list;
struct nfs4_file *ret = NULL;
bool alias_found = false;
struct nfs4_file *fi;
+ int err;
- spin_lock(&state_lock);
- hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
- lockdep_is_held(&state_lock)) {
+ rcu_read_lock();
+
+ list = rhltable_lookup(&nfs4_file_rhltable, fhp,
+ nfs4_file_rhash_params);
+ rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
if (refcount_inc_not_zero(&fi->fi_ref))
ret = fi;
} else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
fi->fi_aliased = alias_found = true;
}
- if (likely(ret == NULL)) {
- init_nfs4_file(fhp, new);
- hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]);
- new->fi_aliased = alias_found;
- ret = new;
- }
- spin_unlock(&state_lock);
+ if (ret)
+ goto out_unlock;
+
+ init_nfs4_file(fhp, new);
+ err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
+ &new->fi_rlist,
+ nfs4_file_rhash_params);
+ if (err)
+ goto out_unlock;
+
+ new->fi_aliased = alias_found;
+ ret = new;
+
+out_unlock:
+ rcu_read_unlock();
return ret;
}
static noinline_for_stack void remove_nfs4_file_locked(struct nfs4_file *fi)
{
- hlist_del_rcu(&fi->fi_hash);
+ rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
+ nfs4_file_rhash_params);
}
/*
@@ -5624,6 +5620,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* If not found, create the nfs4_file struct
*/
fp = insert_nfs4_file(open->op_file, current_fh);
+ if (unlikely(!fp))
+ return nfserr_jukebox;
if (fp != open->op_file) {
status = nfs4_check_deleg(cl, open, &dp);
if (status)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 190fc7e418a4..eadd7f465bf5 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -536,16 +536,12 @@ struct nfs4_clnt_odstate {
* inode can have multiple filehandles associated with it, so there is
* (potentially) a many to one relationship between this struct and struct
* inode.
- *
- * These are hashed by filehandle in the file_hashtbl, which is protected by
- * the global state_lock spinlock.
*/
struct nfs4_file {
refcount_t fi_ref;
struct inode * fi_inode;
bool fi_aliased;
spinlock_t fi_lock;
- struct hlist_node fi_hash; /* hash on fi_fhandle */
struct rhlist_head fi_rlist;
struct list_head fi_stateids;
union {
This reverts commit 5e138c4a750dc140d881dab4a8804b094bbc08d2.
That commit attempted to make files available to other users as soon
as all NFSv4 clients were done with them, rather than waiting until
the filecache LRU had garbage collected them.
It gets the reference counting wrong, for one thing.
But it also misses that DELEGRETURN should release a file in the
same fashion. In fact, any nfsd_file_put() on a file held open
by an NFSv4 client needs potentially to release the file
immediately...
Clear the way for implementing that idea.
Signed-off-by: Chuck Lever <[email protected]>
Reviewed-by: Jeff Layton <[email protected]>
---
fs/nfsd/filecache.c | 18 ------------------
fs/nfsd/filecache.h | 1 -
fs/nfsd/nfs4state.c | 4 ++--
3 files changed, 2 insertions(+), 21 deletions(-)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 29a62db155fb..beb41a507623 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -444,24 +444,6 @@ nfsd_file_put(struct nfsd_file *nf)
nfsd_file_put_noref(nf);
}
-/**
- * nfsd_file_close - Close an nfsd_file
- * @nf: nfsd_file to close
- *
- * If this is the final reference for @nf, free it immediately.
- * This reflects an on-the-wire CLOSE or DELEGRETURN into the
- * VFS and exported filesystem.
- */
-void nfsd_file_close(struct nfsd_file *nf)
-{
- nfsd_file_put(nf);
- if (refcount_dec_if_one(&nf->nf_ref)) {
- nfsd_file_unhash(nf);
- nfsd_file_lru_remove(nf);
- nfsd_file_free(nf);
- }
-}
-
struct nfsd_file *
nfsd_file_get(struct nfsd_file *nf)
{
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 357832bac736..6b012ea4bd9d 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -52,7 +52,6 @@ void nfsd_file_cache_shutdown(void);
int nfsd_file_cache_start_net(struct net *net);
void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
-void nfsd_file_close(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
bool nfsd_file_is_cached(struct inode *inode);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4e718500a00c..c829b828b6fd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -831,9 +831,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
swap(f2, fp->fi_fds[O_RDWR]);
spin_unlock(&fp->fi_lock);
if (f1)
- nfsd_file_close(f1);
+ nfsd_file_put(f1);
if (f2)
- nfsd_file_close(f2);
+ nfsd_file_put(f2);
}
}
Remove the lame-duck dprintk()s around nfs4_preprocess_stateid_op()
call sites.
Reviewed-by: Jeff Layton <[email protected]>
Tested-by: Jeff Layton <[email protected]>
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4proc.c | 31 +++++++------------------------
1 file changed, 7 insertions(+), 24 deletions(-)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 42db62413890..6f7e0c5e62d2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -943,12 +943,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&read->rd_stateid, RD_STATE,
&read->rd_nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
- goto out;
- }
- status = nfs_ok;
-out:
+
read->rd_rqstp = rqstp;
read->rd_fhp = &cstate->current_fh;
return status;
@@ -1117,10 +1112,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate,
&cstate->current_fh, &setattr->sa_stateid,
WR_STATE, NULL, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
+ if (status)
return status;
- }
}
err = fh_want_write(&cstate->current_fh);
if (err)
@@ -1168,10 +1161,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_offset, cnt);
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
stateid, WR_STATE, &nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
+ if (status)
return status;
- }
write->wr_how_written = write->wr_stable_how;
@@ -1202,17 +1193,13 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
src_stateid, RD_STATE, src, NULL);
- if (status) {
- dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
+ if (status)
goto out;
- }
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
dst_stateid, WR_STATE, dst, NULL);
- if (status) {
- dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
+ if (status)
goto out_put_src;
- }
/* fix up for NFS-specific error code */
if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) ||
@@ -1957,10 +1944,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&fallocate->falloc_stateid,
WR_STATE, &nf, NULL);
- if (status != nfs_ok) {
- dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
+ if (status != nfs_ok)
return status;
- }
status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file,
fallocate->falloc_offset,
@@ -2016,10 +2001,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&seek->seek_stateid,
RD_STATE, &nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
+ if (status)
return status;
- }
switch (seek->seek_whence) {
case NFS4_CONTENT_DATA:
Enable callers to use const pointers where they are able to.
Reviewed-by: Jeff Layton <[email protected]>
Tested-by: Jeff Layton <[email protected]>
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfsfh.h | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index c3ae6414fc5c..513e028b0bbe 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -220,7 +220,7 @@ __be32 fh_update(struct svc_fh *);
void fh_put(struct svc_fh *);
static __inline__ struct svc_fh *
-fh_copy(struct svc_fh *dst, struct svc_fh *src)
+fh_copy(struct svc_fh *dst, const struct svc_fh *src)
{
WARN_ON(src->fh_dentry);
@@ -229,7 +229,7 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src)
}
static inline void
-fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
+fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src)
{
dst->fh_size = src->fh_size;
memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
@@ -243,7 +243,8 @@ fh_init(struct svc_fh *fhp, int maxsize)
return fhp;
}
-static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+static inline bool fh_match(const struct knfsd_fh *fh1,
+ const struct knfsd_fh *fh2)
{
if (fh1->fh_size != fh2->fh_size)
return false;
@@ -252,7 +253,8 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
return true;
}
-static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+static inline bool fh_fsid_match(const struct knfsd_fh *fh1,
+ const struct knfsd_fh *fh2)
{
if (fh1->fh_fsid_type != fh2->fh_fsid_type)
return false;
Enable callers to use const pointers for type safety.
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfs4state.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index db58d5c0bba8..90c991e500a8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -710,7 +710,7 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
#define FILE_HASH_BITS 8
#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
-static unsigned int file_hashval(struct svc_fh *fh)
+static unsigned int file_hashval(const struct svc_fh *fh)
{
struct inode *inode = d_inode(fh->fh_dentry);
@@ -4671,7 +4671,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
/* search file_hashtbl[] for file */
static struct nfs4_file *
-find_file_locked(struct svc_fh *fh, unsigned int hashval)
+find_file_locked(const struct svc_fh *fh, unsigned int hashval)
{
struct nfs4_file *fp;
On Tue, 25 Oct 2022, Chuck Lever wrote:
> find_file() is now the only caller of find_file_locked(), so just
> fold these two together.
>
> Name nfs4_file-related helpers consistently. There are already
> nfs4_file_yada functions, so let's go with the same convention used
> by put_nfs4_file(): find_nfs4_file().
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> fs/nfsd/nfs4state.c | 35 ++++++++++++++---------------------
> 1 file changed, 14 insertions(+), 21 deletions(-)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 529995a9e916..abed795bb4ec 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4666,31 +4666,23 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> nfs4_put_stid(&last->st_stid);
> }
>
> -/* search file_hashtbl[] for file */
> -static struct nfs4_file *
> -find_file_locked(const struct svc_fh *fh, unsigned int hashval)
> +static noinline_for_stack struct nfs4_file *
> +find_nfs4_file(const struct svc_fh *fhp)
> {
> - struct nfs4_file *fp;
> + unsigned int hashval = file_hashval(fhp);
> + struct nfs4_file *fi = NULL;
>
> - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
> - lockdep_is_held(&state_lock)) {
> - if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
> - if (refcount_inc_not_zero(&fp->fi_ref))
> - return fp;
> + rcu_read_lock();
> + hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> + lockdep_is_held(&state_lock)) {
> + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> + if (!refcount_inc_not_zero(&fi->fi_ref))
> + fi = NULL;
This looks .... imperfect to me.
If the refcount_inc_not_zero fails, it means we haven't found what we
are looking for, so why break out of the loop?
I guess we can *know* that if some other thread has deleted the entry
and some third thread has added a new entry then the new entry will be
early in the list so it doesn't hurt to stop now. But it seems
unnecessary.
I would write this.
if (fh_match(&fi->fi_fhandle, &fhp->fh_handle) &&
refcount_inc_not_zero(&fi->fi_ref))
break;
> + break;
> }
> }
..
> rcu_read_unlock();
..
> + return fi;
This assumes that when the loop completes normally, the loop variable is
NULL. That is the case for this particular for_each loop, but isn't for
some others. There was a recent spate of patches for for_each loops
that made the wrong assumption so I feel a bit wary.
At most I might suggest a comment ... but that probably wouldn't help at
all. So it is probably fine as it is.
So I'd like to see the loop simplified to remove the assignment
(fi = NULL), but either way
Reviewed-by: NeilBrown <[email protected]>
Thanks,
NeilBrown
On Tue, 25 Oct 2022, Chuck Lever wrote:
> Introduce the infrastructure for managing nfs4_file objects in an
> rhashtable. This infrastructure will be used by the next patch.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> fs/nfsd/nfs4state.c | 23 ++++++++++++++++++++++-
> fs/nfsd/state.h | 1 +
> 2 files changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index abed795bb4ec..681cb2daa843 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -44,7 +44,9 @@
> #include <linux/jhash.h>
> #include <linux/string_helpers.h>
> #include <linux/fsnotify.h>
> +#include <linux/rhashtable.h>
> #include <linux/nfs_ssc.h>
> +
> #include "xdr4.h"
> #include "xdr4cb.h"
> #include "vfs.h"
> @@ -721,6 +723,18 @@ static unsigned int file_hashval(const struct svc_fh *fh)
>
> static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
>
> +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
> +
> +static const struct rhashtable_params nfs4_file_rhash_params = {
> + .key_len = sizeof_field(struct nfs4_file, fi_inode),
> + .key_offset = offsetof(struct nfs4_file, fi_inode),
> + .head_offset = offsetof(struct nfs4_file, fi_rlist),
> +
> + /* Reduce resizing churn on light workloads */
> + .min_size = 256, /* buckets */
Every time I see this line a groan a little bit. Probably I'm groaning
at rhashtable - you shouldn't need to have to worry about these sorts of
details when using an API... but I agree that avoiding churn is likely
a good idea.
Where did 256 come from? Would PAGE_SIZE/sizeof(void*) make more sense
(though that is 512).
How much churn is too much? The default is 4 and we grow at >75% and
shrink at <30%, so at 4 entries the table would resize to 8, and at 2
entries it would shrink back. That does sound like churn.
If we choose 8, then at 7 we grow to 16 and at 4 we go back to 8.
If we choose 16, then at 13 we grow to 32 and at 9 we go back to 16.
If we choose 64, then at 49 we grow to 128 and at 39 we go back to 64.
The margin seems rather narrow. Maybe 30% is too high - 15% might be a
lot better. But changing that might be difficult.
So I don't have a good recommendation, but I don't like magic numbers.
Maybe PAGE_SIZE/sizeof(void*) ??
But either way
Reviewed-by: NeilBrown <[email protected]>
Thanks,
NeilBrown
> + .automatic_shrinking = true,
> +};
> +
> /*
> * Check if courtesy clients have conflicting access and resolve it if possible
> *
> @@ -8023,10 +8037,16 @@ nfs4_state_start(void)
> {
> int ret;
>
> - ret = nfsd4_create_callback_queue();
> + ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
> if (ret)
> return ret;
>
> + ret = nfsd4_create_callback_queue();
> + if (ret) {
> + rhltable_destroy(&nfs4_file_rhltable);
> + return ret;
> + }
> +
> set_max_delegations();
> return 0;
> }
> @@ -8057,6 +8077,7 @@ nfs4_state_shutdown_net(struct net *net)
>
> nfsd4_client_tracking_exit(net);
> nfs4_state_destroy_net(net);
> + rhltable_destroy(&nfs4_file_rhltable);
> #ifdef CONFIG_NFSD_V4_2_INTER_SSC
> nfsd4_ssc_shutdown_umount(nn);
> #endif
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index e2daef3cc003..190fc7e418a4 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -546,6 +546,7 @@ struct nfs4_file {
> bool fi_aliased;
> spinlock_t fi_lock;
> struct hlist_node fi_hash; /* hash on fi_fhandle */
> + struct rhlist_head fi_rlist;
> struct list_head fi_stateids;
> union {
> struct list_head fi_delegations;
>
>
>
On Tue, 25 Oct 2022, Chuck Lever wrote:
> fh_match() is costly, especially when filehandles are large (as is
> the case for NFSv4). It needs to be used sparingly when searching
> data structures. Unfortunately, with common workloads, I see
> multiple thousands of objects stored in file_hashtbl[], which always
> has only 256 buckets, which makes the hash chains quite lengthy.
>
> Walking long hash chains with the state_lock held blocks other
> activity that needs that lock.
>
> To help mitigate the cost of searching with fh_match(), replace the
> nfs4_file hash table with an rhashtable, which can dynamically
> resize its bucket array to minimize hash chain length. The ideal for
> this use case is one bucket per inode.
>
> The result is an improvement in the latency of NFSv4 operations
> and the reduction of nfsd CPU utilization due to the cost of
> fh_match() and the CPU cache misses incurred while walking long
> hash chains in the nfs4_file hash table.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> fs/nfsd/nfs4state.c | 64 +++++++++++++++++++++++++--------------------------
> fs/nfsd/state.h | 4 ---
> 2 files changed, 31 insertions(+), 37 deletions(-)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 681cb2daa843..5b90e5a6a04f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
> void
> put_nfs4_file(struct nfs4_file *fi)
> {
> - might_lock(&state_lock);
> -
> - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
> + if (refcount_dec_and_test(&fi->fi_ref)) {
> remove_nfs4_file_locked(fi);
> - spin_unlock(&state_lock);
> WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
> WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
> call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
> @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
> return ret & OWNER_HASH_MASK;
> }
>
> -/* hash table for nfs4_file */
> -#define FILE_HASH_BITS 8
> -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
> -
> -static unsigned int file_hashval(const struct svc_fh *fh)
> -{
> - struct inode *inode = d_inode(fh->fh_dentry);
> -
> - /* XXX: why not (here & in file cache) use inode? */
> - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
> -}
> -
> -static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
> -
> static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
>
> static const struct rhashtable_params nfs4_file_rhash_params = {
> @@ -4683,12 +4666,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> static noinline_for_stack struct nfs4_file *
> find_nfs4_file(const struct svc_fh *fhp)
> {
> - unsigned int hashval = file_hashval(fhp);
> + struct rhlist_head *tmp, *list;
> struct nfs4_file *fi = NULL;
>
> rcu_read_lock();
> - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> - lockdep_is_held(&state_lock)) {
> + list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> + nfs4_file_rhash_params);
> + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
> if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> if (!refcount_inc_not_zero(&fi->fi_ref))
> fi = NULL;
> @@ -4708,33 +4692,45 @@ find_nfs4_file(const struct svc_fh *fhp)
> static noinline_for_stack struct nfs4_file *
> insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
> {
> - unsigned int hashval = file_hashval(fhp);
> + struct rhlist_head *tmp, *list;
> struct nfs4_file *ret = NULL;
> bool alias_found = false;
> struct nfs4_file *fi;
> + int err;
>
> - spin_lock(&state_lock);
> - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> - lockdep_is_held(&state_lock)) {
> + rcu_read_lock();
> +
As mentioned separately, we need some sort of lock around this loop and
the later insert.
> + list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> + nfs4_file_rhash_params);
> + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
> if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> if (refcount_inc_not_zero(&fi->fi_ref))
> ret = fi;
> } else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
> fi->fi_aliased = alias_found = true;
This d_inode() == fi->fi_inode test is no longer relevant. Everything in
the list must have the same inode. Best remove it.
Thanks,
NeilBrown
> }
> - if (likely(ret == NULL)) {
> - init_nfs4_file(fhp, new);
> - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]);
> - new->fi_aliased = alias_found;
> - ret = new;
> - }
> - spin_unlock(&state_lock);
> + if (ret)
> + goto out_unlock;
> +
> + init_nfs4_file(fhp, new);
> + err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
> + &new->fi_rlist,
> + nfs4_file_rhash_params);
> + if (err)
> + goto out_unlock;
> +
> + new->fi_aliased = alias_found;
> + ret = new;
> +
> +out_unlock:
> + rcu_read_unlock();
> return ret;
> }
>
> static noinline_for_stack void remove_nfs4_file_locked(struct nfs4_file *fi)
> {
> - hlist_del_rcu(&fi->fi_hash);
> + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
> + nfs4_file_rhash_params);
> }
>
> /*
> @@ -5624,6 +5620,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> * If not found, create the nfs4_file struct
> */
> fp = insert_nfs4_file(open->op_file, current_fh);
> + if (unlikely(!fp))
> + return nfserr_jukebox;
> if (fp != open->op_file) {
> status = nfs4_check_deleg(cl, open, &dp);
> if (status)
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 190fc7e418a4..eadd7f465bf5 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -536,16 +536,12 @@ struct nfs4_clnt_odstate {
> * inode can have multiple filehandles associated with it, so there is
> * (potentially) a many to one relationship between this struct and struct
> * inode.
> - *
> - * These are hashed by filehandle in the file_hashtbl, which is protected by
> - * the global state_lock spinlock.
> */
> struct nfs4_file {
> refcount_t fi_ref;
> struct inode * fi_inode;
> bool fi_aliased;
> spinlock_t fi_lock;
> - struct hlist_node fi_hash; /* hash on fi_fhandle */
> struct rhlist_head fi_rlist;
> struct list_head fi_stateids;
> union {
>
>
>
> On Oct 24, 2022, at 7:37 PM, NeilBrown <[email protected]> wrote:
>
> On Tue, 25 Oct 2022, Chuck Lever wrote:
>> Introduce the infrastructure for managing nfs4_file objects in an
>> rhashtable. This infrastructure will be used by the next patch.
>>
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> fs/nfsd/nfs4state.c | 23 ++++++++++++++++++++++-
>> fs/nfsd/state.h | 1 +
>> 2 files changed, 23 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index abed795bb4ec..681cb2daa843 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -44,7 +44,9 @@
>> #include <linux/jhash.h>
>> #include <linux/string_helpers.h>
>> #include <linux/fsnotify.h>
>> +#include <linux/rhashtable.h>
>> #include <linux/nfs_ssc.h>
>> +
>> #include "xdr4.h"
>> #include "xdr4cb.h"
>> #include "vfs.h"
>> @@ -721,6 +723,18 @@ static unsigned int file_hashval(const struct svc_fh *fh)
>>
>> static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
>>
>> +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
>> +
>> +static const struct rhashtable_params nfs4_file_rhash_params = {
>> + .key_len = sizeof_field(struct nfs4_file, fi_inode),
>> + .key_offset = offsetof(struct nfs4_file, fi_inode),
>> + .head_offset = offsetof(struct nfs4_file, fi_rlist),
>> +
>> + /* Reduce resizing churn on light workloads */
>> + .min_size = 256, /* buckets */
>
> Every time I see this line a groan a little bit. Probably I'm groaning
> at rhashtable - you shouldn't need to have to worry about these sorts of
> details when using an API... but I agree that avoiding churn is likely
> a good idea.
>
> Where did 256 come from?
Here's the current file_hashtbl definition:
710 /* hash table for nfs4_file */
711 #define FILE_HASH_BITS 8
712 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
713
714 static unsigned int file_hashval(const struct svc_fh *fh)
715 {
716 struct inode *inode = d_inode(fh->fh_dentry);
717
718 /* XXX: why not (here & in file cache) use inode? */
719 return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
720 }
721
722 static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
256 buckets is the size of the existing file_hashtbl.
> Would PAGE_SIZE/sizeof(void*) make more sense
> (though that is 512).
For rhashtable, you need to account for sizeof(struct bucket_table),
if I'm reading nested_bucket_table_alloc() correctly.
256 is 2048 bytes + sizeof(struct bucket_table). 512 buckets would
push us over a page.
> How much churn is too much? The default is 4 and we grow at >75% and
> shrink at <30%, so at 4 entries the table would resize to 8, and at 2
> entries it would shrink back. That does sound like churn.
> If we choose 8, then at 7 we grow to 16 and at 4 we go back to 8.
> If we choose 16, then at 13 we grow to 32 and at 9 we go back to 16.
>
> If we choose 64, then at 49 we grow to 128 and at 39 we go back to 64.
>
> The margin seems rather narrow. Maybe 30% is too high - 15% might be a
> lot better. But changing that might be difficult.
I could go a little smaller. Basically table resizing is OK when we're
talking about a lot of buckets because that overhead is very likely to
be amortized over many insertions and removals.
> So I don't have a good recommendation, but I don't like magic numbers.
> Maybe PAGE_SIZE/sizeof(void*) ??
The only thing I can think of would be
#define NFS4_FILE_HASH_SIZE (some number or constant calculation)
which to me isn't much better than
.size = 256, /* buckets */
I will ponder some more.
> But either way
> Reviewed-by: NeilBrown <[email protected]>
>
> Thanks,
> NeilBrown
>
>
>> + .automatic_shrinking = true,
>> +};
>> +
>> /*
>> * Check if courtesy clients have conflicting access and resolve it if possible
>> *
>> @@ -8023,10 +8037,16 @@ nfs4_state_start(void)
>> {
>> int ret;
>>
>> - ret = nfsd4_create_callback_queue();
>> + ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
>> if (ret)
>> return ret;
>>
>> + ret = nfsd4_create_callback_queue();
>> + if (ret) {
>> + rhltable_destroy(&nfs4_file_rhltable);
>> + return ret;
>> + }
>> +
>> set_max_delegations();
>> return 0;
>> }
>> @@ -8057,6 +8077,7 @@ nfs4_state_shutdown_net(struct net *net)
>>
>> nfsd4_client_tracking_exit(net);
>> nfs4_state_destroy_net(net);
>> + rhltable_destroy(&nfs4_file_rhltable);
>> #ifdef CONFIG_NFSD_V4_2_INTER_SSC
>> nfsd4_ssc_shutdown_umount(nn);
>> #endif
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index e2daef3cc003..190fc7e418a4 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -546,6 +546,7 @@ struct nfs4_file {
>> bool fi_aliased;
>> spinlock_t fi_lock;
>> struct hlist_node fi_hash; /* hash on fi_fhandle */
>> + struct rhlist_head fi_rlist;
>> struct list_head fi_stateids;
>> union {
>> struct list_head fi_delegations;
>>
>>
>>
--
Chuck Lever
> On Oct 24, 2022, at 7:43 PM, NeilBrown <[email protected]> wrote:
>
> On Tue, 25 Oct 2022, Chuck Lever wrote:
>> fh_match() is costly, especially when filehandles are large (as is
>> the case for NFSv4). It needs to be used sparingly when searching
>> data structures. Unfortunately, with common workloads, I see
>> multiple thousands of objects stored in file_hashtbl[], which always
>> has only 256 buckets, which makes the hash chains quite lengthy.
>>
>> Walking long hash chains with the state_lock held blocks other
>> activity that needs that lock.
>>
>> To help mitigate the cost of searching with fh_match(), replace the
>> nfs4_file hash table with an rhashtable, which can dynamically
>> resize its bucket array to minimize hash chain length. The ideal for
>> this use case is one bucket per inode.
>>
>> The result is an improvement in the latency of NFSv4 operations
>> and the reduction of nfsd CPU utilization due to the cost of
>> fh_match() and the CPU cache misses incurred while walking long
>> hash chains in the nfs4_file hash table.
>>
>> Signed-off-by: Chuck Lever <[email protected]>
>> ---
>> fs/nfsd/nfs4state.c | 64 +++++++++++++++++++++++++--------------------------
>> fs/nfsd/state.h | 4 ---
>> 2 files changed, 31 insertions(+), 37 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index 681cb2daa843..5b90e5a6a04f 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
>> void
>> put_nfs4_file(struct nfs4_file *fi)
>> {
>> - might_lock(&state_lock);
>> -
>> - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
>> + if (refcount_dec_and_test(&fi->fi_ref)) {
>> remove_nfs4_file_locked(fi);
>> - spin_unlock(&state_lock);
>> WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
>> WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
>> call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
>> @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
>> return ret & OWNER_HASH_MASK;
>> }
>>
>> -/* hash table for nfs4_file */
>> -#define FILE_HASH_BITS 8
>> -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
>> -
>> -static unsigned int file_hashval(const struct svc_fh *fh)
>> -{
>> - struct inode *inode = d_inode(fh->fh_dentry);
>> -
>> - /* XXX: why not (here & in file cache) use inode? */
>> - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
>> -}
>> -
>> -static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
>> -
>> static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
>>
>> static const struct rhashtable_params nfs4_file_rhash_params = {
>> @@ -4683,12 +4666,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>> static noinline_for_stack struct nfs4_file *
>> find_nfs4_file(const struct svc_fh *fhp)
>> {
>> - unsigned int hashval = file_hashval(fhp);
>> + struct rhlist_head *tmp, *list;
>> struct nfs4_file *fi = NULL;
>>
>> rcu_read_lock();
>> - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
>> - lockdep_is_held(&state_lock)) {
>> + list = rhltable_lookup(&nfs4_file_rhltable, fhp,
>> + nfs4_file_rhash_params);
>> + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
>> if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>> if (!refcount_inc_not_zero(&fi->fi_ref))
>> fi = NULL;
>> @@ -4708,33 +4692,45 @@ find_nfs4_file(const struct svc_fh *fhp)
>> static noinline_for_stack struct nfs4_file *
>> insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
>> {
>> - unsigned int hashval = file_hashval(fhp);
>> + struct rhlist_head *tmp, *list;
>> struct nfs4_file *ret = NULL;
>> bool alias_found = false;
>> struct nfs4_file *fi;
>> + int err;
>>
>> - spin_lock(&state_lock);
>> - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
>> - lockdep_is_held(&state_lock)) {
>> + rcu_read_lock();
>> +
>
> As mentioned separately, we need some sort of lock around this loop and
> the later insert.
Yep. I just ran out of time today.
>> + list = rhltable_lookup(&nfs4_file_rhltable, fhp,
>> + nfs4_file_rhash_params);
>> + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
>> if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>> if (refcount_inc_not_zero(&fi->fi_ref))
>> ret = fi;
>> } else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
>> fi->fi_aliased = alias_found = true;
>
> This d_inode() == fi->fi_inode test is no longer relevant. Everything in
> the list must have the same inode. Best remove it.
My understanding is that more than one inode can hash into one of
these bucket lists. I don't see how rhltable_insert() can prevent
that from happening, and that will certainly be true if we go back
to using a fixed-size table.
I will try to test this assumption before posting the next revision.
> Thanks,
> NeilBrown
>
>> }
>> - if (likely(ret == NULL)) {
>> - init_nfs4_file(fhp, new);
>> - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]);
>> - new->fi_aliased = alias_found;
>> - ret = new;
>> - }
>> - spin_unlock(&state_lock);
>> + if (ret)
>> + goto out_unlock;
>> +
>> + init_nfs4_file(fhp, new);
>> + err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
>> + &new->fi_rlist,
>> + nfs4_file_rhash_params);
>> + if (err)
>> + goto out_unlock;
>> +
>> + new->fi_aliased = alias_found;
>> + ret = new;
>> +
>> +out_unlock:
>> + rcu_read_unlock();
>> return ret;
>> }
>>
>> static noinline_for_stack void remove_nfs4_file_locked(struct nfs4_file *fi)
>> {
>> - hlist_del_rcu(&fi->fi_hash);
>> + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
>> + nfs4_file_rhash_params);
>> }
>>
>> /*
>> @@ -5624,6 +5620,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>> * If not found, create the nfs4_file struct
>> */
>> fp = insert_nfs4_file(open->op_file, current_fh);
>> + if (unlikely(!fp))
>> + return nfserr_jukebox;
>> if (fp != open->op_file) {
>> status = nfs4_check_deleg(cl, open, &dp);
>> if (status)
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index 190fc7e418a4..eadd7f465bf5 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -536,16 +536,12 @@ struct nfs4_clnt_odstate {
>> * inode can have multiple filehandles associated with it, so there is
>> * (potentially) a many to one relationship between this struct and struct
>> * inode.
>> - *
>> - * These are hashed by filehandle in the file_hashtbl, which is protected by
>> - * the global state_lock spinlock.
>> */
>> struct nfs4_file {
>> refcount_t fi_ref;
>> struct inode * fi_inode;
>> bool fi_aliased;
>> spinlock_t fi_lock;
>> - struct hlist_node fi_hash; /* hash on fi_fhandle */
>> struct rhlist_head fi_rlist;
>> struct list_head fi_stateids;
>> union {
>>
>>
>>
--
Chuck Lever
On Tue, 25 Oct 2022, Chuck Lever III wrote:
>
> > On Oct 24, 2022, at 7:43 PM, NeilBrown <[email protected]> wrote:
> >
> > On Tue, 25 Oct 2022, Chuck Lever wrote:
> >> fh_match() is costly, especially when filehandles are large (as is
> >> the case for NFSv4). It needs to be used sparingly when searching
> >> data structures. Unfortunately, with common workloads, I see
> >> multiple thousands of objects stored in file_hashtbl[], which always
> >> has only 256 buckets, which makes the hash chains quite lengthy.
> >>
> >> Walking long hash chains with the state_lock held blocks other
> >> activity that needs that lock.
> >>
> >> To help mitigate the cost of searching with fh_match(), replace the
> >> nfs4_file hash table with an rhashtable, which can dynamically
> >> resize its bucket array to minimize hash chain length. The ideal for
> >> this use case is one bucket per inode.
> >>
> >> The result is an improvement in the latency of NFSv4 operations
> >> and the reduction of nfsd CPU utilization due to the cost of
> >> fh_match() and the CPU cache misses incurred while walking long
> >> hash chains in the nfs4_file hash table.
> >>
> >> Signed-off-by: Chuck Lever <[email protected]>
> >> ---
> >> fs/nfsd/nfs4state.c | 64 +++++++++++++++++++++++++--------------------------
> >> fs/nfsd/state.h | 4 ---
> >> 2 files changed, 31 insertions(+), 37 deletions(-)
> >>
> >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> >> index 681cb2daa843..5b90e5a6a04f 100644
> >> --- a/fs/nfsd/nfs4state.c
> >> +++ b/fs/nfsd/nfs4state.c
> >> @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
> >> void
> >> put_nfs4_file(struct nfs4_file *fi)
> >> {
> >> - might_lock(&state_lock);
> >> -
> >> - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
> >> + if (refcount_dec_and_test(&fi->fi_ref)) {
> >> remove_nfs4_file_locked(fi);
> >> - spin_unlock(&state_lock);
> >> WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
> >> WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
> >> call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
> >> @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
> >> return ret & OWNER_HASH_MASK;
> >> }
> >>
> >> -/* hash table for nfs4_file */
> >> -#define FILE_HASH_BITS 8
> >> -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
> >> -
> >> -static unsigned int file_hashval(const struct svc_fh *fh)
> >> -{
> >> - struct inode *inode = d_inode(fh->fh_dentry);
> >> -
> >> - /* XXX: why not (here & in file cache) use inode? */
> >> - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
> >> -}
> >> -
> >> -static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
> >> -
> >> static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
> >>
> >> static const struct rhashtable_params nfs4_file_rhash_params = {
> >> @@ -4683,12 +4666,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> >> static noinline_for_stack struct nfs4_file *
> >> find_nfs4_file(const struct svc_fh *fhp)
> >> {
> >> - unsigned int hashval = file_hashval(fhp);
> >> + struct rhlist_head *tmp, *list;
> >> struct nfs4_file *fi = NULL;
> >>
> >> rcu_read_lock();
> >> - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> >> - lockdep_is_held(&state_lock)) {
> >> + list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> >> + nfs4_file_rhash_params);
> >> + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
> >> if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> >> if (!refcount_inc_not_zero(&fi->fi_ref))
> >> fi = NULL;
> >> @@ -4708,33 +4692,45 @@ find_nfs4_file(const struct svc_fh *fhp)
> >> static noinline_for_stack struct nfs4_file *
> >> insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
> >> {
> >> - unsigned int hashval = file_hashval(fhp);
> >> + struct rhlist_head *tmp, *list;
> >> struct nfs4_file *ret = NULL;
> >> bool alias_found = false;
> >> struct nfs4_file *fi;
> >> + int err;
> >>
> >> - spin_lock(&state_lock);
> >> - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash,
> >> - lockdep_is_held(&state_lock)) {
> >> + rcu_read_lock();
> >> +
> >
> > As mentioned separately, we need some sort of lock around this loop and
> > the later insert.
>
> Yep. I just ran out of time today.
>
>
> >> + list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> >> + nfs4_file_rhash_params);
> >> + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
> >> if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> >> if (refcount_inc_not_zero(&fi->fi_ref))
> >> ret = fi;
> >> } else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
> >> fi->fi_aliased = alias_found = true;
> >
> > This d_inode() == fi->fi_inode test is no longer relevant. Everything in
> > the list must have the same inode. Best remove it.
>
> My understanding is that more than one inode can hash into one of
> these bucket lists. I don't see how rhltable_insert() can prevent
> that from happening, and that will certainly be true if we go back
> to using a fixed-size table.
With rhltable each bucket is a list of lists. An rhlist_head contains
two pointers. "rhead.next" points to the next entry in the bucket which
will have a different key (different inode in this case). "next" points
to the next entry with the same key - different filehandle, same inode,
in this case.
The list you get back from rhltable_lookup() isn't the full bucket list.
It is just the list within the bucket of all elements with the same key.
NeilBrown
>
> I will try to test this assumption before posting the next revision.
>
On Tue, 25 Oct 2022, Chuck Lever III wrote:
>
> > On Oct 24, 2022, at 7:37 PM, NeilBrown <[email protected]> wrote:
> >
> > On Tue, 25 Oct 2022, Chuck Lever wrote:
> >> Introduce the infrastructure for managing nfs4_file objects in an
> >> rhashtable. This infrastructure will be used by the next patch.
> >>
> >> Signed-off-by: Chuck Lever <[email protected]>
> >> ---
> >> fs/nfsd/nfs4state.c | 23 ++++++++++++++++++++++-
> >> fs/nfsd/state.h | 1 +
> >> 2 files changed, 23 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> >> index abed795bb4ec..681cb2daa843 100644
> >> --- a/fs/nfsd/nfs4state.c
> >> +++ b/fs/nfsd/nfs4state.c
> >> @@ -44,7 +44,9 @@
> >> #include <linux/jhash.h>
> >> #include <linux/string_helpers.h>
> >> #include <linux/fsnotify.h>
> >> +#include <linux/rhashtable.h>
> >> #include <linux/nfs_ssc.h>
> >> +
> >> #include "xdr4.h"
> >> #include "xdr4cb.h"
> >> #include "vfs.h"
> >> @@ -721,6 +723,18 @@ static unsigned int file_hashval(const struct svc_fh *fh)
> >>
> >> static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
> >>
> >> +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
> >> +
> >> +static const struct rhashtable_params nfs4_file_rhash_params = {
> >> + .key_len = sizeof_field(struct nfs4_file, fi_inode),
> >> + .key_offset = offsetof(struct nfs4_file, fi_inode),
> >> + .head_offset = offsetof(struct nfs4_file, fi_rlist),
> >> +
> >> + /* Reduce resizing churn on light workloads */
> >> + .min_size = 256, /* buckets */
> >
> > Every time I see this line I groan a little bit. Probably I'm groaning
> > at rhashtable - you shouldn't need to have to worry about these sorts of
> > details when using an API... but I agree that avoiding churn is likely
> > a good idea.
> >
> > Where did 256 come from?
>
> Here's the current file_hashtbl definition:
>
> 710 /* hash table for nfs4_file */
> 711 #define FILE_HASH_BITS 8
> 712 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
> 713
> 714 static unsigned int file_hashval(const struct svc_fh *fh)
> 715 {
> 716 struct inode *inode = d_inode(fh->fh_dentry);
> 717
> 718 /* XXX: why not (here & in file cache) use inode? */
> 719 return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
> 720 }
> 721
> 722 static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
>
> 256 buckets is the size of the existing file_hashtbl.
>
>
> > Would PAGE_SIZE/sizeof(void*) make more sense
> > (though that is 512).
>
> For rhashtable, you need to account for sizeof(struct bucket_table),
> if I'm reading nested_bucket_table_alloc() correctly.
>
> 256 is 2048 bytes + sizeof(struct bucket_table). 512 buckets would
> push us over a page.
Ah yes, of course. That does suggest that 256 is a better choice.
>
>
> > How much churn is too much? The default is 4 and we grow at >75% and
> > shrink at <30%, so at 4 entries the table would resize to 8, and at 2
> > entries it would shrink back. That does sound like churn.
> > If we choose 8, then at 7 we grow to 16 and at 4 we go back to 8.
> > If we choose 16, then at 13 we grow to 32 and at 9 we go back to 16.
> >
> > If we choose 64, then at 49 we grow to 128 and at 39 we go back to 64.
> >
> > The margin seems rather narrow. Maybe 30% is too high - 15% might be a
> > lot better. But changing that might be difficult.
>
> I could go a little smaller. Basically table resizing is OK when we're
> talking about a lot of buckets because that overhead is very likely to
> be amortized over many insertions and removals.
>
>
> > So I don't have a good recommendation, but I don't like magic numbers.
> > Maybe PAGE_SIZE/sizeof(void*) ??
>
> The only thing I can think of would be
>
> #define NFS4_FILE_HASH_SIZE (some number or constant calculation)
>
> which to me isn't much better than
>
> .size = 256, /* buckets */
>
> I will ponder some more.
>
Maybe just a comment saying that this number will allow the
struct bucket_table to fit in one 4K page.
Thanks,
NeilBrown
>
> > But either way
> > Reviewed-by: NeilBrown <[email protected]>
> >
> > Thanks,
> > NeilBrown
> >
> >
> >> + .automatic_shrinking = true,
> >> +};
> >> +
> >> /*
> >> * Check if courtesy clients have conflicting access and resolve it if possible
> >> *
> >> @@ -8023,10 +8037,16 @@ nfs4_state_start(void)
> >> {
> >> int ret;
> >>
> >> - ret = nfsd4_create_callback_queue();
> >> + ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
> >> if (ret)
> >> return ret;
> >>
> >> + ret = nfsd4_create_callback_queue();
> >> + if (ret) {
> >> + rhltable_destroy(&nfs4_file_rhltable);
> >> + return ret;
> >> + }
> >> +
> >> set_max_delegations();
> >> return 0;
> >> }
> >> @@ -8057,6 +8077,7 @@ nfs4_state_shutdown_net(struct net *net)
> >>
> >> nfsd4_client_tracking_exit(net);
> >> nfs4_state_destroy_net(net);
> >> + rhltable_destroy(&nfs4_file_rhltable);
> >> #ifdef CONFIG_NFSD_V4_2_INTER_SSC
> >> nfsd4_ssc_shutdown_umount(nn);
> >> #endif
> >> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> >> index e2daef3cc003..190fc7e418a4 100644
> >> --- a/fs/nfsd/state.h
> >> +++ b/fs/nfsd/state.h
> >> @@ -546,6 +546,7 @@ struct nfs4_file {
> >> bool fi_aliased;
> >> spinlock_t fi_lock;
> >> struct hlist_node fi_hash; /* hash on fi_fhandle */
> >> + struct rhlist_head fi_rlist;
> >> struct list_head fi_stateids;
> >> union {
> >> struct list_head fi_delegations;
> >>
> >>
> >>
>
> --
> Chuck Lever
>
>
>
>
> On Oct 24, 2022, at 9:01 PM, NeilBrown <[email protected]> wrote:
>
> On Tue, 25 Oct 2022, Chuck Lever III wrote:
>>
>>> On Oct 24, 2022, at 7:43 PM, NeilBrown <[email protected]> wrote:
>>
>>>> + list = rhltable_lookup(&nfs4_file_rhltable, fhp,
>>>> + nfs4_file_rhash_params);
>>>> + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
>>>> if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>>>> if (refcount_inc_not_zero(&fi->fi_ref))
>>>> ret = fi;
>>>> } else if (d_inode(fhp->fh_dentry) == fi->fi_inode)
>>>> fi->fi_aliased = alias_found = true;
>>>
>>> This d_inode() == fi->fi_inode test is no longer relevant. Everything in
>>> the list must have the same inode. Best remove it.
>>
>> My understanding is that more than one inode can hash into one of
>> these bucket lists. I don't see how rhltable_insert() can prevent
>> that from happening, and that will certainly be true if we go back
>> to using a fixed-size table.
>
> With rhltable each bucket is a list of lists. An rhlist_head contains
> two pointers. "rhead.next" points to the next entry in the bucket which
> will have a different key (different inode in this case). "next" points
> to the next entry with the same key - different filehandle, same inode,
> in this case.
That's the detail I didn't yet have, thanks. I will remove the extra
inode test and give it a try.
> The list you get back from rhltable_lookup() isn't the full bucket list.
> It is just the list within the bucket of all elements with the same key.
--
Chuck Lever