2023-07-09 15:55:10

by Chuck Lever

Subject: [PATCH v1 0/6] Fix some lock contention in the NFS server's DRC

This series optimizes DRC scalability by freeing cache objects only
once the hash bucket lock is no longer held. There are a couple of
related clean-ups to go along with this optimization.
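
For readers new to the approach, below is a minimal sketch of the
deferred-free pattern this series adopts (illustrative names only; the
real helpers are nfsd_prune_bucket_locked() and nfsd_cacherep_dispose()
in patch 3/6). Entries are only unlinked onto a private list while the
bucket lock is held; kfree() runs after the lock has been dropped, so
the free path never extends the lock's hold time.

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct entry {
        struct list_head lru;
        /* ... cached reply data ... */
};

struct bucket {
        spinlock_t lock;
        struct list_head lru_head;
};

/* Placeholder for the real expiry test. */
static bool entry_expired(const struct entry *e)
{
        return true;
}

/* Runs with b->lock held: unlink only, never kfree(). */
static void prune_locked(struct bucket *b, struct list_head *dispose)
{
        struct entry *e, *tmp;

        lockdep_assert_held(&b->lock);
        list_for_each_entry_safe(e, tmp, &b->lru_head, lru)
                if (entry_expired(e))
                        list_move(&e->lru, dispose);
}

/* Runs after b->lock has been dropped: all freeing happens here. */
static unsigned long dispose_entries(struct list_head *dispose)
{
        struct entry *e;
        unsigned long freed = 0;

        while (!list_empty(dispose)) {
                e = list_first_entry(dispose, struct entry, lru);
                list_del(&e->lru);
                kfree(e);
                freed++;
        }
        return freed;
}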

---

Chuck Lever (6):
NFSD: Refactor nfsd_reply_cache_free_locked()
NFSD: Rename nfsd_reply_cache_alloc()
NFSD: Replace nfsd_prune_bucket()
NFSD: Refactor the duplicate reply cache shrinker
NFSD: Remove svc_rqst::rq_cacherep
NFSD: Rename struct svc_cacherep


fs/nfsd/cache.h | 8 +-
fs/nfsd/nfscache.c | 203 ++++++++++++++++++++++++-------------
fs/nfsd/nfssvc.c | 10 +-
fs/nfsd/trace.h | 26 ++++-
include/linux/sunrpc/svc.h | 1 -
5 files changed, 165 insertions(+), 83 deletions(-)

--
Chuck Lever



2023-07-09 15:55:25

by Chuck Lever

Subject: [PATCH v1 3/6] NFSD: Replace nfsd_prune_bucket()

From: Chuck Lever <[email protected]>

Enable nfsd_prune_bucket() to drop the bucket lock while calling
kfree(). Use the same pattern that Jeff recently introduced in the
NFSD filecache.

A few percpu operations are moved outside the lock since they
temporarily disable local IRQs, which is expensive and does not
need to be done while the lock is held.
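
Condensed from the nfsd_cache_lookup() hunk below, the call-site shape
after this patch is roughly:

        unsigned long freed;
        LIST_HEAD(dispose);

        spin_lock(&b->cache_lock);
        /* ... look up or insert the entry ... */
        nfsd_prune_bucket_locked(nn, b, 3, &dispose);   /* unlink only */
        spin_unlock(&b->cache_lock);

        /* kfree() and the percpu/atomic bookkeeping now run unlocked */
        freed = nfsd_cacherep_dispose(&dispose);
        trace_nfsd_drc_gc(nn, freed);
        nfsd_stats_rc_misses_inc();
        atomic_inc(&nn->num_drc_entries);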

Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfscache.c | 78 ++++++++++++++++++++++++++++++++++++++++++----------
fs/nfsd/trace.h | 22 +++++++++++++++
2 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 74fc9d9eeb1e..c8b572d2c72a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -116,6 +116,21 @@ static void nfsd_cacherep_free(struct svc_cacherep *rp)
kmem_cache_free(drc_slab, rp);
}

+static unsigned long
+nfsd_cacherep_dispose(struct list_head *dispose)
+{
+ struct svc_cacherep *rp;
+ unsigned long freed = 0;
+
+ while (!list_empty(dispose)) {
+ rp = list_first_entry(dispose, struct svc_cacherep, c_lru);
+ list_del(&rp->c_lru);
+ nfsd_cacherep_free(rp);
+ freed++;
+ }
+ return freed;
+}
+
static void
nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
struct svc_cacherep *rp)
@@ -259,6 +274,41 @@ nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn)
return &nn->drc_hashtbl[hash];
}

+/*
+ * Remove and return no more than @max expired entries in bucket @b.
+ * If @max is zero, do not limit the number of removed entries.
+ */
+static void
+nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
+ unsigned int max, struct list_head *dispose)
+{
+ unsigned long expiry = jiffies - RC_EXPIRE;
+ struct svc_cacherep *rp, *tmp;
+ unsigned int freed = 0;
+
+ lockdep_assert_held(&b->cache_lock);
+
+ /* The bucket LRU is ordered oldest-first. */
+ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
+ /*
+ * Don't free entries attached to calls that are still
+ * in-progress, but do keep scanning the list.
+ */
+ if (rp->c_state == RC_INPROG)
+ continue;
+
+ if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries &&
+ time_before(expiry, rp->c_timestamp))
+ break;
+
+ nfsd_cacherep_unlink_locked(nn, b, rp);
+ list_add(&rp->c_lru, dispose);
+
+ if (max && ++freed > max)
+ break;
+ }
+}
+
static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
unsigned int max)
{
@@ -282,11 +332,6 @@ static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
return freed;
}

-static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
-{
- return prune_bucket(b, nn, 3);
-}
-
/*
* Walk the LRU list and prune off entries that are older than RC_EXPIRE.
* Also prune the oldest ones when the total exceeds the max number of entries.
@@ -442,6 +487,8 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
__wsum csum;
struct nfsd_drc_bucket *b;
int type = rqstp->rq_cachetype;
+ unsigned long freed;
+ LIST_HEAD(dispose);
int rtn = RC_DOIT;

rqstp->rq_cacherep = NULL;
@@ -466,20 +513,18 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
found = nfsd_cache_insert(b, rp, nn);
if (found != rp)
goto found_entry;
-
- nfsd_stats_rc_misses_inc();
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
+ nfsd_prune_bucket_locked(nn, b, 3, &dispose);
+ spin_unlock(&b->cache_lock);

+ freed = nfsd_cacherep_dispose(&dispose);
+ trace_nfsd_drc_gc(nn, freed);
+
+ nfsd_stats_rc_misses_inc();
atomic_inc(&nn->num_drc_entries);
nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
-
- nfsd_prune_bucket(b, nn);
-
-out_unlock:
- spin_unlock(&b->cache_lock);
-out:
- return rtn;
+ goto out;

found_entry:
/* We found a matching entry which is either in progress or done. */
@@ -517,7 +562,10 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)

out_trace:
trace_nfsd_drc_found(nn, rqstp, rtn);
- goto out_unlock;
+out_unlock:
+ spin_unlock(&b->cache_lock);
+out:
+ return rtn;
}

/**
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 2af74983f146..c06c505d04fb 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1261,6 +1261,28 @@ TRACE_EVENT(nfsd_drc_mismatch,
__entry->ingress)
);

+TRACE_EVENT_CONDITION(nfsd_drc_gc,
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ unsigned long freed
+ ),
+ TP_ARGS(nn, freed),
+ TP_CONDITION(freed > 0),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(unsigned long, freed)
+ __field(int, total)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->freed = freed;
+ __entry->total = atomic_read(&nn->num_drc_entries);
+ ),
+ TP_printk("boot_time=%16llx total=%d freed=%lu",
+ __entry->boot_time, __entry->total, __entry->freed
+ )
+);
+
TRACE_EVENT(nfsd_cb_args,
TP_PROTO(
const struct nfs4_client *clp,



2023-07-09 15:55:27

by Chuck Lever

Subject: [PATCH v1 5/6] NFSD: Remove svc_rqst::rq_cacherep

From: Chuck Lever <[email protected]>

Over time I'd like to see NFS-specific fields moved out of struct
svc_rqst, which is an RPC layer object. These fields are layering
violations.

Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/cache.h | 6 ++++--
fs/nfsd/nfscache.c | 11 ++++++-----
fs/nfsd/nfssvc.c | 10 ++++++----
include/linux/sunrpc/svc.h | 1 -
4 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 4c9b87850ab1..27610b071880 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,8 +84,10 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn);
void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
-int nfsd_cache_lookup(struct svc_rqst *);
-void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
+int nfsd_cache_lookup(struct svc_rqst *rqstp,
+ struct svc_cacherep **cacherep);
+void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp,
+ int cachetype, __be32 *statp);
int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);

#endif /* NFSCACHE_H */
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index c08078ac9284..9bdcd73206c9 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -464,6 +464,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
/**
* nfsd_cache_lookup - Find an entry in the duplicate reply cache
* @rqstp: Incoming Call to find
+ * @cacherep: OUT: DRC entry for this request
*
* Try to find an entry matching the current call in the cache. When none
* is found, we try to grab the oldest expired entry off the LRU list. If
@@ -476,7 +477,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
* %RC_REPLY: Reply from cache
* %RC_DROPIT: Do not process the request further
*/
-int nfsd_cache_lookup(struct svc_rqst *rqstp)
+int nfsd_cache_lookup(struct svc_rqst *rqstp, struct svc_cacherep **cacherep)
{
struct nfsd_net *nn;
struct svc_cacherep *rp, *found;
@@ -487,7 +488,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
LIST_HEAD(dispose);
int rtn = RC_DOIT;

- rqstp->rq_cacherep = NULL;
if (type == RC_NOCACHE) {
nfsd_stats_rc_nocache_inc();
goto out;
@@ -509,7 +509,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
found = nfsd_cache_insert(b, rp, nn);
if (found != rp)
goto found_entry;
- rqstp->rq_cacherep = rp;
+ *cacherep = rp;
rp->c_state = RC_INPROG;
nfsd_prune_bucket_locked(nn, b, 3, &dispose);
spin_unlock(&b->cache_lock);
@@ -567,6 +567,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
/**
* nfsd_cache_update - Update an entry in the duplicate reply cache.
* @rqstp: svc_rqst with a finished Reply
+ * @rp: IN: DRC entry for this request
* @cachetype: which cache to update
* @statp: pointer to Reply's NFS status code, or NULL
*
@@ -584,10 +585,10 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
* nfsd failed to encode a reply that otherwise would have been cached.
* In this case, nfsd_cache_update is called with statp == NULL.
*/
-void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp,
+ int cachetype, __be32 *statp)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
struct nfsd_drc_bucket *b;
int len;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d42b2a40c93c..64ac70990019 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -1045,6 +1045,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
{
const struct svc_procedure *proc = rqstp->rq_procinfo;
__be32 *statp = rqstp->rq_accept_statp;
+ struct svc_cacherep *rp;

/*
* Give the xdr decoder a chance to change this if it wants
@@ -1055,7 +1056,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;

- switch (nfsd_cache_lookup(rqstp)) {
+ rp = NULL;
+ switch (nfsd_cache_lookup(rqstp, &rp)) {
case RC_DOIT:
break;
case RC_REPLY:
@@ -1071,7 +1073,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;

- nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
+ nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
out_cached_reply:
return 1;

@@ -1081,13 +1083,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
return 1;

out_update_drop:
- nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+ nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL);
out_dropit:
return 0;

out_encode_err:
trace_nfsd_cant_encode_err(rqstp);
- nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+ nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL);
*statp = rpc_system_err;
return 1;
}
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 6669f3eb9ed4..604ca45af429 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -268,7 +268,6 @@ struct svc_rqst {
/* Catering to nfsd */
struct auth_domain * rq_client; /* RPC peer info */
struct auth_domain * rq_gssclient; /* "gss/"-style peer info */
- struct svc_cacherep * rq_cacherep; /* cache info */
struct task_struct *rq_task; /* service thread */
struct net *rq_bc_net; /* pointer to backchannel's
* net namespace



2023-07-09 15:55:52

by Chuck Lever

Subject: [PATCH v1 2/6] NFSD: Rename nfsd_reply_cache_alloc()

From: Chuck Lever <[email protected]>

For readability, rename to match the other helpers.

Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfscache.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 601298b7f75f..74fc9d9eeb1e 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -85,8 +85,8 @@ nfsd_hashsize(unsigned int limit)
}

static struct svc_cacherep *
-nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
- struct nfsd_net *nn)
+nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum,
+ struct nfsd_net *nn)
{
struct svc_cacherep *rp;

@@ -457,7 +457,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
* preallocate an entry.
*/
nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
+ rp = nfsd_cacherep_alloc(rqstp, csum, nn);
if (!rp)
goto out;




2023-07-09 15:55:52

by Chuck Lever

Subject: [PATCH v1 6/6] NFSD: Rename struct svc_cacherep

From: Chuck Lever <[email protected]>

The svc_ prefix is identified with the SunRPC layer. Although the
duplicate reply cache caches RPC replies, it is only for the NFS
protocol. Rename the struct to better reflect its purpose.

Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/cache.h | 6 +++---
fs/nfsd/nfscache.c | 44 ++++++++++++++++++++++----------------------
fs/nfsd/nfssvc.c | 2 +-
fs/nfsd/trace.h | 4 ++--
4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 27610b071880..929248c6ca84 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -19,7 +19,7 @@
* typical sockaddr_storage. This is for space reasons, since sockaddr_storage
* is much larger than a sockaddr_in6.
*/
-struct svc_cacherep {
+struct nfsd_cacherep {
struct {
/* Keep often-read xid, csum in the same cache line: */
__be32 k_xid;
@@ -85,8 +85,8 @@ void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
int nfsd_cache_lookup(struct svc_rqst *rqstp,
- struct svc_cacherep **cacherep);
-void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp,
+ struct nfsd_cacherep **cacherep);
+void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
int cachetype, __be32 *statp);
int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 9bdcd73206c9..6eb3d7bdfaf3 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -84,11 +84,11 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}

-static struct svc_cacherep *
+static struct nfsd_cacherep *
nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum,
struct nfsd_net *nn)
{
- struct svc_cacherep *rp;
+ struct nfsd_cacherep *rp;

rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
if (rp) {
@@ -110,7 +110,7 @@ nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum,
return rp;
}

-static void nfsd_cacherep_free(struct svc_cacherep *rp)
+static void nfsd_cacherep_free(struct nfsd_cacherep *rp)
{
kfree(rp->c_replvec.iov_base);
kmem_cache_free(drc_slab, rp);
@@ -119,11 +119,11 @@ static void nfsd_cacherep_free(struct svc_cacherep *rp)
static unsigned long
nfsd_cacherep_dispose(struct list_head *dispose)
{
- struct svc_cacherep *rp;
+ struct nfsd_cacherep *rp;
unsigned long freed = 0;

while (!list_empty(dispose)) {
- rp = list_first_entry(dispose, struct svc_cacherep, c_lru);
+ rp = list_first_entry(dispose, struct nfsd_cacherep, c_lru);
list_del(&rp->c_lru);
nfsd_cacherep_free(rp);
freed++;
@@ -133,7 +133,7 @@ nfsd_cacherep_dispose(struct list_head *dispose)

static void
nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
- struct svc_cacherep *rp)
+ struct nfsd_cacherep *rp)
{
if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base)
nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len);
@@ -146,7 +146,7 @@ nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
}

static void
-nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
+nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp,
struct nfsd_net *nn)
{
nfsd_cacherep_unlink_locked(nn, b, rp);
@@ -154,7 +154,7 @@ nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
}

static void
-nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp,
struct nfsd_net *nn)
{
spin_lock(&b->cache_lock);
@@ -166,7 +166,7 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
int nfsd_drc_slab_create(void)
{
drc_slab = kmem_cache_create("nfsd_drc",
- sizeof(struct svc_cacherep), 0, 0, NULL);
+ sizeof(struct nfsd_cacherep), 0, 0, NULL);
return drc_slab ? 0: -ENOMEM;
}

@@ -235,7 +235,7 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)

void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
{
- struct svc_cacherep *rp;
+ struct nfsd_cacherep *rp;
unsigned int i;

unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
@@ -243,7 +243,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
for (i = 0; i < nn->drc_hashsize; i++) {
struct list_head *head = &nn->drc_hashtbl[i].lru_head;
while (!list_empty(head)) {
- rp = list_first_entry(head, struct svc_cacherep, c_lru);
+ rp = list_first_entry(head, struct nfsd_cacherep, c_lru);
nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i],
rp, nn);
}
@@ -260,7 +260,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
* not already scheduled.
*/
static void
-lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
+lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp)
{
rp->c_timestamp = jiffies;
list_move_tail(&rp->c_lru, &b->lru_head);
@@ -283,7 +283,7 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
unsigned int max, struct list_head *dispose)
{
unsigned long expiry = jiffies - RC_EXPIRE;
- struct svc_cacherep *rp, *tmp;
+ struct nfsd_cacherep *rp, *tmp;
unsigned int freed = 0;

lockdep_assert_held(&b->cache_lock);
@@ -401,8 +401,8 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
}

static int
-nfsd_cache_key_cmp(const struct svc_cacherep *key,
- const struct svc_cacherep *rp, struct nfsd_net *nn)
+nfsd_cache_key_cmp(const struct nfsd_cacherep *key,
+ const struct nfsd_cacherep *rp, struct nfsd_net *nn)
{
if (key->c_key.k_xid == rp->c_key.k_xid &&
key->c_key.k_csum != rp->c_key.k_csum) {
@@ -418,11 +418,11 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key,
* Must be called with cache_lock held. Returns the found entry or
* inserts an empty key on failure.
*/
-static struct svc_cacherep *
-nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
+static struct nfsd_cacherep *
+nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key,
struct nfsd_net *nn)
{
- struct svc_cacherep *rp, *ret = key;
+ struct nfsd_cacherep *rp, *ret = key;
struct rb_node **p = &b->rb_head.rb_node,
*parent = NULL;
unsigned int entries = 0;
@@ -431,7 +431,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
while (*p != NULL) {
++entries;
parent = *p;
- rp = rb_entry(parent, struct svc_cacherep, c_node);
+ rp = rb_entry(parent, struct nfsd_cacherep, c_node);

cmp = nfsd_cache_key_cmp(key, rp, nn);
if (cmp < 0)
@@ -477,10 +477,10 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
* %RC_REPLY: Reply from cache
* %RC_DROPIT: Do not process the request further
*/
-int nfsd_cache_lookup(struct svc_rqst *rqstp, struct svc_cacherep **cacherep)
+int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
{
struct nfsd_net *nn;
- struct svc_cacherep *rp, *found;
+ struct nfsd_cacherep *rp, *found;
__wsum csum;
struct nfsd_drc_bucket *b;
int type = rqstp->rq_cachetype;
@@ -585,7 +585,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct svc_cacherep **cacherep)
* nfsd failed to encode a reply that otherwise would have been cached.
* In this case, nfsd_cache_update is called with statp == NULL.
*/
-void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp,
+void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
int cachetype, __be32 *statp)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 64ac70990019..5bdbac1f4d0f 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -1045,7 +1045,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
{
const struct svc_procedure *proc = rqstp->rq_procinfo;
__be32 *statp = rqstp->rq_accept_statp;
- struct svc_cacherep *rp;
+ struct nfsd_cacherep *rp;

/*
* Give the xdr decoder a chance to change this if it wants
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c06c505d04fb..2388053eb862 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1240,8 +1240,8 @@ TRACE_EVENT(nfsd_drc_found,
TRACE_EVENT(nfsd_drc_mismatch,
TP_PROTO(
const struct nfsd_net *nn,
- const struct svc_cacherep *key,
- const struct svc_cacherep *rp
+ const struct nfsd_cacherep *key,
+ const struct nfsd_cacherep *rp
),
TP_ARGS(nn, key, rp),
TP_STRUCT__entry(



2023-07-09 15:56:02

by Chuck Lever

Subject: [PATCH v1 4/6] NFSD: Refactor the duplicate reply cache shrinker

From: Chuck Lever <[email protected]>

Avoid holding the bucket lock while freeing cache entries. This
change also caps the number of entries that are freed per
shrinker call, to reduce the shrinker's impact on the cache's
effectiveness.
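
For context, the two shrinker callbacks have distinct jobs:
->count_objects() returns a cheap estimate of how many objects could be
reclaimed, while ->scan_objects() is asked to free up to sc->nr_to_scan
of them and returns how many it actually freed; the new cap bounds that
number. A minimal sketch of the contract (hypothetical names;
registration shown per the v6.4-era shrinker API):

#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/types.h>

static atomic_t example_nr_entries;     /* stands in for nn->num_drc_entries */

/* Placeholder: unlink and kfree() one expired entry, if any. */
static bool example_release_one(void)
{
        return false;
}

static unsigned long example_count(struct shrinker *shrink,
                                   struct shrink_control *sc)
{
        /* Cheap estimate only; no locks taken, nothing freed. */
        return atomic_read(&example_nr_entries);
}

static unsigned long example_scan(struct shrinker *shrink,
                                  struct shrink_control *sc)
{
        unsigned long freed = 0;

        /*
         * Release expired entries, but stop once sc->nr_to_scan have
         * been freed so a single shrinker call cannot empty the cache.
         */
        while (freed < sc->nr_to_scan && example_release_one())
                freed++;
        return freed;
}

static struct shrinker example_shrinker = {
        .count_objects  = example_count,
        .scan_objects   = example_scan,
        .seeks          = DEFAULT_SEEKS,
};

/* Registered once per net namespace, for example: */
/*      register_shrinker(&example_shrinker, "example-drc");   */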

Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfsd/nfscache.c | 82 +++++++++++++++++++++++++---------------------------
1 file changed, 39 insertions(+), 43 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index c8b572d2c72a..c08078ac9284 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -309,68 +309,64 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
}
}

-static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
- unsigned int max)
+/**
+ * nfsd_reply_cache_count - count_objects method for the DRC shrinker
+ * @shrink: our registered shrinker context
+ * @sc: garbage collection parameters
+ *
+ * Returns the total number of entries in the duplicate reply cache. To
+ * keep things simple and quick, this is not the number of expired entries
+ * in the cache (ie, the number that would be removed by a call to
+ * nfsd_reply_cache_scan).
+ */
+static unsigned long
+nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
- struct svc_cacherep *rp, *tmp;
- long freed = 0;
+ struct nfsd_net *nn = container_of(shrink,
+ struct nfsd_net, nfsd_reply_cache_shrinker);

- list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
- /*
- * Don't free entries attached to calls that are still
- * in-progress, but do keep scanning the list.
- */
- if (rp->c_state == RC_INPROG)
- continue;
- if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries &&
- time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
- break;
- nfsd_reply_cache_free_locked(b, rp, nn);
- if (max && freed++ > max)
- break;
- }
- return freed;
+ return atomic_read(&nn->num_drc_entries);
}

-/*
- * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
- * Also prune the oldest ones when the total exceeds the max number of entries.
+/**
+ * nfsd_reply_cache_scan - scan_objects method for the DRC shrinker
+ * @shrink: our registered shrinker context
+ * @sc: garbage collection parameters
+ *
+ * Free expired entries on each bucket's LRU list until we've released
+ * nr_to_scan of them. Nothing will be released if the cache
+ * has not exceeded its max_drc_entries limit.
+ *
+ * Returns the number of entries released by this call.
*/
-static long
-prune_cache_entries(struct nfsd_net *nn)
+static unsigned long
+nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
+ struct nfsd_net *nn = container_of(shrink,
+ struct nfsd_net, nfsd_reply_cache_shrinker);
+ unsigned long freed = 0;
+ LIST_HEAD(dispose);
unsigned int i;
- long freed = 0;

for (i = 0; i < nn->drc_hashsize; i++) {
struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i];

if (list_empty(&b->lru_head))
continue;
+
spin_lock(&b->cache_lock);
- freed += prune_bucket(b, nn, 0);
+ nfsd_prune_bucket_locked(nn, b, 0, &dispose);
spin_unlock(&b->cache_lock);
- }
- return freed;
-}

-static unsigned long
-nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
-{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
+ freed += nfsd_cacherep_dispose(&dispose);
+ if (freed > sc->nr_to_scan)
+ break;
+ }

- return atomic_read(&nn->num_drc_entries);
+ trace_nfsd_drc_gc(nn, freed);
+ return freed;
}

-static unsigned long
-nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
-{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
-
- return prune_cache_entries(nn);
-}
/*
* Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
*/



2023-07-10 13:05:39

by Jeff Layton

Subject: Re: [PATCH v1 0/6] Fix some lock contention in the NFS server's DRC

On Sun, 2023-07-09 at 11:45 -0400, Chuck Lever wrote:
> This series optimizes DRC scalability by freeing cache objects only
> once the hash bucket lock is no longer held. There are a couple of
> related clean-ups to go along with this optimization.
>


The conventional wisdom that I've always heard is that a kfree under
spinlock is generally no big deal. It can't block and is usually quite
fast. Are you able to measure any performance delta from this set?



> ---
>
> Chuck Lever (6):
> NFSD: Refactor nfsd_reply_cache_free_locked()
> NFSD: Rename nfsd_reply_cache_alloc()
> NFSD: Replace nfsd_prune_bucket()
> NFSD: Refactor the duplicate reply cache shrinker
> NFSD: Remove svc_rqst::rq_cacherep
> NFSD: Rename struct svc_cacherep
>
>
> fs/nfsd/cache.h | 8 +-
> fs/nfsd/nfscache.c | 203 ++++++++++++++++++++++++-------------
> fs/nfsd/nfssvc.c | 10 +-
> fs/nfsd/trace.h | 26 ++++-
> include/linux/sunrpc/svc.h | 1 -
> 5 files changed, 165 insertions(+), 83 deletions(-)
>
> --
> Chuck Lever
>

--
Jeff Layton <[email protected]>

2023-07-10 13:18:43

by Jeff Layton

Subject: Re: [PATCH v1 5/6] NFSD: Remove svc_rqst::rq_cacherep

On Sun, 2023-07-09 at 11:45 -0400, Chuck Lever wrote:
> From: Chuck Lever <[email protected]>
>
> Over time I'd like to see NFS-specific fields moved out of struct
> svc_rqst, which is an RPC layer object. These fields are layering
> violations.
>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
> fs/nfsd/cache.h | 6 ++++--
> fs/nfsd/nfscache.c | 11 ++++++-----
> fs/nfsd/nfssvc.c | 10 ++++++----
> include/linux/sunrpc/svc.h | 1 -
> 4 files changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
> index 4c9b87850ab1..27610b071880 100644
> --- a/fs/nfsd/cache.h
> +++ b/fs/nfsd/cache.h
> @@ -84,8 +84,10 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn);
> void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
> int nfsd_reply_cache_init(struct nfsd_net *);
> void nfsd_reply_cache_shutdown(struct nfsd_net *);
> -int nfsd_cache_lookup(struct svc_rqst *);
> -void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
> +int nfsd_cache_lookup(struct svc_rqst *rqstp,
> + struct svc_cacherep **cacherep);
> +void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp,
> + int cachetype, __be32 *statp);
> int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
>
> #endif /* NFSCACHE_H */
> diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
> index c08078ac9284..9bdcd73206c9 100644
> --- a/fs/nfsd/nfscache.c
> +++ b/fs/nfsd/nfscache.c
> @@ -464,6 +464,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
> /**
> * nfsd_cache_lookup - Find an entry in the duplicate reply cache
> * @rqstp: Incoming Call to find
> + * @cacherep: OUT: DRC entry for this request
> *
> * Try to find an entry matching the current call in the cache. When none
> * is found, we try to grab the oldest expired entry off the LRU list. If
> @@ -476,7 +477,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
> * %RC_REPLY: Reply from cache
> * %RC_DROPIT: Do not process the request further
> */
> -int nfsd_cache_lookup(struct svc_rqst *rqstp)
> +int nfsd_cache_lookup(struct svc_rqst *rqstp, struct svc_cacherep **cacherep)
> {
> struct nfsd_net *nn;
> struct svc_cacherep *rp, *found;
> @@ -487,7 +488,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
> LIST_HEAD(dispose);
> int rtn = RC_DOIT;
>
> - rqstp->rq_cacherep = NULL;
> if (type == RC_NOCACHE) {
> nfsd_stats_rc_nocache_inc();
> goto out;
> @@ -509,7 +509,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
> found = nfsd_cache_insert(b, rp, nn);
> if (found != rp)
> goto found_entry;
> - rqstp->rq_cacherep = rp;
> + *cacherep = rp;
> rp->c_state = RC_INPROG;
> nfsd_prune_bucket_locked(nn, b, 3, &dispose);
> spin_unlock(&b->cache_lock);
> @@ -567,6 +567,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
> /**
> * nfsd_cache_update - Update an entry in the duplicate reply cache.
> * @rqstp: svc_rqst with a finished Reply
> + * @rp: IN: DRC entry for this request
> * @cachetype: which cache to update
> * @statp: pointer to Reply's NFS status code, or NULL
> *
> @@ -584,10 +585,10 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
> * nfsd failed to encode a reply that otherwise would have been cached.
> * In this case, nfsd_cache_update is called with statp == NULL.
> */
> -void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
> +void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp,
> + int cachetype, __be32 *statp)
> {
> struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
> - struct svc_cacherep *rp = rqstp->rq_cacherep;
> struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
> struct nfsd_drc_bucket *b;
> int len;
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index d42b2a40c93c..64ac70990019 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -1045,6 +1045,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
> {
> const struct svc_procedure *proc = rqstp->rq_procinfo;
> __be32 *statp = rqstp->rq_accept_statp;
> + struct svc_cacherep *rp;
>
> /*
> * Give the xdr decoder a chance to change this if it wants
> @@ -1055,7 +1056,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
> if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
> goto out_decode_err;
>
> - switch (nfsd_cache_lookup(rqstp)) {
> + rp = NULL;
> + switch (nfsd_cache_lookup(rqstp, &rp)) {
> case RC_DOIT:
> break;
> case RC_REPLY:
> @@ -1071,7 +1073,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
> if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
> goto out_encode_err;
>
> - nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
> + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
> out_cached_reply:
> return 1;
>
> @@ -1081,13 +1083,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
> return 1;
>
> out_update_drop:
> - nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
> + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL);
> out_dropit:
> return 0;
>
> out_encode_err:
> trace_nfsd_cant_encode_err(rqstp);
> - nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
> + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL);
> *statp = rpc_system_err;
> return 1;
> }
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index 6669f3eb9ed4..604ca45af429 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -268,7 +268,6 @@ struct svc_rqst {
> /* Catering to nfsd */
> struct auth_domain * rq_client; /* RPC peer info */
> struct auth_domain * rq_gssclient; /* "gss/"-style peer info */
> - struct svc_cacherep * rq_cacherep; /* cache info */
> struct task_struct *rq_task; /* service thread */
> struct net *rq_bc_net; /* pointer to backchannel's
> * net namespace
>
>

Nice cleanup! One less layering violation.

Reviewed-by: Jeff Layton <[email protected]>

2023-07-10 13:19:37

by Jeff Layton

Subject: Re: [PATCH v1 0/6] Fix some lock contention in the NFS server's DRC

On Sun, 2023-07-09 at 11:45 -0400, Chuck Lever wrote:
> This series optimizes DRC scalability by freeing cache objects only
> once the hash bucket lock is no longer held. There are a couple of
> related clean-ups to go along with this optimization.
>
> ---
>
> Chuck Lever (6):
> NFSD: Refactor nfsd_reply_cache_free_locked()
> NFSD: Rename nfsd_reply_cache_alloc()
> NFSD: Replace nfsd_prune_bucket()
> NFSD: Refactor the duplicate reply cache shrinker
> NFSD: Remove svc_rqst::rq_cacherep
> NFSD: Rename struct svc_cacherep
>
>
> fs/nfsd/cache.h | 8 +-
> fs/nfsd/nfscache.c | 203 ++++++++++++++++++++++++-------------
> fs/nfsd/nfssvc.c | 10 +-
> fs/nfsd/trace.h | 26 ++++-
> include/linux/sunrpc/svc.h | 1 -
> 5 files changed, 165 insertions(+), 83 deletions(-)
>
> --
> Chuck Lever
>

This all looks like reasonable cleanup to me, regardless of whether it
produces a measurable optimization.

Reviewed-by: Jeff Layton <[email protected]>

2023-07-10 13:58:50

by Chuck Lever III

Subject: Re: [PATCH v1 0/6] Fix some lock contention in the NFS server's DRC



> On Jul 10, 2023, at 9:03 AM, Jeff Layton <[email protected]> wrote:
>
> On Sun, 2023-07-09 at 11:45 -0400, Chuck Lever wrote:
>> This series optimizes DRC scalability by freeing cache objects only
>> once the hash bucket lock is no longer held. There are a couple of
>> related clean-ups to go along with this optimization.
>>
>
>
> The conventional wisdom that I've always heard is that a kfree under
> spinlock is generally no big deal. It can't block and is usually quite
> fast. Are you able to measure any performance delta from this set?

Yes, a couple of percent better throughput with a tmpfs export and
a fast transport, which is not a common use case, granted.

I think the difference is that, after this change, we're holding
each bucket lock for a shorter period of time. That's enough to
reduce some lock contention.

But I also like the shrinker improvements.


>> ---
>>
>> Chuck Lever (6):
>> NFSD: Refactor nfsd_reply_cache_free_locked()
>> NFSD: Rename nfsd_reply_cache_alloc()
>> NFSD: Replace nfsd_prune_bucket()
>> NFSD: Refactor the duplicate reply cache shrinker
>> NFSD: Remove svc_rqst::rq_cacherep
>> NFSD: Rename struct svc_cacherep
>>
>>
>> fs/nfsd/cache.h | 8 +-
>> fs/nfsd/nfscache.c | 203 ++++++++++++++++++++++++-------------
>> fs/nfsd/nfssvc.c | 10 +-
>> fs/nfsd/trace.h | 26 ++++-
>> include/linux/sunrpc/svc.h | 1 -
>> 5 files changed, 165 insertions(+), 83 deletions(-)
>>
>> --
>> Chuck Lever
>>
>
> --
> Jeff Layton <[email protected]>


--
Chuck Lever