2018-10-01 21:20:29

by Trond Myklebust

Subject: [PATCH 00/15] Performance improvements for knfsd

The following set of patches aims to remove some of the global spinlocks
that are currently being taken during the processing of every incoming
RPC call. Most of these spinlocks are protecting read-mostly structures,
and so can be replaced with taking an RCU read lock.
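
As a quick illustration, this is the pattern the series uses on the read
side (condensed from patch 01 below; a sketch, not the complete change):
readers take only the RCU read lock and pin the owning module, so no
global spinlock is needed for read-mostly data.

static struct auth_ops *
svc_get_auth_ops(rpc_authflavor_t flavor)
{
	struct auth_ops *aops;

	if (flavor >= RPC_AUTH_MAXFLAVOR)
		return NULL;
	rcu_read_lock();
	aops = rcu_dereference(authtab[flavor]);
	/* The owner module may be unloading; use it only if we can pin it. */
	if (aops != NULL && !try_module_get(aops->owner))
		aops = NULL;
	rcu_read_unlock();
	return aops;
}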

The patchset also replaces the current reader/writer spinlock in the
server cache implementation with an RCU read lock + a regular spinlock.
This gives a slight scalability improvement by allowing lookups to be
concurrent with updates rather than excluding them.
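
The resulting fast path looks roughly like this (condensed from patch 03
below; the slow path that takes the spinlock and inserts a new entry is
omitted):

	rcu_read_lock();
	hlist_for_each_entry_rcu(tmp, head, cache_list) {
		if (detail->match(tmp, key) &&
		    !cache_is_expired(detail, tmp) &&
		    kref_get_unless_zero(&tmp->ref)) {
			/* Live entry found without touching hash_lock. */
			rcu_read_unlock();
			return tmp;
		}
	}
	rcu_read_unlock();
	/* Cache miss: take detail->hash_lock and insert a new entry. */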

Finally, there is a set of changes to the NFSv2/v3/v4.0 duplicate reply
cache to further optimise it for the common case where we're only
inserting new entries. By using a red-black tree rather than a simple
linked list, we reduce the typical number of entries we need to check
(a.k.a. "chain length" in /proc/fs/nfsd/reply_cache_stats) from
roughly 80 to 9 per incoming RPC request. This significantly reduces the
total amount of time spent in nfsd_cache_lookup() according to 'perf'.
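
That reduction is about what one would expect: a linear scan of a bucket
holding N entries checks N/2 of them on average, while descending a
balanced tree checks only about log2(N). The lookup becomes roughly the
following (condensed from patches 14 and 15 below; the helper name is
made up for this example, and the payload-miss accounting is omitted):

static struct svc_cacherep *
drc_rb_find(struct nfsd_drc_bucket *b, const struct svc_cacherep *key)
{
	struct rb_node *n = b->rb_head.rb_node;

	while (n != NULL) {
		struct svc_cacherep *rp =
			rb_entry(n, struct svc_cacherep, c_node);
		/* c_key is laid out so one memcmp() totally orders entries */
		int cmp = memcmp(&key->c_key, &rp->c_key,
				 sizeof(key->c_key));

		if (cmp < 0)
			n = n->rb_left;
		else if (cmp > 0)
			n = n->rb_right;
		else
			return rp;	/* hit */
	}
	return NULL;	/* miss: the caller inserts the preallocated entry */
}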

Trond Myklebust (15):
SUNRPC: Remove the server 'authtab_lock' and just use RCU
SUNRPC: Add lockless lookup of the server's auth domain
SUNRPC: Allow cache lookups to use RCU protection rather than the r/w
spinlock
SUNRPC: Make server side AUTH_UNIX use lockless lookups
knfsd: Allow lockless lookups of the exports
SUNRPC: Lockless server RPCSEC_GSS context lookup
knfsd: Lockless lookup of NFSv4 identities.
NFS: Lockless DNS lookups
SUNRPC: Remove non-RCU protected lookup
SUNRPC: Replace the cache_detail->hash_lock with a regular spinlock
SUNRPC: Simplify TCP receive code
knfsd: Remove dead code from nfsd_cache_lookup
knfsd: Simplify NFS duplicate replay cache
knfsd: Further simplify the cache lookup
knfsd: Improve lookup performance in the duplicate reply cache using
an rbtree

 Documentation/filesystems/nfs/rpc-cache.txt |   6 +-
 fs/nfs/dns_resolve.c                        |  15 +-
 fs/nfsd/cache.h                             |  19 ++-
 fs/nfsd/export.c                            |  14 +-
 fs/nfsd/export.h                            |   2 +
 fs/nfsd/nfs4idmap.c                         |  11 +-
 fs/nfsd/nfscache.c                          | 142 +++++++++---------
 include/linux/sunrpc/cache.h                |  18 ++-
 include/linux/sunrpc/svcauth.h              |   1 +
 net/sunrpc/auth_gss/svcauth_gss.c           |  41 +++++-
 net/sunrpc/cache.c                          | 153 ++++++++++++--------
 net/sunrpc/svcauth.c                        |  74 +++++++---
 net/sunrpc/svcauth_unix.c                   |  24 ++-
 net/sunrpc/svcsock.c                        |  53 ++-----
 14 files changed, 327 insertions(+), 246 deletions(-)

--
2.17.1


2018-10-01 21:20:34

by Trond Myklebust

Subject: [PATCH 06/15] SUNRPC: Lockless server RPCSEC_GSS context lookup

Use RCU protection for looking up the RPCSEC_GSS context.

Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/auth_gss/svcauth_gss.c | 32 +++++++++++++++++++++++++------
1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 87c71fb0f0ea..1ece4bc3eb8d 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -76,6 +76,7 @@ struct rsi {
struct xdr_netobj in_handle, in_token;
struct xdr_netobj out_handle, out_token;
int major_status, minor_status;
+ struct rcu_head rcu_head;
};

static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old);
@@ -89,13 +90,21 @@ static void rsi_free(struct rsi *rsii)
kfree(rsii->out_token.data);
}

-static void rsi_put(struct kref *ref)
+static void rsi_free_rcu(struct rcu_head *head)
{
- struct rsi *rsii = container_of(ref, struct rsi, h.ref);
+ struct rsi *rsii = container_of(head, struct rsi, rcu_head);
+
rsi_free(rsii);
kfree(rsii);
}

+static void rsi_put(struct kref *ref)
+{
+ struct rsi *rsii = container_of(ref, struct rsi, h.ref);
+
+ call_rcu(&rsii->rcu_head, rsi_free_rcu);
+}
+
static inline int rsi_hash(struct rsi *item)
{
return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS)
@@ -282,7 +291,7 @@ static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item)
struct cache_head *ch;
int hash = rsi_hash(item);

- ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
if (ch)
return container_of(ch, struct rsi, h);
else
@@ -330,6 +339,7 @@ struct rsc {
struct svc_cred cred;
struct gss_svc_seq_data seqdata;
struct gss_ctx *mechctx;
+ struct rcu_head rcu_head;
};

static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old);
@@ -343,12 +353,22 @@ static void rsc_free(struct rsc *rsci)
free_svc_cred(&rsci->cred);
}

+static void rsc_free_rcu(struct rcu_head *head)
+{
+ struct rsc *rsci = container_of(head, struct rsc, rcu_head);
+
+ kfree(rsci->handle.data);
+ kfree(rsci);
+}
+
static void rsc_put(struct kref *ref)
{
struct rsc *rsci = container_of(ref, struct rsc, h.ref);

- rsc_free(rsci);
- kfree(rsci);
+ if (rsci->mechctx)
+ gss_delete_sec_context(&rsci->mechctx);
+ free_svc_cred(&rsci->cred);
+ call_rcu(&rsci->rcu_head, rsc_free_rcu);
}

static inline int
@@ -542,7 +562,7 @@ static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item)
struct cache_head *ch;
int hash = rsc_hash(item);

- ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
if (ch)
return container_of(ch, struct rsc, h);
else
--
2.17.1

2018-10-01 21:20:30

by Trond Myklebust

Subject: [PATCH 01/15] SUNRPC: Remove the server 'authtab_lock' and just use RCU

Module removal is RCU safe by design, so we really have no need to
lock the 'authtab[]' array.

Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/svcauth.c | 52 +++++++++++++++++++++++++++++---------------
1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index bb8db3cb8032..f83443856cd1 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -27,12 +27,32 @@
extern struct auth_ops svcauth_null;
extern struct auth_ops svcauth_unix;

-static DEFINE_SPINLOCK(authtab_lock);
-static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = {
- [0] = &svcauth_null,
- [1] = &svcauth_unix,
+static struct auth_ops __rcu *authtab[RPC_AUTH_MAXFLAVOR] = {
+ [RPC_AUTH_NULL] = (struct auth_ops __force __rcu *)&svcauth_null,
+ [RPC_AUTH_UNIX] = (struct auth_ops __force __rcu *)&svcauth_unix,
};

+static struct auth_ops *
+svc_get_auth_ops(rpc_authflavor_t flavor)
+{
+ struct auth_ops *aops;
+
+ if (flavor >= RPC_AUTH_MAXFLAVOR)
+ return NULL;
+ rcu_read_lock();
+ aops = rcu_dereference(authtab[flavor]);
+ if (aops != NULL && !try_module_get(aops->owner))
+ aops = NULL;
+ rcu_read_unlock();
+ return aops;
+}
+
+static void
+svc_put_auth_ops(struct auth_ops *aops)
+{
+ module_put(aops->owner);
+}
+
int
svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
{
@@ -45,14 +65,11 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)

dprintk("svc: svc_authenticate (%d)\n", flavor);

- spin_lock(&authtab_lock);
- if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) ||
- !try_module_get(aops->owner)) {
- spin_unlock(&authtab_lock);
+ aops = svc_get_auth_ops(flavor);
+ if (aops == NULL) {
*authp = rpc_autherr_badcred;
return SVC_DENIED;
}
- spin_unlock(&authtab_lock);

rqstp->rq_auth_slack = 0;
init_svc_cred(&rqstp->rq_cred);
@@ -82,7 +99,7 @@ int svc_authorise(struct svc_rqst *rqstp)

if (aops) {
rv = aops->release(rqstp);
- module_put(aops->owner);
+ svc_put_auth_ops(aops);
}
return rv;
}
@@ -90,13 +107,14 @@ int svc_authorise(struct svc_rqst *rqstp)
int
svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
{
+ struct auth_ops *old;
int rv = -EINVAL;
- spin_lock(&authtab_lock);
- if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) {
- authtab[flavor] = aops;
- rv = 0;
+
+ if (flavor < RPC_AUTH_MAXFLAVOR) {
+ old = cmpxchg((struct auth_ops ** __force)&authtab[flavor], NULL, aops);
+ if (old == NULL || old == aops)
+ rv = 0;
}
- spin_unlock(&authtab_lock);
return rv;
}
EXPORT_SYMBOL_GPL(svc_auth_register);
@@ -104,10 +122,8 @@ EXPORT_SYMBOL_GPL(svc_auth_register);
void
svc_auth_unregister(rpc_authflavor_t flavor)
{
- spin_lock(&authtab_lock);
if (flavor < RPC_AUTH_MAXFLAVOR)
- authtab[flavor] = NULL;
- spin_unlock(&authtab_lock);
+ rcu_assign_pointer(authtab[flavor], NULL);
}
EXPORT_SYMBOL_GPL(svc_auth_unregister);

--
2.17.1

2018-10-01 21:20:35

by Trond Myklebust

Subject: [PATCH 08/15] NFS: Lockless DNS lookups

Enable RCU protected lookup in the legacy DNS resolver.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dns_resolve.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 060c658eab66..e93a5dc07c8c 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -65,6 +65,7 @@ struct nfs_dns_ent {

struct sockaddr_storage addr;
size_t addrlen;
+ struct rcu_head rcu_head;
};


@@ -101,15 +102,23 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
}
}

-static void nfs_dns_ent_put(struct kref *ref)
+static void nfs_dns_ent_free_rcu(struct rcu_head *head)
{
struct nfs_dns_ent *item;

- item = container_of(ref, struct nfs_dns_ent, h.ref);
+ item = container_of(head, struct nfs_dns_ent, rcu_head);
kfree(item->hostname);
kfree(item);
}

+static void nfs_dns_ent_put(struct kref *ref)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(ref, struct nfs_dns_ent, h.ref);
+ call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu);
+}
+
static struct cache_head *nfs_dns_ent_alloc(void)
{
struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
@@ -195,7 +204,7 @@ static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
{
struct cache_head *ch;

- ch = sunrpc_cache_lookup(cd,
+ ch = sunrpc_cache_lookup_rcu(cd,
&key->h,
nfs_dns_hash(key));
if (!ch)
--
2.17.1

2018-10-01 21:20:33

by Trond Myklebust

Subject: [PATCH 05/15] knfsd: Allow lockless lookups of the exports

Convert structs svc_expkey and svc_export to allow RCU protected lookups.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/export.c | 14 +++++++-------
fs/nfsd/export.h | 2 ++
2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a1143f7c2201..802993d8912f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -46,7 +46,7 @@ static void expkey_put(struct kref *ref)
!test_bit(CACHE_NEGATIVE, &key->h.flags))
path_put(&key->ek_path);
auth_domain_put(key->ek_client);
- kfree(key);
+ kfree_rcu(key, ek_rcu);
}

static void expkey_request(struct cache_detail *cd,
@@ -265,7 +265,7 @@ svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item)
struct cache_head *ch;
int hash = svc_expkey_hash(item);

- ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
if (ch)
return container_of(ch, struct svc_expkey, h);
else
@@ -314,7 +314,7 @@ static void svc_export_put(struct kref *ref)
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
kfree(exp->ex_uuid);
- kfree(exp);
+ kfree_rcu(exp, ex_rcu);
}

static void svc_export_request(struct cache_detail *cd,
@@ -780,7 +780,7 @@ svc_export_lookup(struct svc_export *exp)
struct cache_head *ch;
int hash = svc_export_hash(exp);

- ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash);
+ ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash);
if (ch)
return container_of(ch, struct svc_export, h);
else
@@ -1216,9 +1216,9 @@ static int e_show(struct seq_file *m, void *p)
}

const struct seq_operations nfs_exports_op = {
- .start = cache_seq_start,
- .next = cache_seq_next,
- .stop = cache_seq_stop,
+ .start = cache_seq_start_rcu,
+ .next = cache_seq_next_rcu,
+ .stop = cache_seq_stop_rcu,
.show = e_show,
};

diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index c8b74126ddaa..e7daa1f246f0 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -61,6 +61,7 @@ struct svc_export {
u32 ex_layout_types;
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
+ struct rcu_head ex_rcu;
};

/* an "export key" (expkey) maps a filehandlefragement to an
@@ -75,6 +76,7 @@ struct svc_expkey {
u32 ek_fsid[6];

struct path ek_path;
+ struct rcu_head ek_rcu;
};

#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC))
--
2.17.1

2018-10-01 21:20:33

by Trond Myklebust

Subject: [PATCH 03/15] SUNRPC: Allow cache lookups to use RCU protection rather than the r/w spinlock

Instead of the reader/writer spinlock, allow cache lookups to use RCU
for looking up entries. This is more efficient since modifications can
occur while other entries are being looked up.

Note that for now, we keep the reader/writer spinlock until all users
have been converted to use RCU-safe freeing of their cache entries.

Signed-off-by: Trond Myklebust <[email protected]>
---
include/linux/sunrpc/cache.h | 12 ++++
net/sunrpc/cache.c | 122 +++++++++++++++++++++++++++++------
2 files changed, 114 insertions(+), 20 deletions(-)

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 40d2822f0e2f..cf3e17ee2786 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -167,6 +167,9 @@ extern const struct file_operations cache_file_operations_pipefs;
extern const struct file_operations content_file_operations_pipefs;
extern const struct file_operations cache_flush_operations_pipefs;

+extern struct cache_head *
+sunrpc_cache_lookup_rcu(struct cache_detail *detail,
+ struct cache_head *key, int hash);
extern struct cache_head *
sunrpc_cache_lookup(struct cache_detail *detail,
struct cache_head *key, int hash);
@@ -186,6 +189,12 @@ static inline struct cache_head *cache_get(struct cache_head *h)
return h;
}

+static inline struct cache_head *cache_get_rcu(struct cache_head *h)
+{
+ if (kref_get_unless_zero(&h->ref))
+ return h;
+ return NULL;
+}

static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
{
@@ -227,6 +236,9 @@ extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);
extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
extern void *cache_seq_next(struct seq_file *file, void *p, loff_t *pos);
extern void cache_seq_stop(struct seq_file *file, void *p);
+extern void *cache_seq_start_rcu(struct seq_file *file, loff_t *pos);
+extern void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos);
+extern void cache_seq_stop_rcu(struct seq_file *file, void *p);

extern void qword_add(char **bpp, int *lp, char *str);
extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 109fbe591e7b..7593afed9036 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -54,16 +54,34 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail)
h->last_refresh = now;
}

-struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
- struct cache_head *key, int hash)
+static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail,
+ struct cache_head *key,
+ int hash)
{
- struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL;
- struct hlist_head *head;
+ struct hlist_head *head = &detail->hash_table[hash];
+ struct cache_head *tmp;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp, head, cache_list) {
+ if (detail->match(tmp, key)) {
+ if (cache_is_expired(detail, tmp))
+ continue;
+ tmp = cache_get_rcu(tmp);
+ rcu_read_unlock();
+ return tmp;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}

- head = &detail->hash_table[hash];
+static struct cache_head *sunrpc_cache_find(struct cache_detail *detail,
+ struct cache_head *key, int hash)
+{
+ struct hlist_head *head = &detail->hash_table[hash];
+ struct cache_head *tmp;

read_lock(&detail->hash_lock);
-
hlist_for_each_entry(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp))
@@ -75,7 +93,15 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
}
}
read_unlock(&detail->hash_lock);
- /* Didn't find anything, insert an empty entry */
+ return NULL;
+}
+
+static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
+ struct cache_head *key,
+ int hash)
+{
+ struct cache_head *new, *tmp, *freeme = NULL;
+ struct hlist_head *head = &detail->hash_table[hash];

new = detail->alloc();
if (!new)
@@ -90,10 +116,10 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
write_lock(&detail->hash_lock);

/* check if entry appeared while we slept */
- hlist_for_each_entry(tmp, head, cache_list) {
+ hlist_for_each_entry_rcu(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp)) {
- hlist_del_init(&tmp->cache_list);
+ hlist_del_init_rcu(&tmp->cache_list);
detail->entries --;
freeme = tmp;
break;
@@ -105,7 +131,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
}
}

- hlist_add_head(&new->cache_list, head);
+ hlist_add_head_rcu(&new->cache_list, head);
detail->entries++;
cache_get(new);
write_unlock(&detail->hash_lock);
@@ -114,6 +140,31 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
cache_put(freeme, detail);
return new;
}
+
+struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail,
+ struct cache_head *key, int hash)
+{
+ struct cache_head *ret;
+
+ ret = sunrpc_cache_find_rcu(detail, key, hash);
+ if (ret)
+ return ret;
+ /* Didn't find anything, insert an empty entry */
+ return sunrpc_cache_add_entry(detail, key, hash);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu);
+
+struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
+ struct cache_head *key, int hash)
+{
+ struct cache_head *ret;
+
+ ret = sunrpc_cache_find(detail, key, hash);
+ if (ret)
+ return ret;
+ /* Didn't find anything, insert an empty entry */
+ return sunrpc_cache_add_entry(detail, key, hash);
+}
EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);


@@ -433,7 +484,7 @@ static int cache_clean(void)
if (!cache_is_expired(current_detail, ch))
continue;

- hlist_del_init(&ch->cache_list);
+ hlist_del_init_rcu(&ch->cache_list);
current_detail->entries--;
rv = 1;
break;
@@ -504,7 +555,7 @@ void cache_purge(struct cache_detail *detail)
for (i = 0; i < detail->hash_size; i++) {
head = &detail->hash_table[i];
hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
- hlist_del_init(&ch->cache_list);
+ hlist_del_init_rcu(&ch->cache_list);
detail->entries--;

set_bit(CACHE_CLEANED, &ch->flags);
@@ -1289,21 +1340,19 @@ EXPORT_SYMBOL_GPL(qword_get);
* get a header, then pass each real item in the cache
*/

-void *cache_seq_start(struct seq_file *m, loff_t *pos)
- __acquires(cd->hash_lock)
+static void *__cache_seq_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
unsigned int hash, entry;
struct cache_head *ch;
struct cache_detail *cd = m->private;

- read_lock(&cd->hash_lock);
if (!n--)
return SEQ_START_TOKEN;
hash = n >> 32;
entry = n & ((1LL<<32) - 1);

- hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list)
+ hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list)
if (!entry--)
return ch;
n &= ~((1LL<<32) - 1);
@@ -1315,9 +1364,19 @@ void *cache_seq_start(struct seq_file *m, loff_t *pos)
if (hash >= cd->hash_size)
return NULL;
*pos = n+1;
- return hlist_entry_safe(cd->hash_table[hash].first,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_first_rcu(&cd->hash_table[hash])),
struct cache_head, cache_list);
}
+
+void *cache_seq_start(struct seq_file *m, loff_t *pos)
+ __acquires(cd->hash_lock)
+{
+ struct cache_detail *cd = m->private;
+
+ read_lock(&cd->hash_lock);
+ return __cache_seq_start(m, pos);
+}
EXPORT_SYMBOL_GPL(cache_seq_start);

void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
@@ -1333,7 +1392,8 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
*pos += 1LL<<32;
} else {
++*pos;
- return hlist_entry_safe(ch->cache_list.next,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_next_rcu(&ch->cache_list)),
struct cache_head, cache_list);
}
*pos &= ~((1LL<<32) - 1);
@@ -1345,7 +1405,8 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
if (hash >= cd->hash_size)
return NULL;
++*pos;
- return hlist_entry_safe(cd->hash_table[hash].first,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_first_rcu(&cd->hash_table[hash])),
struct cache_head, cache_list);
}
EXPORT_SYMBOL_GPL(cache_seq_next);
@@ -1358,6 +1419,27 @@ void cache_seq_stop(struct seq_file *m, void *p)
}
EXPORT_SYMBOL_GPL(cache_seq_stop);

+void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return __cache_seq_start(m, pos);
+}
+EXPORT_SYMBOL_GPL(cache_seq_start_rcu);
+
+void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos)
+{
+ return cache_seq_next(file, p, pos);
+}
+EXPORT_SYMBOL_GPL(cache_seq_next_rcu);
+
+void cache_seq_stop_rcu(struct seq_file *m, void *p)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(cache_seq_stop_rcu);
+
static int c_show(struct seq_file *m, void *p)
{
struct cache_head *cp = p;
@@ -1846,7 +1928,7 @@ void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
{
write_lock(&cd->hash_lock);
if (!hlist_unhashed(&h->cache_list)){
- hlist_del_init(&h->cache_list);
+ hlist_del_init_rcu(&h->cache_list);
cd->entries--;
write_unlock(&cd->hash_lock);
cache_put(h, cd);
--
2.17.1

2018-10-01 21:20:39

by Trond Myklebust

Subject: [PATCH 12/15] knfsd: Remove dead code from nfsd_cache_lookup

The preallocated cache entry is always set to type RC_NOCACHE, and that
type isn't changed until we later call nfsd_cache_update().

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/nfscache.c | 8 --------
1 file changed, 8 deletions(-)

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index dbdeb9d6af03..cef4686f87ef 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -446,14 +446,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rp->c_csum = csum;

lru_put_end(b, rp);
-
- /* release any buffer */
- if (rp->c_type == RC_REPLBUFF) {
- drc_mem_usage -= rp->c_replvec.iov_len;
- kfree(rp->c_replvec.iov_base);
- rp->c_replvec.iov_base = NULL;
- }
- rp->c_type = RC_NOCACHE;
out:
spin_unlock(&b->cache_lock);
return rtn;
--
2.17.1

2018-10-01 21:20:40

by Trond Myklebust

Subject: [PATCH 13/15] knfsd: Simplify NFS duplicate replay cache

Simplify the duplicate replay cache by initialising the preallocated
cache entry, so that we can use it as a key for the cache lookup.

Note that the 99.999% case we want to optimise for is still the one
where the lookup fails, and we have to add this entry to the cache,
so preinitialising should not cause a performance penalty.
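
In outline, the resulting flow in nfsd_cache_lookup() is then (a sketch
of the diff below, with the found-entry handling simplified):

	rp = nfsd_reply_cache_alloc(rqstp, csum);  /* fully initialised key */
	spin_lock(&b->cache_lock);
	found = nfsd_cache_insert(b, rp);     /* rp doubles as the search key */
	if (found != rp) {                    /* an existing entry won */
		nfsd_reply_cache_free_locked(rp);
		rp = found;
	}
	spin_unlock(&b->cache_lock);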

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/cache.h | 6 +--
fs/nfsd/nfscache.c | 94 ++++++++++++++++++++++------------------------
2 files changed, 47 insertions(+), 53 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b7559c6f2b97..bd77d5f6fe53 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -24,13 +24,13 @@ struct svc_cacherep {
unsigned char c_state, /* unused, inprog, done */
c_type, /* status, buffer */
c_secure : 1; /* req came from port < 1024 */
- struct sockaddr_in6 c_addr;
__be32 c_xid;
- u32 c_prot;
+ __wsum c_csum;
u32 c_proc;
+ u32 c_prot;
u32 c_vers;
unsigned int c_len;
- __wsum c_csum;
+ struct sockaddr_in6 c_addr;
unsigned long c_timestamp;
union {
struct kvec u_vec;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index cef4686f87ef..527ce4c65765 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -121,7 +121,7 @@ nfsd_cache_hash(__be32 xid)
}

static struct svc_cacherep *
-nfsd_reply_cache_alloc(void)
+nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
{
struct svc_cacherep *rp;

@@ -130,6 +130,16 @@ nfsd_reply_cache_alloc(void)
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
INIT_LIST_HEAD(&rp->c_lru);
+
+ rp->c_xid = rqstp->rq_xid;
+ rp->c_proc = rqstp->rq_proc;
+ memset(&rp->c_addr, 0, sizeof(rp->c_addr));
+ rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
+ rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
+ rp->c_prot = rqstp->rq_prot;
+ rp->c_vers = rqstp->rq_vers;
+ rp->c_len = rqstp->rq_arg.len;
+ rp->c_csum = csum;
}
return rp;
}
@@ -141,9 +151,11 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
}
- list_del(&rp->c_lru);
- atomic_dec(&num_drc_entries);
- drc_mem_usage -= sizeof(*rp);
+ if (rp->c_state != RC_UNUSED) {
+ list_del(&rp->c_lru);
+ atomic_dec(&num_drc_entries);
+ drc_mem_usage -= sizeof(*rp);
+ }
kmem_cache_free(drc_slab, rp);
}

@@ -319,24 +331,23 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
}

static bool
-nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
+nfsd_cache_match(const struct svc_cacherep *key, const struct svc_cacherep *rp)
{
/* Check RPC XID first */
- if (rqstp->rq_xid != rp->c_xid)
+ if (key->c_xid != rp->c_xid)
return false;
/* compare checksum of NFS data */
- if (csum != rp->c_csum) {
+ if (key->c_csum != rp->c_csum) {
++payload_misses;
return false;
}

/* Other discriminators */
- if (rqstp->rq_proc != rp->c_proc ||
- rqstp->rq_prot != rp->c_prot ||
- rqstp->rq_vers != rp->c_vers ||
- rqstp->rq_arg.len != rp->c_len ||
- !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
- rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+ if (key->c_proc != rp->c_proc ||
+ key->c_prot != rp->c_prot ||
+ key->c_vers != rp->c_vers ||
+ key->c_len != rp->c_len ||
+ memcmp(&key->c_addr, &rp->c_addr, sizeof(key->c_addr)) != 0)
return false;

return true;
@@ -345,19 +356,18 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
/*
* Search the request hash for an entry that matches the given rqstp.
* Must be called with cache_lock held. Returns the found entry or
- * NULL on failure.
+ * inserts an empty key on failure.
*/
static struct svc_cacherep *
-nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
- __wsum csum)
+nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key)
{
- struct svc_cacherep *rp, *ret = NULL;
+ struct svc_cacherep *rp, *ret = key;
struct list_head *rh = &b->lru_head;
unsigned int entries = 0;

list_for_each_entry(rp, rh, c_lru) {
++entries;
- if (nfsd_cache_match(rqstp, csum, rp)) {
+ if (nfsd_cache_match(key, rp)) {
ret = rp;
break;
}
@@ -374,6 +384,7 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
atomic_read(&num_drc_entries));
}

+ lru_put_end(b, ret);
return ret;
}

@@ -389,9 +400,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
{
struct svc_cacherep *rp, *found;
__be32 xid = rqstp->rq_xid;
- u32 proto = rqstp->rq_prot,
- vers = rqstp->rq_vers,
- proc = rqstp->rq_proc;
__wsum csum;
u32 hash = nfsd_cache_hash(xid);
struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
@@ -410,52 +418,38 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
* Since the common case is a cache miss followed by an insert,
* preallocate an entry.
*/
- rp = nfsd_reply_cache_alloc();
- spin_lock(&b->cache_lock);
- if (likely(rp)) {
- atomic_inc(&num_drc_entries);
- drc_mem_usage += sizeof(*rp);
+ rp = nfsd_reply_cache_alloc(rqstp, csum);
+ if (!rp) {
+ dprintk("nfsd: unable to allocate DRC entry!\n");
+ return rtn;
}

- /* go ahead and prune the cache */
- prune_bucket(b);
-
- found = nfsd_cache_search(b, rqstp, csum);
- if (found) {
- if (likely(rp))
- nfsd_reply_cache_free_locked(rp);
+ spin_lock(&b->cache_lock);
+ found = nfsd_cache_insert(b, rp);
+ if (found != rp) {
+ nfsd_reply_cache_free_locked(rp);
rp = found;
goto found_entry;
}

- if (!rp) {
- dprintk("nfsd: unable to allocate DRC entry!\n");
- goto out;
- }
-
nfsdstats.rcmisses++;
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
- rp->c_xid = xid;
- rp->c_proc = proc;
- rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
- rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
- rp->c_prot = proto;
- rp->c_vers = vers;
- rp->c_len = rqstp->rq_arg.len;
- rp->c_csum = csum;

- lru_put_end(b, rp);
+ atomic_inc(&num_drc_entries);
+ drc_mem_usage += sizeof(*rp);
+
+ /* go ahead and prune the cache */
+ prune_bucket(b);
out:
spin_unlock(&b->cache_lock);
return rtn;

found_entry:
- nfsdstats.rchits++;
/* We found a matching entry which is either in progress or done. */
- lru_put_end(b, rp);
-
+ nfsdstats.rchits++;
rtn = RC_DROPIT;
+
/* Request being processed */
if (rp->c_state == RC_INPROG)
goto out;
--
2.17.1

2018-10-01 21:20:32

by Trond Myklebust

Subject: [PATCH 04/15] SUNRPC: Make server side AUTH_UNIX use lockless lookups

Convert structs ip_map and unix_gid to use RCU protected lookups.

Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/svcauth_unix.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 84cf39021a03..fb9041b92f72 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -97,6 +97,7 @@ struct ip_map {
char m_class[8]; /* e.g. "nfsd" */
struct in6_addr m_addr;
struct unix_domain *m_client;
+ struct rcu_head m_rcu;
};

static void ip_map_put(struct kref *kref)
@@ -107,7 +108,7 @@ static void ip_map_put(struct kref *kref)
if (test_bit(CACHE_VALID, &item->flags) &&
!test_bit(CACHE_NEGATIVE, &item->flags))
auth_domain_put(&im->m_client->h);
- kfree(im);
+ kfree_rcu(im, m_rcu);
}

static inline int hash_ip6(const struct in6_addr *ip)
@@ -286,9 +287,9 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,

strcpy(ip.m_class, class);
ip.m_addr = *addr;
- ch = sunrpc_cache_lookup(cd, &ip.h,
- hash_str(class, IP_HASHBITS) ^
- hash_ip6(addr));
+ ch = sunrpc_cache_lookup_rcu(cd, &ip.h,
+ hash_str(class, IP_HASHBITS) ^
+ hash_ip6(addr));

if (ch)
return container_of(ch, struct ip_map, h);
@@ -418,6 +419,7 @@ struct unix_gid {
struct cache_head h;
kuid_t uid;
struct group_info *gi;
+ struct rcu_head rcu;
};

static int unix_gid_hash(kuid_t uid)
@@ -432,7 +434,7 @@ static void unix_gid_put(struct kref *kref)
if (test_bit(CACHE_VALID, &item->flags) &&
!test_bit(CACHE_NEGATIVE, &item->flags))
put_group_info(ug->gi);
- kfree(ug);
+ kfree_rcu(ug, rcu);
}

static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew)
@@ -625,7 +627,7 @@ static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid)
struct cache_head *ch;

ug.uid = uid;
- ch = sunrpc_cache_lookup(cd, &ug.h, unix_gid_hash(uid));
+ ch = sunrpc_cache_lookup_rcu(cd, &ug.h, unix_gid_hash(uid));
if (ch)
return container_of(ch, struct unix_gid, h);
else
--
2.17.1

2018-10-01 21:20:31

by Trond Myklebust

Subject: [PATCH 02/15] SUNRPC: Add lockless lookup of the server's auth domain

Avoid taking the global auth_domain_lock in most lookups of the auth domain
by adding an RCU protected lookup.

Signed-off-by: Trond Myklebust <[email protected]>
---
include/linux/sunrpc/svcauth.h | 1 +
net/sunrpc/auth_gss/svcauth_gss.c | 9 ++++++++-
net/sunrpc/svcauth.c | 22 +++++++++++++++++++---
net/sunrpc/svcauth_unix.c | 10 ++++++++--
4 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 04e404a07882..3e53a6e2ada7 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -82,6 +82,7 @@ struct auth_domain {
struct hlist_node hash;
char *name;
struct auth_ops *flavour;
+ struct rcu_head rcu_head;
};

/*
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 860f2a1bbb67..87c71fb0f0ea 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1764,14 +1764,21 @@ svcauth_gss_release(struct svc_rqst *rqstp)
}

static void
-svcauth_gss_domain_release(struct auth_domain *dom)
+svcauth_gss_domain_release_rcu(struct rcu_head *head)
{
+ struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head);
struct gss_domain *gd = container_of(dom, struct gss_domain, h);

kfree(dom->name);
kfree(gd);
}

+static void
+svcauth_gss_domain_release(struct auth_domain *dom)
+{
+ call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu);
+}
+
static struct auth_ops svcauthops_gss = {
.name = "rpcsec_gss",
.owner = THIS_MODULE,
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index f83443856cd1..775b8c94265b 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -143,10 +143,11 @@ static struct hlist_head auth_domain_table[DN_HASHMAX];
static DEFINE_SPINLOCK(auth_domain_lock);

static void auth_domain_release(struct kref *kref)
+ __releases(&auth_domain_lock)
{
struct auth_domain *dom = container_of(kref, struct auth_domain, ref);

- hlist_del(&dom->hash);
+ hlist_del_rcu(&dom->hash);
dom->flavour->domain_release(dom);
spin_unlock(&auth_domain_lock);
}
@@ -175,7 +176,7 @@ auth_domain_lookup(char *name, struct auth_domain *new)
}
}
if (new)
- hlist_add_head(&new->hash, head);
+ hlist_add_head_rcu(&new->hash, head);
spin_unlock(&auth_domain_lock);
return new;
}
@@ -183,6 +184,21 @@ EXPORT_SYMBOL_GPL(auth_domain_lookup);

struct auth_domain *auth_domain_find(char *name)
{
- return auth_domain_lookup(name, NULL);
+ struct auth_domain *hp;
+ struct hlist_head *head;
+
+ head = &auth_domain_table[hash_str(name, DN_HASHBITS)];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hp, head, hash) {
+ if (strcmp(hp->name, name)==0) {
+ if (!kref_get_unless_zero(&hp->ref))
+ hp = NULL;
+ rcu_read_unlock();
+ return hp;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
}
EXPORT_SYMBOL_GPL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index af7f28fb8102..84cf39021a03 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -37,20 +37,26 @@ struct unix_domain {
extern struct auth_ops svcauth_null;
extern struct auth_ops svcauth_unix;

-static void svcauth_unix_domain_release(struct auth_domain *dom)
+static void svcauth_unix_domain_release_rcu(struct rcu_head *head)
{
+ struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head);
struct unix_domain *ud = container_of(dom, struct unix_domain, h);

kfree(dom->name);
kfree(ud);
}

+static void svcauth_unix_domain_release(struct auth_domain *dom)
+{
+ call_rcu(&dom->rcu_head, svcauth_unix_domain_release_rcu);
+}
+
struct auth_domain *unix_domain_find(char *name)
{
struct auth_domain *rv;
struct unix_domain *new = NULL;

- rv = auth_domain_lookup(name, NULL);
+ rv = auth_domain_find(name);
while(1) {
if (rv) {
if (new && rv != &new->h)
--
2.17.1

2018-10-01 21:20:34

by Trond Myklebust

Subject: [PATCH 07/15] knfsd: Lockless lookup of NFSv4 identities.

Enable RCU protected lookups of the NFSv4 idmap.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/nfs4idmap.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a5bb76593ce7..bf137fec33ff 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -65,6 +65,7 @@ struct ent {
u32 id;
char name[IDMAP_NAMESZ];
char authname[IDMAP_NAMESZ];
+ struct rcu_head rcu_head;
};

/* Common entry handling */
@@ -89,7 +90,7 @@ static void
ent_put(struct kref *ref)
{
struct ent *map = container_of(ref, struct ent, h.ref);
- kfree(map);
+ kfree_rcu(map, rcu_head);
}

static struct cache_head *
@@ -264,8 +265,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
static struct ent *
idtoname_lookup(struct cache_detail *cd, struct ent *item)
{
- struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
- idtoname_hash(item));
+ struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+ idtoname_hash(item));
if (ch)
return container_of(ch, struct ent, h);
else
@@ -422,8 +423,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
static struct ent *
nametoid_lookup(struct cache_detail *cd, struct ent *item)
{
- struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
- nametoid_hash(item));
+ struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+ nametoid_hash(item));
if (ch)
return container_of(ch, struct ent, h);
else
--
2.17.1

2018-10-01 21:20:38

by Trond Myklebust

Subject: [PATCH 10/15] SUNRPC: Replace the cache_detail->hash_lock with a regular spinlock

Now that the reader functions are all RCU protected, use a regular
spinlock rather than a reader/writer lock.

Signed-off-by: Trond Myklebust <[email protected]>
---
include/linux/sunrpc/cache.h | 2 +-
net/sunrpc/cache.c | 46 ++++++++++++++++++------------------
2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index c3d67e893430..5a3e95017fc6 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -67,7 +67,7 @@ struct cache_detail {
struct module * owner;
int hash_size;
struct hlist_head * hash_table;
- rwlock_t hash_lock;
+ spinlock_t hash_lock;

char *name;
void (*cache_put)(struct kref *);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 593cf8607414..f96345b1180e 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -92,7 +92,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
cache_init(new, detail);
detail->init(new, key);

- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);

/* check if entry appeared while we slept */
hlist_for_each_entry_rcu(tmp, head, cache_list) {
@@ -104,7 +104,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
break;
}
cache_get(tmp);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_put(new, detail);
return tmp;
}
@@ -113,7 +113,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
hlist_add_head_rcu(&new->cache_list, head);
detail->entries++;
cache_get(new);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);

if (freeme)
cache_put(freeme, detail);
@@ -167,18 +167,18 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
struct cache_head *tmp;

if (!test_bit(CACHE_VALID, &old->flags)) {
- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
if (!test_bit(CACHE_VALID, &old->flags)) {
if (test_bit(CACHE_NEGATIVE, &new->flags))
set_bit(CACHE_NEGATIVE, &old->flags);
else
detail->update(old, new);
cache_fresh_locked(old, new->expiry_time, detail);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(old, detail);
return old;
}
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
}
/* We need to insert a new entry */
tmp = detail->alloc();
@@ -189,7 +189,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
cache_init(tmp, detail);
detail->init(tmp, old);

- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
if (test_bit(CACHE_NEGATIVE, &new->flags))
set_bit(CACHE_NEGATIVE, &tmp->flags);
else
@@ -199,7 +199,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
cache_get(tmp);
cache_fresh_locked(tmp, new->expiry_time, detail);
cache_fresh_locked(old, 0, detail);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(tmp, detail);
cache_fresh_unlocked(old, detail);
cache_put(old, detail);
@@ -239,7 +239,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
{
int rv;

- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
rv = cache_is_valid(h);
if (rv == -EAGAIN) {
set_bit(CACHE_NEGATIVE, &h->flags);
@@ -247,7 +247,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
detail);
rv = -ENOENT;
}
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(h, detail);
return rv;
}
@@ -357,7 +357,7 @@ static struct delayed_work cache_cleaner;

void sunrpc_init_cache_detail(struct cache_detail *cd)
{
- rwlock_init(&cd->hash_lock);
+ spin_lock_init(&cd->hash_lock);
INIT_LIST_HEAD(&cd->queue);
spin_lock(&cache_list_lock);
cd->nextcheck = 0;
@@ -377,11 +377,11 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
{
cache_purge(cd);
spin_lock(&cache_list_lock);
- write_lock(&cd->hash_lock);
+ spin_lock(&cd->hash_lock);
if (current_detail == cd)
current_detail = NULL;
list_del_init(&cd->others);
- write_unlock(&cd->hash_lock);
+ spin_unlock(&cd->hash_lock);
spin_unlock(&cache_list_lock);
if (list_empty(&cache_list)) {
/* module must be being unloaded so its safe to kill the worker */
@@ -438,7 +438,7 @@ static int cache_clean(void)
struct hlist_head *head;
struct hlist_node *tmp;

- write_lock(&current_detail->hash_lock);
+ spin_lock(&current_detail->hash_lock);

/* Ok, now to clean this strand */

@@ -455,7 +455,7 @@ static int cache_clean(void)
break;
}

- write_unlock(&current_detail->hash_lock);
+ spin_unlock(&current_detail->hash_lock);
d = current_detail;
if (!ch)
current_index ++;
@@ -510,9 +510,9 @@ void cache_purge(struct cache_detail *detail)
struct hlist_node *tmp = NULL;
int i = 0;

- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
if (!detail->entries) {
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
return;
}

@@ -524,13 +524,13 @@ void cache_purge(struct cache_detail *detail)
detail->entries--;

set_bit(CACHE_CLEANED, &ch->flags);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(ch, detail);
cache_put(ch, detail);
- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
}
}
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
}
EXPORT_SYMBOL_GPL(cache_purge);

@@ -1873,13 +1873,13 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);

void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
{
- write_lock(&cd->hash_lock);
+ spin_lock(&cd->hash_lock);
if (!hlist_unhashed(&h->cache_list)){
hlist_del_init_rcu(&h->cache_list);
cd->entries--;
- write_unlock(&cd->hash_lock);
+ spin_unlock(&cd->hash_lock);
cache_put(h, cd);
} else
- write_unlock(&cd->hash_lock);
+ spin_unlock(&cd->hash_lock);
}
EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
--
2.17.1

2018-10-01 21:20:41

by Trond Myklebust

Subject: [PATCH 14/15] knfsd: Further simplify the cache lookup

Order the structure so that the key can be compared using memcmp().

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/cache.h | 18 ++++++++++--------
fs/nfsd/nfscache.c | 45 ++++++++++++++++-----------------------------
2 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index bd77d5f6fe53..cbcdc15753b7 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -19,18 +19,20 @@
* is much larger than a sockaddr_in6.
*/
struct svc_cacherep {
- struct list_head c_lru;
+ struct {
+ __be32 k_xid;
+ __wsum k_csum;
+ u32 k_proc;
+ u32 k_prot;
+ u32 k_vers;
+ unsigned int k_len;
+ struct sockaddr_in6 k_addr;
+ } c_key;

+ struct list_head c_lru;
unsigned char c_state, /* unused, inprog, done */
c_type, /* status, buffer */
c_secure : 1; /* req came from port < 1024 */
- __be32 c_xid;
- __wsum c_csum;
- u32 c_proc;
- u32 c_prot;
- u32 c_vers;
- unsigned int c_len;
- struct sockaddr_in6 c_addr;
unsigned long c_timestamp;
union {
struct kvec u_vec;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 527ce4c65765..230cc83921ad 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -131,15 +131,15 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
rp->c_type = RC_NOCACHE;
INIT_LIST_HEAD(&rp->c_lru);

- rp->c_xid = rqstp->rq_xid;
- rp->c_proc = rqstp->rq_proc;
- memset(&rp->c_addr, 0, sizeof(rp->c_addr));
- rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
- rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
- rp->c_prot = rqstp->rq_prot;
- rp->c_vers = rqstp->rq_vers;
- rp->c_len = rqstp->rq_arg.len;
- rp->c_csum = csum;
+ memset(&rp->c_key, 0, sizeof(rp->c_key));
+ rp->c_key.k_xid = rqstp->rq_xid;
+ rp->c_key.k_proc = rqstp->rq_proc;
+ rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp));
+ rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp)));
+ rp->c_key.k_prot = rqstp->rq_prot;
+ rp->c_key.k_vers = rqstp->rq_vers;
+ rp->c_key.k_len = rqstp->rq_arg.len;
+ rp->c_key.k_csum = csum;
}
return rp;
}
@@ -330,27 +330,14 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
return csum;
}

-static bool
-nfsd_cache_match(const struct svc_cacherep *key, const struct svc_cacherep *rp)
+static int
+nfsd_cache_key_cmp(const struct svc_cacherep *key, const struct svc_cacherep *rp)
{
- /* Check RPC XID first */
- if (key->c_xid != rp->c_xid)
- return false;
- /* compare checksum of NFS data */
- if (key->c_csum != rp->c_csum) {
+ if (key->c_key.k_xid == rp->c_key.k_xid &&
+ key->c_key.k_csum != rp->c_key.k_csum)
++payload_misses;
- return false;
- }
-
- /* Other discriminators */
- if (key->c_proc != rp->c_proc ||
- key->c_prot != rp->c_prot ||
- key->c_vers != rp->c_vers ||
- key->c_len != rp->c_len ||
- memcmp(&key->c_addr, &rp->c_addr, sizeof(key->c_addr)) != 0)
- return false;

- return true;
+ return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key));
}

/*
@@ -367,7 +354,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key)

list_for_each_entry(rp, rh, c_lru) {
++entries;
- if (nfsd_cache_match(key, rp)) {
+ if (nfsd_cache_key_cmp(key, rp) == 0) {
ret = rp;
break;
}
@@ -510,7 +497,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
if (!rp)
return;

- hash = nfsd_cache_hash(rp->c_xid);
+ hash = nfsd_cache_hash(rp->c_key.k_xid);
b = &drc_hashtbl[hash];

len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
--
2.17.1

2018-10-01 21:20:42

by Trond Myklebust

Subject: [PATCH 15/15] knfsd: Improve lookup performance in the duplicate reply cache using an rbtree

Use an rbtree to ensure the lookup/insert of an entry in a DRC bucket is
O(log(N)).

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/cache.h | 1 +
fs/nfsd/nfscache.c | 37 ++++++++++++++++++++++++++-----------
2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index cbcdc15753b7..ef1c8aa89ca4 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -29,6 +29,7 @@ struct svc_cacherep {
struct sockaddr_in6 k_addr;
} c_key;

+ struct rb_node c_node;
struct list_head c_lru;
unsigned char c_state, /* unused, inprog, done */
c_type, /* status, buffer */
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 230cc83921ad..e2fe0e9ce0df 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -30,6 +30,7 @@
#define TARGET_BUCKET_SIZE 64

struct nfsd_drc_bucket {
+ struct rb_root rb_head;
struct list_head lru_head;
spinlock_t cache_lock;
};
@@ -129,6 +130,7 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
if (rp) {
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
+ RB_CLEAR_NODE(&rp->c_node);
INIT_LIST_HEAD(&rp->c_lru);

memset(&rp->c_key, 0, sizeof(rp->c_key));
@@ -145,13 +147,14 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
}

static void
-nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
+nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
}
if (rp->c_state != RC_UNUSED) {
+ rb_erase(&rp->c_node, &b->rb_head);
list_del(&rp->c_lru);
atomic_dec(&num_drc_entries);
drc_mem_usage -= sizeof(*rp);
@@ -163,7 +166,7 @@ static void
nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
spin_lock(&b->cache_lock);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
spin_unlock(&b->cache_lock);
}

@@ -219,7 +222,7 @@ void nfsd_reply_cache_shutdown(void)
struct list_head *head = &drc_hashtbl[i].lru_head;
while (!list_empty(head)) {
rp = list_first_entry(head, struct svc_cacherep, c_lru);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(&drc_hashtbl[i], rp);
}
}

@@ -258,7 +261,7 @@ prune_bucket(struct nfsd_drc_bucket *b)
if (atomic_read(&num_drc_entries) <= max_drc_entries &&
time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
freed++;
}
return freed;
@@ -349,17 +352,29 @@ static struct svc_cacherep *
nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key)
{
struct svc_cacherep *rp, *ret = key;
- struct list_head *rh = &b->lru_head;
+ struct rb_node **p = &b->rb_head.rb_node,
+ *parent = NULL;
unsigned int entries = 0;
+ int cmp;

- list_for_each_entry(rp, rh, c_lru) {
+ while (*p != NULL) {
++entries;
- if (nfsd_cache_key_cmp(key, rp) == 0) {
+ parent = *p;
+ rp = rb_entry(parent, struct svc_cacherep, c_node);
+
+ cmp = nfsd_cache_key_cmp(key, rp);
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else {
ret = rp;
- break;
+ goto out;
}
}
-
+ rb_link_node(&key->c_node, parent, p);
+ rb_insert_color(&key->c_node, &b->rb_head);
+out:
/* tally hash chain length stats */
if (entries > longest_chain) {
longest_chain = entries;
@@ -414,7 +429,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
spin_lock(&b->cache_lock);
found = nfsd_cache_insert(b, rp);
if (found != rp) {
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(NULL, rp);
rp = found;
goto found_entry;
}
@@ -462,7 +477,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
break;
default:
printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
}

goto out;
--
2.17.1

2018-10-01 21:20:37

by Trond Myklebust

Subject: [PATCH 09/15] SUNRPC: Remove non-RCU protected lookup

Clean up the cache code by removing the non-RCU protected lookup.

Signed-off-by: Trond Myklebust <[email protected]>
---
Documentation/filesystems/nfs/rpc-cache.txt | 6 +-
include/linux/sunrpc/cache.h | 6 --
net/sunrpc/cache.c | 61 ++-------------------
3 files changed, 7 insertions(+), 66 deletions(-)

diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt
index ebcaaee21616..c4dac829db0f 100644
--- a/Documentation/filesystems/nfs/rpc-cache.txt
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
@@ -84,7 +84,7 @@ Creating a Cache
A message from user space has arrived to fill out a
cache entry. It is in 'buf' of length 'len'.
cache_parse should parse this, find the item in the
- cache with sunrpc_cache_lookup, and update the item
+ cache with sunrpc_cache_lookup_rcu, and update the item
with sunrpc_cache_update.


@@ -95,7 +95,7 @@ Creating a Cache
Using a cache
-------------

-To find a value in a cache, call sunrpc_cache_lookup passing a pointer
+To find a value in a cache, call sunrpc_cache_lookup_rcu passing a pointer
to the cache_head in a sample item with the 'key' fields filled in.
This will be passed to ->match to identify the target entry. If no
entry is found, a new entry will be create, added to the cache, and
@@ -116,7 +116,7 @@ item does become valid, the deferred copy of the request will be
revisited (->revisit). It is expected that this method will
reschedule the request for processing.

-The value returned by sunrpc_cache_lookup can also be passed to
+The value returned by sunrpc_cache_lookup_rcu can also be passed to
sunrpc_cache_update to set the content for the item. A second item is
passed which should hold the content. If the item found by _lookup
has valid data, then it is discarded and a new item is created. This
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index cf3e17ee2786..c3d67e893430 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -171,9 +171,6 @@ extern struct cache_head *
sunrpc_cache_lookup_rcu(struct cache_detail *detail,
struct cache_head *key, int hash);
extern struct cache_head *
-sunrpc_cache_lookup(struct cache_detail *detail,
- struct cache_head *key, int hash);
-extern struct cache_head *
sunrpc_cache_update(struct cache_detail *detail,
struct cache_head *new, struct cache_head *old, int hash);

@@ -233,9 +230,6 @@ extern void sunrpc_cache_unregister_pipefs(struct cache_detail *);
extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);

/* Must store cache_detail in seq_file->private if using next three functions */
-extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
-extern void *cache_seq_next(struct seq_file *file, void *p, loff_t *pos);
-extern void cache_seq_stop(struct seq_file *file, void *p);
extern void *cache_seq_start_rcu(struct seq_file *file, loff_t *pos);
extern void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos);
extern void cache_seq_stop_rcu(struct seq_file *file, void *p);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 7593afed9036..593cf8607414 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -75,27 +75,6 @@ static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail,
return NULL;
}

-static struct cache_head *sunrpc_cache_find(struct cache_detail *detail,
- struct cache_head *key, int hash)
-{
- struct hlist_head *head = &detail->hash_table[hash];
- struct cache_head *tmp;
-
- read_lock(&detail->hash_lock);
- hlist_for_each_entry(tmp, head, cache_list) {
- if (detail->match(tmp, key)) {
- if (cache_is_expired(detail, tmp))
- /* This entry is expired, we will discard it. */
- break;
- cache_get(tmp);
- read_unlock(&detail->hash_lock);
- return tmp;
- }
- }
- read_unlock(&detail->hash_lock);
- return NULL;
-}
-
static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
struct cache_head *key,
int hash)
@@ -154,20 +133,6 @@ struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail,
}
EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu);

-struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
- struct cache_head *key, int hash)
-{
- struct cache_head *ret;
-
- ret = sunrpc_cache_find(detail, key, hash);
- if (ret)
- return ret;
- /* Didn't find anything, insert an empty entry */
- return sunrpc_cache_add_entry(detail, key, hash);
-}
-EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
-
-
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);

static void cache_fresh_locked(struct cache_head *head, time_t expiry,
@@ -1369,17 +1334,7 @@ static void *__cache_seq_start(struct seq_file *m, loff_t *pos)
struct cache_head, cache_list);
}

-void *cache_seq_start(struct seq_file *m, loff_t *pos)
- __acquires(cd->hash_lock)
-{
- struct cache_detail *cd = m->private;
-
- read_lock(&cd->hash_lock);
- return __cache_seq_start(m, pos);
-}
-EXPORT_SYMBOL_GPL(cache_seq_start);
-
-void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
+static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
{
struct cache_head *ch = p;
int hash = (*pos >> 32);
@@ -1411,14 +1366,6 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
}
EXPORT_SYMBOL_GPL(cache_seq_next);

-void cache_seq_stop(struct seq_file *m, void *p)
- __releases(cd->hash_lock)
-{
- struct cache_detail *cd = m->private;
- read_unlock(&cd->hash_lock);
-}
-EXPORT_SYMBOL_GPL(cache_seq_stop);
-
void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos)
__acquires(RCU)
{
@@ -1466,9 +1413,9 @@ static int c_show(struct seq_file *m, void *p)
}

static const struct seq_operations cache_content_op = {
- .start = cache_seq_start,
- .next = cache_seq_next,
- .stop = cache_seq_stop,
+ .start = cache_seq_start_rcu,
+ .next = cache_seq_next_rcu,
+ .stop = cache_seq_stop_rcu,
.show = c_show,
};

--
2.17.1

2018-10-01 21:20:38

by Trond Myklebust

Subject: [PATCH 11/15] SUNRPC: Simplify TCP receive code

Use the fact that the iov iterators already have functionality for
skipping a base offset.

Signed-off-by: Trond Myklebust <[email protected]>
---
net/sunrpc/svcsock.c | 53 ++++++++++++--------------------------------
1 file changed, 14 insertions(+), 39 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5445145e639c..64765f08ae07 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -325,59 +325,34 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
/*
* Generic recvfrom routine.
*/
-static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
- int buflen)
+static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov,
+ unsigned int nr, size_t buflen, unsigned int base)
{
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT,
- };
- int len;
+ struct msghdr msg = { NULL };
+ ssize_t len;

rqstp->rq_xprt_hlen = 0;

clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nr, buflen);
- len = sock_recvmsg(svsk->sk_sock, &msg, msg.msg_flags);
+ if (base != 0) {
+ iov_iter_advance(&msg.msg_iter, base);
+ buflen -= base;
+ }
+ len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT);
/* If we read a full record, then assume there may be more
* data to read (stream based sockets only!)
*/
if (len == buflen)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);

- dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n",
+ dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n",
svsk, iov[0].iov_base, iov[0].iov_len, len);
return len;
}

-static int svc_partial_recvfrom(struct svc_rqst *rqstp,
- struct kvec *iov, int nr,
- int buflen, unsigned int base)
-{
- size_t save_iovlen;
- void *save_iovbase;
- unsigned int i;
- int ret;
-
- if (base == 0)
- return svc_recvfrom(rqstp, iov, nr, buflen);
-
- for (i = 0; i < nr; i++) {
- if (iov[i].iov_len > base)
- break;
- base -= iov[i].iov_len;
- }
- save_iovlen = iov[i].iov_len;
- save_iovbase = iov[i].iov_base;
- iov[i].iov_len -= base;
- iov[i].iov_base += base;
- ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen);
- iov[i].iov_len = save_iovlen;
- iov[i].iov_base = save_iovbase;
- return ret;
-}
-
/*
* Set socket snd and rcv buffer lengths
*/
@@ -962,7 +937,8 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
iov.iov_len = want;
- if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
+ len = svc_recvfrom(rqstp, &iov, 1, want, 0);
+ if (len < 0)
goto error;
svsk->sk_tcplen += len;

@@ -1088,14 +1064,13 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)

vec = rqstp->rq_vec;

- pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
- svsk->sk_datalen + want);
+ pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want);

rqstp->rq_respages = &rqstp->rq_pages[pnum];
rqstp->rq_next_page = rqstp->rq_respages + 1;

/* Now receive data */
- len = svc_partial_recvfrom(rqstp, vec, pnum, want, base);
+ len = svc_recvfrom(rqstp, vec, pnum, base + want, base);
if (len >= 0) {
svsk->sk_tcplen += len;
svsk->sk_datalen += len;
--
2.17.1

2018-10-01 22:07:23

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH 00/15] Performance improvements for knfsd

On Mon, 2018-10-01 at 10:41 -0400, Trond Myklebust wrote:
> [...]

I forgot to add. The full patchset is also hosted on
git://git.linux-nfs.org/projects/trondmy/linux-nfs.git in the
'knfsd-devel' branch:

git pull git://git.linux-nfs.org/projects/trondmy/linux-nfs.git knfsd-devel

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com

2018-10-03 02:24:06

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 01/15] SUNRPC: Remove the server 'authtab_lock' and just use RCU

On Mon, Oct 01, 2018 at 10:41:43AM -0400, Trond Myklebust wrote:
> Module removal is RCU safe by design, so we really have no need to
> lock the 'authtab[]' array.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> net/sunrpc/svcauth.c | 52 +++++++++++++++++++++++++++++---------------
> 1 file changed, 34 insertions(+), 18 deletions(-)
>
> diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
> index bb8db3cb8032..f83443856cd1 100644
> --- a/net/sunrpc/svcauth.c
> +++ b/net/sunrpc/svcauth.c
> @@ -27,12 +27,32 @@
> extern struct auth_ops svcauth_null;
> extern struct auth_ops svcauth_unix;
>
> -static DEFINE_SPINLOCK(authtab_lock);
> -static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = {
> - [0] = &svcauth_null,
> - [1] = &svcauth_unix,
> +static struct auth_ops __rcu *authtab[RPC_AUTH_MAXFLAVOR] = {
> + [RPC_AUTH_NULL] = (struct auth_ops __force __rcu *)&svcauth_null,
> + [RPC_AUTH_UNIX] = (struct auth_ops __force __rcu *)&svcauth_unix,

I didn't know about "__rcu" and "__force __rcu". Grepping turns up
relevant documentation for RCU_INIT_POINTER in include/linux/rcupdate.h.
OK, I think I get it.--b.
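
For anyone else catching up, the pattern in miniature, as a sketch with
a hypothetical one-slot table (the RCU, cmpxchg() and module calls are
the real API; the names and the -EEXIST return are made up):

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/sunrpc/svcauth.h>

static struct auth_ops __rcu *example_slot;

static struct auth_ops *example_get(void)
{
	struct auth_ops *aops;

	rcu_read_lock();
	aops = rcu_dereference(example_slot);	/* annotated read, no lock */
	if (aops != NULL && !try_module_get(aops->owner))
		aops = NULL;		/* module already on its way out */
	rcu_read_unlock();
	return aops;
}

static int example_register(struct auth_ops *aops)
{
	/* Atomic publish; a successful cmpxchg() implies the barrier
	 * that rcu_assign_pointer() would otherwise provide. */
	if (cmpxchg((struct auth_ops ** __force)&example_slot, NULL, aops))
		return -EEXIST;
	return 0;
}

static void example_unregister(void)
{
	rcu_assign_pointer(example_slot, NULL);
}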

> };
>
> +static struct auth_ops *
> +svc_get_auth_ops(rpc_authflavor_t flavor)
> +{
> + struct auth_ops *aops;
> +
> + if (flavor >= RPC_AUTH_MAXFLAVOR)
> + return NULL;
> + rcu_read_lock();
> + aops = rcu_dereference(authtab[flavor]);
> + if (aops != NULL && !try_module_get(aops->owner))
> + aops = NULL;
> + rcu_read_unlock();
> + return aops;
> +}
> +
> +static void
> +svc_put_auth_ops(struct auth_ops *aops)
> +{
> + module_put(aops->owner);
> +}
> +
> int
> svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
> {
> @@ -45,14 +65,11 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
>
> dprintk("svc: svc_authenticate (%d)\n", flavor);
>
> - spin_lock(&authtab_lock);
> - if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) ||
> - !try_module_get(aops->owner)) {
> - spin_unlock(&authtab_lock);
> + aops = svc_get_auth_ops(flavor);
> + if (aops == NULL) {
> *authp = rpc_autherr_badcred;
> return SVC_DENIED;
> }
> - spin_unlock(&authtab_lock);
>
> rqstp->rq_auth_slack = 0;
> init_svc_cred(&rqstp->rq_cred);
> @@ -82,7 +99,7 @@ int svc_authorise(struct svc_rqst *rqstp)
>
> if (aops) {
> rv = aops->release(rqstp);
> - module_put(aops->owner);
> + svc_put_auth_ops(aops);
> }
> return rv;
> }
> @@ -90,13 +107,14 @@ int svc_authorise(struct svc_rqst *rqstp)
> int
> svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
> {
> + struct auth_ops *old;
> int rv = -EINVAL;
> - spin_lock(&authtab_lock);
> - if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) {
> - authtab[flavor] = aops;
> - rv = 0;
> +
> + if (flavor < RPC_AUTH_MAXFLAVOR) {
> + old = cmpxchg((struct auth_ops ** __force)&authtab[flavor], NULL, aops);
> + if (old == NULL || old == aops)
> + rv = 0;
> }
> - spin_unlock(&authtab_lock);
> return rv;
> }
> EXPORT_SYMBOL_GPL(svc_auth_register);
> @@ -104,10 +122,8 @@ EXPORT_SYMBOL_GPL(svc_auth_register);
> void
> svc_auth_unregister(rpc_authflavor_t flavor)
> {
> - spin_lock(&authtab_lock);
> if (flavor < RPC_AUTH_MAXFLAVOR)
> - authtab[flavor] = NULL;
> - spin_unlock(&authtab_lock);
> + rcu_assign_pointer(authtab[flavor], NULL);
> }
> EXPORT_SYMBOL_GPL(svc_auth_unregister);
>
> --
> 2.17.1

2018-10-03 22:57:11

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 03/15] SUNRPC: Allow cache lookups to use RCU protection rather than the r/w spinlock

On Mon, Oct 01, 2018 at 10:41:45AM -0400, Trond Myklebust wrote:
> Instead of the reader/writer spinlock, allow cache lookups to use RCU
> for looking up entries. This is more efficient since modifications can
> occur while other entries are being looked up.
>
> Note that for now, we keep the reader/writer spinlock until all users
> have been converted to use RCU-safe freeing of their cache entries.

This one's a bit easier for me to review if I separate out the find/add
split from the interesting part.

Looks good to me, applying as follows:

--b.

commit b92a8fababa9
Author: Trond Myklebust <[email protected]>
Date: Mon Oct 1 10:41:45 2018 -0400

SUNRPC: Refactor sunrpc_cache_lookup

This is a trivial split into lookup and insert functions, no change in
behavior.

Signed-off-by: Trond Myklebust <[email protected]>
Signed-off-by: J. Bruce Fields <[email protected]>

diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 109fbe591e7b..aa8e62d61f4d 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -54,13 +54,11 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail)
h->last_refresh = now;
}

-struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
- struct cache_head *key, int hash)
+static struct cache_head *sunrpc_cache_find(struct cache_detail *detail,
+ struct cache_head *key, int hash)
{
- struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL;
- struct hlist_head *head;
-
- head = &detail->hash_table[hash];
+ struct hlist_head *head = &detail->hash_table[hash];
+ struct cache_head *tmp;

read_lock(&detail->hash_lock);

@@ -75,7 +73,15 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
}
}
read_unlock(&detail->hash_lock);
- /* Didn't find anything, insert an empty entry */
+ return NULL;
+}
+
+static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
+ struct cache_head *key,
+ int hash)
+{
+ struct cache_head *new, *tmp, *freeme = NULL;
+ struct hlist_head *head = &detail->hash_table[hash];

new = detail->alloc();
if (!new)
@@ -114,8 +120,19 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
cache_put(freeme, detail);
return new;
}
-EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);

+struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
+ struct cache_head *key, int hash)
+{
+ struct cache_head *ret;
+
+ ret = sunrpc_cache_find(detail, key, hash);
+ if (ret)
+ return ret;
+ /* Didn't find anything, insert an empty entry */
+ return sunrpc_cache_add_entry(detail, key, hash);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);

static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);

commit b0e4dad04223
Author: J. Bruce Fields <[email protected]>
Date: Wed Oct 3 12:01:22 2018 -0400

SUNRPC: Allow cache lookups to use RCU protection rather than the r/w spinlock

Instead of the reader/writer spinlock, allow cache lookups to use RCU
for looking up entries. This is more efficient since modifications can
occur while other entries are being looked up.

Note that for now, we keep the reader/writer spinlock until all users
have been converted to use RCU-safe freeing of their cache entries.

Signed-off-by: Trond Myklebust <[email protected]>
Signed-off-by: J. Bruce Fields <[email protected]>

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 40d2822f0e2f..cf3e17ee2786 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -167,6 +167,9 @@ extern const struct file_operations cache_file_operations_pipefs;
extern const struct file_operations content_file_operations_pipefs;
extern const struct file_operations cache_flush_operations_pipefs;

+extern struct cache_head *
+sunrpc_cache_lookup_rcu(struct cache_detail *detail,
+ struct cache_head *key, int hash);
extern struct cache_head *
sunrpc_cache_lookup(struct cache_detail *detail,
struct cache_head *key, int hash);
@@ -186,6 +189,12 @@ static inline struct cache_head *cache_get(struct cache_head *h)
return h;
}

+static inline struct cache_head *cache_get_rcu(struct cache_head *h)
+{
+ if (kref_get_unless_zero(&h->ref))
+ return h;
+ return NULL;
+}

static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
{
@@ -227,6 +236,9 @@ extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);
extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
extern void *cache_seq_next(struct seq_file *file, void *p, loff_t *pos);
extern void cache_seq_stop(struct seq_file *file, void *p);
+extern void *cache_seq_start_rcu(struct seq_file *file, loff_t *pos);
+extern void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos);
+extern void cache_seq_stop_rcu(struct seq_file *file, void *p);

extern void qword_add(char **bpp, int *lp, char *str);
extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index aa8e62d61f4d..7593afed9036 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -54,6 +54,27 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail)
h->last_refresh = now;
}

+static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail,
+ struct cache_head *key,
+ int hash)
+{
+ struct hlist_head *head = &detail->hash_table[hash];
+ struct cache_head *tmp;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp, head, cache_list) {
+ if (detail->match(tmp, key)) {
+ if (cache_is_expired(detail, tmp))
+ continue;
+ tmp = cache_get_rcu(tmp);
+ rcu_read_unlock();
+ return tmp;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
static struct cache_head *sunrpc_cache_find(struct cache_detail *detail,
struct cache_head *key, int hash)
{
@@ -61,7 +82,6 @@ static struct cache_head *sunrpc_cache_find(struct cache_detail *detail,
struct cache_head *tmp;

read_lock(&detail->hash_lock);
-
hlist_for_each_entry(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp))
@@ -96,10 +116,10 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
write_lock(&detail->hash_lock);

/* check if entry appeared while we slept */
- hlist_for_each_entry(tmp, head, cache_list) {
+ hlist_for_each_entry_rcu(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp)) {
- hlist_del_init(&tmp->cache_list);
+ hlist_del_init_rcu(&tmp->cache_list);
detail->entries --;
freeme = tmp;
break;
@@ -111,7 +131,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
}
}

- hlist_add_head(&new->cache_list, head);
+ hlist_add_head_rcu(&new->cache_list, head);
detail->entries++;
cache_get(new);
write_unlock(&detail->hash_lock);
@@ -121,6 +141,19 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
return new;
}

+struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail,
+ struct cache_head *key, int hash)
+{
+ struct cache_head *ret;
+
+ ret = sunrpc_cache_find_rcu(detail, key, hash);
+ if (ret)
+ return ret;
+ /* Didn't find anything, insert an empty entry */
+ return sunrpc_cache_add_entry(detail, key, hash);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu);
+
struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
struct cache_head *key, int hash)
{
@@ -134,6 +167,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
}
EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);

+
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);

static void cache_fresh_locked(struct cache_head *head, time_t expiry,
@@ -450,7 +484,7 @@ static int cache_clean(void)
if (!cache_is_expired(current_detail, ch))
continue;

- hlist_del_init(&ch->cache_list);
+ hlist_del_init_rcu(&ch->cache_list);
current_detail->entries--;
rv = 1;
break;
@@ -521,7 +555,7 @@ void cache_purge(struct cache_detail *detail)
for (i = 0; i < detail->hash_size; i++) {
head = &detail->hash_table[i];
hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
- hlist_del_init(&ch->cache_list);
+ hlist_del_init_rcu(&ch->cache_list);
detail->entries--;

set_bit(CACHE_CLEANED, &ch->flags);
@@ -1306,21 +1340,19 @@ EXPORT_SYMBOL_GPL(qword_get);
* get a header, then pass each real item in the cache
*/

-void *cache_seq_start(struct seq_file *m, loff_t *pos)
- __acquires(cd->hash_lock)
+static void *__cache_seq_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
unsigned int hash, entry;
struct cache_head *ch;
struct cache_detail *cd = m->private;

- read_lock(&cd->hash_lock);
if (!n--)
return SEQ_START_TOKEN;
hash = n >> 32;
entry = n & ((1LL<<32) - 1);

- hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list)
+ hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list)
if (!entry--)
return ch;
n &= ~((1LL<<32) - 1);
@@ -1332,9 +1364,19 @@ void *cache_seq_start(struct seq_file *m, loff_t *pos)
if (hash >= cd->hash_size)
return NULL;
*pos = n+1;
- return hlist_entry_safe(cd->hash_table[hash].first,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_first_rcu(&cd->hash_table[hash])),
struct cache_head, cache_list);
}
+
+void *cache_seq_start(struct seq_file *m, loff_t *pos)
+ __acquires(cd->hash_lock)
+{
+ struct cache_detail *cd = m->private;
+
+ read_lock(&cd->hash_lock);
+ return __cache_seq_start(m, pos);
+}
EXPORT_SYMBOL_GPL(cache_seq_start);

void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
@@ -1350,7 +1392,8 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
*pos += 1LL<<32;
} else {
++*pos;
- return hlist_entry_safe(ch->cache_list.next,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_next_rcu(&ch->cache_list)),
struct cache_head, cache_list);
}
*pos &= ~((1LL<<32) - 1);
@@ -1362,7 +1405,8 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
if (hash >= cd->hash_size)
return NULL;
++*pos;
- return hlist_entry_safe(cd->hash_table[hash].first,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_first_rcu(&cd->hash_table[hash])),
struct cache_head, cache_list);
}
EXPORT_SYMBOL_GPL(cache_seq_next);
@@ -1375,6 +1419,27 @@ void cache_seq_stop(struct seq_file *m, void *p)
}
EXPORT_SYMBOL_GPL(cache_seq_stop);

+void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return __cache_seq_start(m, pos);
+}
+EXPORT_SYMBOL_GPL(cache_seq_start_rcu);
+
+void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos)
+{
+ return cache_seq_next(file, p, pos);
+}
+EXPORT_SYMBOL_GPL(cache_seq_next_rcu);
+
+void cache_seq_stop_rcu(struct seq_file *m, void *p)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(cache_seq_stop_rcu);
+
static int c_show(struct seq_file *m, void *p)
{
struct cache_head *cp = p;
@@ -1863,7 +1928,7 @@ void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
{
write_lock(&cd->hash_lock);
if (!hlist_unhashed(&h->cache_list)){
- hlist_del_init(&h->cache_list);
+ hlist_del_init_rcu(&h->cache_list);
cd->entries--;
write_unlock(&cd->hash_lock);
cache_put(h, cd);
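
The reader side these hunks introduce boils down to the following
sketch (hypothetical entry type and fields; hlist_for_each_entry_rcu()
and kref_get_unless_zero() are the real API):

#include <linux/kref.h>
#include <linux/rculist.h>

struct mini_ent {
	struct hlist_node	hash_link;
	struct kref		ref;
	int			key;
};

/* Lockless lookup: walk the RCU-protected chain, and only take a
 * reference if the entry's refcount has not already hit zero. */
static struct mini_ent *mini_find_rcu(struct hlist_head *head, int key)
{
	struct mini_ent *tmp;

	rcu_read_lock();
	hlist_for_each_entry_rcu(tmp, head, hash_link) {
		if (tmp->key != key)
			continue;
		if (!kref_get_unless_zero(&tmp->ref))
			continue;	/* racing with the final put */
		rcu_read_unlock();
		return tmp;
	}
	rcu_read_unlock();
	return NULL;
}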

2018-10-03 22:59:57

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 08/15] NFS: Lockless DNS lookups

The individual conversions of these caches look straightforward to me.
Thanks for doing this in these small steps. I'm assuming you're OK with
these nfs/ changes going through the nfsd tree.

--b.

On Mon, Oct 01, 2018 at 10:41:50AM -0400, Trond Myklebust wrote:
> Enable RCU protected lookup in the legacy DNS resolver.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/dns_resolve.c | 15 ++++++++++++---
> 1 file changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
> index 060c658eab66..e93a5dc07c8c 100644
> --- a/fs/nfs/dns_resolve.c
> +++ b/fs/nfs/dns_resolve.c
> @@ -65,6 +65,7 @@ struct nfs_dns_ent {
>
> struct sockaddr_storage addr;
> size_t addrlen;
> + struct rcu_head rcu_head;
> };
>
>
> @@ -101,15 +102,23 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
> }
> }
>
> -static void nfs_dns_ent_put(struct kref *ref)
> +static void nfs_dns_ent_free_rcu(struct rcu_head *head)
> {
> struct nfs_dns_ent *item;
>
> - item = container_of(ref, struct nfs_dns_ent, h.ref);
> + item = container_of(head, struct nfs_dns_ent, rcu_head);
> kfree(item->hostname);
> kfree(item);
> }
>
> +static void nfs_dns_ent_put(struct kref *ref)
> +{
> + struct nfs_dns_ent *item;
> +
> + item = container_of(ref, struct nfs_dns_ent, h.ref);
> + call_rcu(item, nfs_dns_ent_free_rcu);
> +}
> +
> static struct cache_head *nfs_dns_ent_alloc(void)
> {
> struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
> @@ -195,7 +204,7 @@ static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
> {
> struct cache_head *ch;
>
> - ch = sunrpc_cache_lookup(cd,
> + ch = sunrpc_cache_lookup_rcu(cd,
> &key->h,
> nfs_dns_hash(key));
> if (!ch)
> --
> 2.17.1
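
The kref-to-call_rcu conversion above is the usual deferred-free idiom;
reduced to a sketch with a hypothetical entry type:

#include <linux/kref.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct mini_ent {
	struct kref	ref;
	char		*hostname;
	struct rcu_head	rcu_head;
};

static void mini_ent_free_rcu(struct rcu_head *head)
{
	struct mini_ent *item = container_of(head, struct mini_ent, rcu_head);

	kfree(item->hostname);
	kfree(item);
}

static void mini_ent_put(struct kref *ref)
{
	struct mini_ent *item = container_of(ref, struct mini_ent, ref);

	/* Last reference gone: defer the free until every lockless
	 * reader currently inside rcu_read_lock() has moved on. */
	call_rcu(&item->rcu_head, mini_ent_free_rcu);
}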

2018-10-04 00:03:42

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 13/15] knfsd: Simplify NFS duplicate replay cache

On Mon, Oct 01, 2018 at 10:41:55AM -0400, Trond Myklebust wrote:
> Simplify the duplicate replay cache by initialising the preallocated
> cache entry, so that we can use it as a key for the cache lookup.
>
> Note that the 99.999% case we want to optimise for is still the one
> where the lookup fails, and we have to add this entry to the cache,
> so preinitialising should not cause a performance penalty.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfsd/cache.h | 6 +--
> fs/nfsd/nfscache.c | 94 ++++++++++++++++++++++------------------------
> 2 files changed, 47 insertions(+), 53 deletions(-)
>
> diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
> index b7559c6f2b97..bd77d5f6fe53 100644
> --- a/fs/nfsd/cache.h
> +++ b/fs/nfsd/cache.h
> @@ -24,13 +24,13 @@ struct svc_cacherep {
> unsigned char c_state, /* unused, inprog, done */
> c_type, /* status, buffer */
> c_secure : 1; /* req came from port < 1024 */
> - struct sockaddr_in6 c_addr;
> __be32 c_xid;
> - u32 c_prot;
> + __wsum c_csum;
> u32 c_proc;
> + u32 c_prot;
> u32 c_vers;
> unsigned int c_len;
> - __wsum c_csum;
> + struct sockaddr_in6 c_addr;
> unsigned long c_timestamp;
> union {
> struct kvec u_vec;

Unless I've missed something subtle--I'll move this chunk into the next
patch.--b.

> diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
> index cef4686f87ef..527ce4c65765 100644
> --- a/fs/nfsd/nfscache.c
> +++ b/fs/nfsd/nfscache.c
> @@ -121,7 +121,7 @@ nfsd_cache_hash(__be32 xid)
> }
>
> static struct svc_cacherep *
> -nfsd_reply_cache_alloc(void)
> +nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
> {
> struct svc_cacherep *rp;
>
> @@ -130,6 +130,16 @@ nfsd_reply_cache_alloc(void)
> rp->c_state = RC_UNUSED;
> rp->c_type = RC_NOCACHE;
> INIT_LIST_HEAD(&rp->c_lru);
> +
> + rp->c_xid = rqstp->rq_xid;
> + rp->c_proc = rqstp->rq_proc;
> + memset(&rp->c_addr, 0, sizeof(rp->c_addr));
> + rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
> + rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
> + rp->c_prot = rqstp->rq_prot;
> + rp->c_vers = rqstp->rq_vers;
> + rp->c_len = rqstp->rq_arg.len;
> + rp->c_csum = csum;
> }
> return rp;
> }
> @@ -141,9 +151,11 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
> drc_mem_usage -= rp->c_replvec.iov_len;
> kfree(rp->c_replvec.iov_base);
> }
> - list_del(&rp->c_lru);
> - atomic_dec(&num_drc_entries);
> - drc_mem_usage -= sizeof(*rp);
> + if (rp->c_state != RC_UNUSED) {
> + list_del(&rp->c_lru);
> + atomic_dec(&num_drc_entries);
> + drc_mem_usage -= sizeof(*rp);
> + }
> kmem_cache_free(drc_slab, rp);
> }
>
> @@ -319,24 +331,23 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
> }
>
> static bool
> -nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
> +nfsd_cache_match(const struct svc_cacherep *key, const struct svc_cacherep *rp)
> {
> /* Check RPC XID first */
> - if (rqstp->rq_xid != rp->c_xid)
> + if (key->c_xid != rp->c_xid)
> return false;
> /* compare checksum of NFS data */
> - if (csum != rp->c_csum) {
> + if (key->c_csum != rp->c_csum) {
> ++payload_misses;
> return false;
> }
>
> /* Other discriminators */
> - if (rqstp->rq_proc != rp->c_proc ||
> - rqstp->rq_prot != rp->c_prot ||
> - rqstp->rq_vers != rp->c_vers ||
> - rqstp->rq_arg.len != rp->c_len ||
> - !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
> - rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
> + if (key->c_proc != rp->c_proc ||
> + key->c_prot != rp->c_prot ||
> + key->c_vers != rp->c_vers ||
> + key->c_len != rp->c_len ||
> + memcmp(&key->c_addr, &rp->c_addr, sizeof(key->c_addr)) != 0)
> return false;
>
> return true;
> @@ -345,19 +356,18 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
> /*
> * Search the request hash for an entry that matches the given rqstp.
> * Must be called with cache_lock held. Returns the found entry or
> - * NULL on failure.
> + * inserts an empty key on failure.
> */
> static struct svc_cacherep *
> -nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
> - __wsum csum)
> +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key)
> {
> - struct svc_cacherep *rp, *ret = NULL;
> + struct svc_cacherep *rp, *ret = key;
> struct list_head *rh = &b->lru_head;
> unsigned int entries = 0;
>
> list_for_each_entry(rp, rh, c_lru) {
> ++entries;
> - if (nfsd_cache_match(rqstp, csum, rp)) {
> + if (nfsd_cache_match(key, rp)) {
> ret = rp;
> break;
> }
> @@ -374,6 +384,7 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
> atomic_read(&num_drc_entries));
> }
>
> + lru_put_end(b, ret);
> return ret;
> }
>
> @@ -389,9 +400,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
> {
> struct svc_cacherep *rp, *found;
> __be32 xid = rqstp->rq_xid;
> - u32 proto = rqstp->rq_prot,
> - vers = rqstp->rq_vers,
> - proc = rqstp->rq_proc;
> __wsum csum;
> u32 hash = nfsd_cache_hash(xid);
> struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
> @@ -410,52 +418,38 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
> * Since the common case is a cache miss followed by an insert,
> * preallocate an entry.
> */
> - rp = nfsd_reply_cache_alloc();
> - spin_lock(&b->cache_lock);
> - if (likely(rp)) {
> - atomic_inc(&num_drc_entries);
> - drc_mem_usage += sizeof(*rp);
> + rp = nfsd_reply_cache_alloc(rqstp, csum);
> + if (!rp) {
> + dprintk("nfsd: unable to allocate DRC entry!\n");
> + return rtn;
> }
>
> - /* go ahead and prune the cache */
> - prune_bucket(b);
> -
> - found = nfsd_cache_search(b, rqstp, csum);
> - if (found) {
> - if (likely(rp))
> - nfsd_reply_cache_free_locked(rp);
> + spin_lock(&b->cache_lock);
> + found = nfsd_cache_insert(b, rp);
> + if (found != rp) {
> + nfsd_reply_cache_free_locked(rp);
> rp = found;
> goto found_entry;
> }
>
> - if (!rp) {
> - dprintk("nfsd: unable to allocate DRC entry!\n");
> - goto out;
> - }
> -
> nfsdstats.rcmisses++;
> rqstp->rq_cacherep = rp;
> rp->c_state = RC_INPROG;
> - rp->c_xid = xid;
> - rp->c_proc = proc;
> - rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
> - rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
> - rp->c_prot = proto;
> - rp->c_vers = vers;
> - rp->c_len = rqstp->rq_arg.len;
> - rp->c_csum = csum;
>
> - lru_put_end(b, rp);
> + atomic_inc(&num_drc_entries);
> + drc_mem_usage += sizeof(*rp);
> +
> + /* go ahead and prune the cache */
> + prune_bucket(b);
> out:
> spin_unlock(&b->cache_lock);
> return rtn;
>
> found_entry:
> - nfsdstats.rchits++;
> /* We found a matching entry which is either in progress or done. */
> - lru_put_end(b, rp);
> -
> + nfsdstats.rchits++;
> rtn = RC_DROPIT;
> +
> /* Request being processed */
> if (rp->c_state == RC_INPROG)
> goto out;
> --
> 2.17.1

2018-10-04 00:44:56

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH 08/15] NFS: Lockless DNS lookups

On Wed, 2018-10-03 at 12:10 -0400, J . Bruce Fields wrote:
> The individual conversions of these caches look straightforward to
> me.
> Thanks for doing this in these small steps. I'm assuming you're OK
> with
> these nfs/ changes going through the nfsd tree.
>

Yes please. With the bulk of the changes being nfsd specific, that
makes more sense to me.

> --b.
>
> On Mon, Oct 01, 2018 at 10:41:50AM -0400, Trond Myklebust wrote:
> > Enable RCU protected lookup in the legacy DNS resolver.
> >
> > Signed-off-by: Trond Myklebust <[email protected]>
> > ---
> > fs/nfs/dns_resolve.c | 15 ++++++++++++---
> > 1 file changed, 12 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
> > index 060c658eab66..e93a5dc07c8c 100644
> > --- a/fs/nfs/dns_resolve.c
> > +++ b/fs/nfs/dns_resolve.c
> > @@ -65,6 +65,7 @@ struct nfs_dns_ent {
> >
> > struct sockaddr_storage addr;
> > size_t addrlen;
> > + struct rcu_head rcu_head;
> > };
> >
> >
> > @@ -101,15 +102,23 @@ static void nfs_dns_ent_init(struct
> > cache_head *cnew,
> > }
> > }
> >
> > -static void nfs_dns_ent_put(struct kref *ref)
> > +static void nfs_dns_ent_free_rcu(struct rcu_head *head)
> > {
> > struct nfs_dns_ent *item;
> >
> > - item = container_of(ref, struct nfs_dns_ent, h.ref);
> > + item = container_of(head, struct nfs_dns_ent, rcu_head);
> > kfree(item->hostname);
> > kfree(item);
> > }
> >
> > +static void nfs_dns_ent_put(struct kref *ref)
> > +{
> > + struct nfs_dns_ent *item;
> > +
> > + item = container_of(ref, struct nfs_dns_ent, h.ref);
> > + call_rcu(item, nfs_dns_ent_free_rcu);
> > +}
> > +
> > static struct cache_head *nfs_dns_ent_alloc(void)
> > {
> > struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
> > @@ -195,7 +204,7 @@ static struct nfs_dns_ent
> > *nfs_dns_lookup(struct cache_detail *cd,
> > {
> > struct cache_head *ch;
> >
> > - ch = sunrpc_cache_lookup(cd,
> > + ch = sunrpc_cache_lookup_rcu(cd,
> > &key->h,
> > nfs_dns_hash(key));
> > if (!ch)
> > --
> > 2.17.1

2018-10-04 00:51:46

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH 13/15] knfsd: Simplify NFS duplicate replay cache

On Wed, 2018-10-03 at 13:14 -0400, J . Bruce Fields wrote:
> On Mon, Oct 01, 2018 at 10:41:55AM -0400, Trond Myklebust wrote:
> > [...]
>
> Unless I've missed something subtle--I'll move this chunk into the
> next patch.--b.

Nothing too subtle. The only reason for keeping it in this patch is
because even with the current code, most of the comparisons hit the
c_xid and possibly sometimes the c_csum, so those are the main fields
that you want to try to keep in the same cache line.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com

2018-10-04 01:01:19

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 13/15] knfsd: Simplify NFS duplicate replay cache

On Wed, Oct 03, 2018 at 06:01:36PM +0000, Trond Myklebust wrote:
> On Wed, 2018-10-03 at 13:14 -0400, J . Bruce Fields wrote:
> > On Mon, Oct 01, 2018 at 10:41:55AM -0400, Trond Myklebust wrote:
> > > Simplify the duplicate replay cache by initialising the
> > > preallocated
> > > cache entry, so that we can use it as a key for the cache lookup.
> > >
> > > Note that the 99.999% case we want to optimise for is still the one
> > > where the lookup fails, and we have to add this entry to the cache,
> > > so preinitialising should not cause a performance penalty.
> > >
> > > Signed-off-by: Trond Myklebust <[email protected]>
> > > ---
> > > fs/nfsd/cache.h | 6 +--
> > > fs/nfsd/nfscache.c | 94 ++++++++++++++++++++++------------------
> > > ------
> > > 2 files changed, 47 insertions(+), 53 deletions(-)
> > >
> > > diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
> > > index b7559c6f2b97..bd77d5f6fe53 100644
> > > --- a/fs/nfsd/cache.h
> > > +++ b/fs/nfsd/cache.h
> > > @@ -24,13 +24,13 @@ struct svc_cacherep {
> > > unsigned char c_state, /* unused, inprog,
> > > done */
> > > c_type, /* status, buffer
> > > */
> > > c_secure : 1; /* req came from
> > > port < 1024 */
> > > - struct sockaddr_in6 c_addr;
> > > __be32 c_xid;
> > > - u32 c_prot;
> > > + __wsum c_csum;
> > > u32 c_proc;
> > > + u32 c_prot;
> > > u32 c_vers;
> > > unsigned int c_len;
> > > - __wsum c_csum;
> > > + struct sockaddr_in6 c_addr;
> > > unsigned long c_timestamp;
> > > union {
> > > struct kvec u_vec;
> >
> > Unless I've missed something subtle--I'll move this chunk into the
> > next
> > patch.--b.
>
> Nothing too subtle. The only reason for keeping it in this patch is
> because even with the current code, most of the comparisons hit the
> c_xid and possibly sometimes the c_csum, so those are the main fields
> that you want to try to keep in the same cache line.

That could use a comment.

--b.

2018-10-04 06:42:28

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 13/15] knfsd: Simplify NFS duplicate replay cache

On Wed, Oct 03, 2018 at 02:11:50PM -0400, [email protected] wrote:
> On Wed, Oct 03, 2018 at 06:01:36PM +0000, Trond Myklebust wrote:
> > On Wed, 2018-10-03 at 13:14 -0400, J . Bruce Fields wrote:
> > > On Mon, Oct 01, 2018 at 10:41:55AM -0400, Trond Myklebust wrote:
> > > > Simplify the duplicate replay cache by initialising the
> > > > preallocated
> > > > cache entry, so that we can use it as a key for the cache lookup.
> > > >
> > > > Note that the 99.999% case we want to optimise for is still the one
> > > > where the lookup fails, and we have to add this entry to the cache,
> > > > so preinitialising should not cause a performance penalty.
> > > >
> > > > Signed-off-by: Trond Myklebust <[email protected]>
> > > > ---
> > > > fs/nfsd/cache.h | 6 +--
> > > > fs/nfsd/nfscache.c | 94 ++++++++++++++++++++++------------------
> > > > ------
> > > > 2 files changed, 47 insertions(+), 53 deletions(-)
> > > >
> > > > diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
> > > > index b7559c6f2b97..bd77d5f6fe53 100644
> > > > --- a/fs/nfsd/cache.h
> > > > +++ b/fs/nfsd/cache.h
> > > > @@ -24,13 +24,13 @@ struct svc_cacherep {
> > > > unsigned char c_state, /* unused, inprog,
> > > > done */
> > > > c_type, /* status, buffer
> > > > */
> > > > c_secure : 1; /* req came from
> > > > port < 1024 */
> > > > - struct sockaddr_in6 c_addr;
> > > > __be32 c_xid;
> > > > - u32 c_prot;
> > > > + __wsum c_csum;
> > > > u32 c_proc;
> > > > + u32 c_prot;
> > > > u32 c_vers;
> > > > unsigned int c_len;
> > > > - __wsum c_csum;
> > > > + struct sockaddr_in6 c_addr;
> > > > unsigned long c_timestamp;
> > > > union {
> > > > struct kvec u_vec;
> > >
> > > Unless I've missed something subtle--I'll move this chunk into the
> > > next
> > > patch.--b.
> >
> > Nothing too subtle. The only reason for keeping it in this patch is
> > because even with the current code, most of the comparisons hit the
> > c_xid and possibly sometimes the c_csum, so those are the main fields
> > that you want to try to keep in the same cache line.
>
> That could use a comment.

I'm adding just

struct svc_cacherep {
struct {
+ /* Keep often-read xid, csum in the same cache line: */
__be32 k_xid;
__wsum k_csum;
u32 k_proc;

--b.

2018-10-04 07:35:18

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 15/15] knfsd: Improve lookup performance in the duplicate reply cache using an rbtree

On Mon, Oct 01, 2018 at 10:41:57AM -0400, Trond Myklebust wrote:
> Use an rbtree to ensure the lookup/insert of an entry in a DRC bucket is
> O(log(N)).

So the naming on those hash chain statistics is off now, but I guess
that's harmless.

All looks good to me, thanks--applying.

--b.
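
For reference, the walk-and-link idiom the patch below relies on,
reduced to a sketch with a plain integer key (rb_entry(),
rb_link_node() and rb_insert_color() are the real API; the rest is
made up):

#include <linux/rbtree.h>

struct mini_node {
	struct rb_node	node;
	int		key;
};

/* Return an existing node matching @new's key, or link @new into the
 * tree and return @new itself. The caller frees @new on a hit. */
static struct mini_node *mini_insert(struct rb_root *root,
				     struct mini_node *new)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct mini_node *cur;

	while (*p != NULL) {
		parent = *p;
		cur = rb_entry(parent, struct mini_node, node);
		if (new->key < cur->key)
			p = &parent->rb_left;
		else if (new->key > cur->key)
			p = &parent->rb_right;
		else
			return cur;
	}
	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return new;
}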

>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfsd/cache.h | 1 +
> fs/nfsd/nfscache.c | 37 ++++++++++++++++++++++++++-----------
> 2 files changed, 27 insertions(+), 11 deletions(-)
>
> diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
> index cbcdc15753b7..ef1c8aa89ca4 100644
> --- a/fs/nfsd/cache.h
> +++ b/fs/nfsd/cache.h
> @@ -29,6 +29,7 @@ struct svc_cacherep {
> struct sockaddr_in6 k_addr;
> } c_key;
>
> + struct rb_node c_node;
> struct list_head c_lru;
> unsigned char c_state, /* unused, inprog, done */
> c_type, /* status, buffer */
> diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
> index 230cc83921ad..e2fe0e9ce0df 100644
> --- a/fs/nfsd/nfscache.c
> +++ b/fs/nfsd/nfscache.c
> @@ -30,6 +30,7 @@
> #define TARGET_BUCKET_SIZE 64
>
> struct nfsd_drc_bucket {
> + struct rb_root rb_head;
> struct list_head lru_head;
> spinlock_t cache_lock;
> };
> @@ -129,6 +130,7 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
> if (rp) {
> rp->c_state = RC_UNUSED;
> rp->c_type = RC_NOCACHE;
> + RB_CLEAR_NODE(&rp->c_node);
> INIT_LIST_HEAD(&rp->c_lru);
>
> memset(&rp->c_key, 0, sizeof(rp->c_key));
> @@ -145,13 +147,14 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
> }
>
> static void
> -nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
> +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
> {
> if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
> drc_mem_usage -= rp->c_replvec.iov_len;
> kfree(rp->c_replvec.iov_base);
> }
> if (rp->c_state != RC_UNUSED) {
> + rb_erase(&rp->c_node, &b->rb_head);
> list_del(&rp->c_lru);
> atomic_dec(&num_drc_entries);
> drc_mem_usage -= sizeof(*rp);
> @@ -163,7 +166,7 @@ static void
> nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
> {
> spin_lock(&b->cache_lock);
> - nfsd_reply_cache_free_locked(rp);
> + nfsd_reply_cache_free_locked(b, rp);
> spin_unlock(&b->cache_lock);
> }
>
> @@ -219,7 +222,7 @@ void nfsd_reply_cache_shutdown(void)
> struct list_head *head = &drc_hashtbl[i].lru_head;
> while (!list_empty(head)) {
> rp = list_first_entry(head, struct svc_cacherep, c_lru);
> - nfsd_reply_cache_free_locked(rp);
> + nfsd_reply_cache_free_locked(&drc_hashtbl[i], rp);
> }
> }
>
> @@ -258,7 +261,7 @@ prune_bucket(struct nfsd_drc_bucket *b)
> if (atomic_read(&num_drc_entries) <= max_drc_entries &&
> time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
> break;
> - nfsd_reply_cache_free_locked(rp);
> + nfsd_reply_cache_free_locked(b, rp);
> freed++;
> }
> return freed;
> @@ -349,17 +352,29 @@ static struct svc_cacherep *
> nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key)
> {
> struct svc_cacherep *rp, *ret = key;
> - struct list_head *rh = &b->lru_head;
> + struct rb_node **p = &b->rb_head.rb_node,
> + *parent = NULL;
> unsigned int entries = 0;
> + int cmp;
>
> - list_for_each_entry(rp, rh, c_lru) {
> + while (*p != NULL) {
> ++entries;
> - if (nfsd_cache_key_cmp(key, rp) == 0) {
> + parent = *p;
> + rp = rb_entry(parent, struct svc_cacherep, c_node);
> +
> + cmp = nfsd_cache_key_cmp(key, rp);
> + if (cmp < 0)
> + p = &parent->rb_left;
> + else if (cmp > 0)
> + p = &parent->rb_right;
> + else {
> ret = rp;
> - break;
> + goto out;
> }
> }
> -
> + rb_link_node(&key->c_node, parent, p);
> + rb_insert_color(&key->c_node, &b->rb_head);
> +out:
> /* tally hash chain length stats */
> if (entries > longest_chain) {
> longest_chain = entries;
> @@ -414,7 +429,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
> spin_lock(&b->cache_lock);
> found = nfsd_cache_insert(b, rp);
> if (found != rp) {
> - nfsd_reply_cache_free_locked(rp);
> + nfsd_reply_cache_free_locked(NULL, rp);
> rp = found;
> goto found_entry;
> }
> @@ -462,7 +477,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
> break;
> default:
> printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
> - nfsd_reply_cache_free_locked(rp);
> + nfsd_reply_cache_free_locked(b, rp);
> }
>
> goto out;
> --
> 2.17.1