LinuxLists.cc - [PATCH v3 0/8] nfsd: support for lifting grace period early

2014-09-08 14:47:02

Subject: [PATCH v3 0/8] nfsd: support for lifting grace period early

v3:
- only accept Y/y/1 in new procfile writes
- turn minorversion env var into a "reclaim complete" boolean
- serialize nfsdcltrack upcalls for a client
- reduce duplicate upcalls via NFSD4_CLIENT_STABLE flag
- don't allow reclaims after RECLAIM_COMPLETE

v2:
- move grace period handling into its own module

Here's a respin of the patches to reduce the grace period. A respun
version of the nfs-utils patches will be posted separately (needed due
to the minorversion to reclaim complete boolean change). I also threw
in some other patches that clean up the upcall handling a bit and
that prevent the client from being able to reclaim after sending a
RECLAIM_COMPLETE.

Original cover letter follows:

One of the huge annoyances in dealing with knfsd is the 90s grace period
that's imposed when the server reboots. This is not just an annoyance,
but means a siginificant amount of "downtime" in many production
environments.

This patchset aimed at reducing this pain. It adds a couple of /proc
knobs that tell the lockd and nfsd lock managers to lift the grace
period.

It also changes the UMH upcalls to pass a little bit of extra info in
the form of environment variables so that the upcall program can
determine whether there are still any clients that may be in the process
of reclaiming.

There are also a couple of cleanup patches in here that are not strictly
required. In particular, making a separate grace.ko module doesn't have
to be done, but I think it's a good idea.

Jeff Layton (8):
lockd: move lockd's grace period handling into its own module
nfsd: remove redundant boot_time parm from grace_done client tracking
op
lockd: add a /proc/fs/lockd/nlm_end_grace file
nfsd: add a v4_end_grace file to /proc/fs/nfsd
nfsd: pass extra info in env vars to upcalls to allow for early grace
period end
nfsd: serialize nfsdcltrack upcalls for a particular client
nfsd: set and test NFSD4_CLIENT_STABLE bit to reduce nfsdcltrack
upcalls
nfsd: reject reclaim request when client has already sent
RECLAIM_COMPLETE

fs/Kconfig | 6 +-
fs/lockd/Makefile | 3 +-
fs/lockd/grace.c | 65 ---------------------
fs/lockd/netns.h | 1 -
fs/lockd/procfs.c | 92 +++++++++++++++++++++++++++++
fs/lockd/procfs.h | 28 +++++++++
fs/lockd/svc.c | 10 +++-
fs/nfs_common/Makefile | 3 +-
fs/nfs_common/grace.c | 113 ++++++++++++++++++++++++++++++++++++
fs/nfsd/Kconfig | 1 +
fs/nfsd/nfs4recover.c | 150 +++++++++++++++++++++++++++++++++++++++++-------
fs/nfsd/nfs4state.c | 11 ++--
fs/nfsd/nfsctl.c | 45 +++++++++++++++
fs/nfsd/state.h | 6 +-
include/linux/proc_fs.h | 2 +
15 files changed, 440 insertions(+), 96 deletions(-)
delete mode 100644 fs/lockd/grace.c
create mode 100644 fs/lockd/procfs.c
create mode 100644 fs/lockd/procfs.h
create mode 100644 fs/nfs_common/grace.c

--
1.9.3

2014-09-08 14:47:03

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 3/8] lockd: add a /proc/fs/lockd/nlm_end_grace file

Add a new procfile that will allow a (privileged) userland process to
end the NLM grace period early. The basic idea here will be to have
sm-notify write to this file, if it sent out no NOTIFY requests when
it runs. In that situation, we can generally expect that there will be
no reclaim requests so the grace period can be lifted early.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/lockd/Makefile | 1 +
fs/lockd/procfs.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/lockd/procfs.h | 28 +++++++++++++++++
fs/lockd/svc.c | 9 ++++++
4 files changed, 130 insertions(+)
create mode 100644 fs/lockd/procfs.c
create mode 100644 fs/lockd/procfs.h

diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 6a0b351ce30e..9b320cc2a8cf 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_LOCKD) += lockd.o
lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
svcshare.o svcproc.o svcsubs.o mon.o xdr.o
lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
+lockd-objs-$(CONFIG_PROC_FS) += procfs.o
lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
new file mode 100644
index 000000000000..2a0a98480e39
--- /dev/null
+++ b/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <[email protected]>
+ */
+
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+
+#include "netns.h"
+#include "procfs.h"
+
+/*
+ * We only allow strings that start with 'Y', 'y', or '1'.
+ */
+static ssize_t
+nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
+ loff_t *pos)
+{
+ char *data;
+ struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+ lockd_net_id);
+
+ if (size < 1)
+ return -EINVAL;
+
+ data = simple_transaction_get(file, buf, size);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ switch(data[0]) {
+ case 'Y':
+ case 'y':
+ case '1':
+ locks_end_grace(&ln->lockd_manager);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return size;
+}
+
+static ssize_t
+nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
+ loff_t *pos)
+{
+ struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+ lockd_net_id);
+ char resp[3];
+
+ resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
+ resp[1] = '\n';
+ resp[2] = '\0';
+
+ return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
+}
+
+static const struct file_operations lockd_end_grace_operations = {
+ .write = nlm_end_grace_write,
+ .read = nlm_end_grace_read,
+ .llseek = default_llseek,
+ .release = simple_transaction_release,
+ .owner = THIS_MODULE,
+};
+
+int __init
+lockd_create_procfs(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = proc_mkdir("fs/lockd", NULL);
+ if (!entry)
+ return -ENOMEM;
+ entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
+ &lockd_end_grace_operations);
+ if (!entry) {
+ remove_proc_entry("fs/lockd", NULL);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void __exit
+lockd_remove_procfs(void)
+{
+ remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
+ remove_proc_entry("fs/lockd", NULL);
+}
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
new file mode 100644
index 000000000000..2257a1311027
--- /dev/null
+++ b/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <[email protected]>
+ */
+#ifndef _LOCKD_PROCFS_H
+#define _LOCKD_PROCFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_PROC_FS)
+int lockd_create_procfs(void);
+void lockd_remove_procfs(void);
+#else
+static inline int
+lockd_create_procfs(void)
+{
+ return 0;
+}
+
+static inline void
+lockd_remove_procfs(void)
+{
+ return;
+}
+#endif /* IS_ENABLED(CONFIG_PROC_FS) */
+
+#endif /* _LOCKD_PROCFS_H */
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 3599c9ca28ae..f01a9aa01dff 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -36,6 +36,7 @@
#include <linux/nfs.h>

#include "netns.h"
+#include "procfs.h"

#define NLMDBG_FACILITY NLMDBG_SVC
#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
@@ -616,8 +617,15 @@ static int __init init_nlm(void)
err = register_pernet_subsys(&lockd_net_ops);
if (err)
goto err_pernet;
+
+ err = lockd_create_procfs();
+ if (err)
+ goto err_procfs;
+
return 0;

+err_procfs:
+ unregister_pernet_subsys(&lockd_net_ops);
err_pernet:
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
@@ -630,6 +638,7 @@ static void __exit exit_nlm(void)
{
/* FIXME: delete all NLM clients */
nlm_shutdown_hosts();
+ lockd_remove_procfs();
unregister_pernet_subsys(&lockd_net_ops);
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
--
1.9.3

2014-09-08 14:47:05

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 4/8] nfsd: add a v4_end_grace file to /proc/fs/nfsd

Allow a privileged userland process to end the v4 grace period early.
Any write to the file will cause the v4 grace period to be lifted.
The basic idea with this will be to allow the userland client tracking
program to lift the grace period once it knows that no more clients
will be reclaiming state.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/nfs4state.c | 2 +-
fs/nfsd/nfsctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
fs/nfsd/state.h | 3 +++
3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 422b1df4bb00..ae380e90e372 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4109,7 +4109,7 @@ out:
return status;
}

-static void
+void
nfsd4_end_grace(struct nfsd_net *nn)
{
/* do nothing if grace period already ended */
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4e042105fb6e..ca73ca79a0ee 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -49,6 +49,7 @@ enum {
NFSD_Leasetime,
NFSD_Gracetime,
NFSD_RecoveryDir,
+ NFSD_V4EndGrace,
#endif
};

@@ -68,6 +69,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
#endif

static ssize_t (*write_op[])(struct file *, char *, size_t) = {
@@ -84,6 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
[NFSD_Leasetime] = write_leasetime,
[NFSD_Gracetime] = write_gracetime,
[NFSD_RecoveryDir] = write_recoverydir,
+ [NFSD_V4EndGrace] = write_v4_end_grace,
#endif
};

@@ -1077,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
return rv;
}

+/**
+ * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * OR
+ *
+ * Input:
+ * buf: any value
+ * size: non-zero length of C string in @buf
+ * Output:
+ * passed-in buffer filled with "Y" or "N" with a newline
+ * and NULL-terminated C string. This indicates whether
+ * the grace period has ended in the current net
+ * namespace. Return code is the size in bytes of the
+ * string. Writing a string that starts with 'Y', 'y', or
+ * '1' to the file will end the grace period for nfsd's v4
+ * lock manager.
+ */
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
+{
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (size > 0) {
+ switch(buf[0]) {
+ case 'Y':
+ case 'y':
+ case '1':
+ nfsd4_end_grace(nn);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
+ nn->grace_ended ? 'Y' : 'N');
+}
+
#endif

/*----------------------------------------------------------------------------*/
@@ -1110,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
#endif
/* last one */ {""}
};
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 5f8a500a58bf..10bfda710f71 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -544,6 +544,9 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);

+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
+
/* nfs4recover operations */
extern int nfsd4_client_tracking_init(struct net *net);
extern void nfsd4_client_tracking_exit(struct net *net);
--
1.9.3

2014-09-10 13:22:09

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 9/8] nfsd: skip subsequent UMH "create" operations after the first one for v4.0 clients

In the case of v4.0 clients, we may call into the "create" client
tracking operation multiple times (once for each openowner). Upcalling
for each one of those is wasteful and slow however. We can skip doing
further "create" operations after the first one if we know that one has
already been done.

v4.1+ clients generally only call into this function once (on
RECLAIM_COMPLETE), and we can't skip upcalling on the create even if the
STABLE bit is set. Doing so would make it impossible for nfsdcltrack to
lift the grace period early.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/nfs4recover.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5697e9d2e875..6bd97d1d1880 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1281,6 +1281,22 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp)
char *hexid, *reclaim_complete, *grace_start;
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);

+ /*
+ * With v4.0 clients, there's little difference in outcome between a
+ * create and check operation, and we can end up calling into this
+ * function multiple times per client (once for each openowner). So,
+ * for v4.0 clients skip upcalling once the client has been recorded
+ * on stable storage.
+ *
+ * For v4.1+ clients, the outcome of the two operations is different,
+ * so we must ensure that we upcall for the create operation. v4.1+
+ * clients call this on RECLAIM_COMPLETE though, so we should only end
+ * up doing a single create upcall per client.
+ */
+ if (clp->cl_minorversion == 0 &&
+ test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
--
1.9.3

2014-09-08 14:47:10

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 7/8] nfsd: set and test NFSD4_CLIENT_STABLE bit to reduce nfsdcltrack upcalls

The nfsdcltrack upcall doesn't utilize the NFSD4_CLIENT_STABLE flag,
which basically results in an upcall every time we call into the client
tracking ops.

Change it to set this bit on a successful "check" or "create" request,
and clear it on a "remove" request. Also, check to see if that bit is
set before upcalling on a "check" or "remove" request, and skip
upcalling appropriately, depending on its state.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/nfs4recover.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index ae313861a6f5..5697e9d2e875 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1291,7 +1291,8 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp)
grace_start = nfsd4_cltrack_grace_start(nn->boot_time);

nfsd4_cltrack_upcall_lock(clp);
- nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start);
+ if (!nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start))
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
nfsd4_cltrack_upcall_unlock(clp);

kfree(reclaim_complete);
@@ -1304,6 +1305,9 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
{
char *hexid;

+ if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
@@ -1311,7 +1315,9 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
}

nfsd4_cltrack_upcall_lock(clp);
- nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL);
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
+ nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
+ clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
nfsd4_cltrack_upcall_unlock(clp);

kfree(hexid);
@@ -1323,6 +1329,9 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
int ret;
char *hexid, *legacy;

+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return 0;
+
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
@@ -1331,9 +1340,14 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);

nfsd4_cltrack_upcall_lock(clp);
- ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
+ ret = 0;
+ } else {
+ ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
+ if (!ret)
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
nfsd4_cltrack_upcall_unlock(clp);
-
kfree(legacy);
kfree(hexid);
return ret;
--
1.9.3

2014-09-08 14:47:08

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 6/8] nfsd: serialize nfsdcltrack upcalls for a particular client

In a later patch, we want to add a flag that will allow us to reduce the
need for upcalls. In order to handle that correctly, we'll need to
ensure that racing upcalls for the same client can't occur. In practice
it should be rare for this to occur with a well-behaved client, but it
is possible.

Convert one of the bits in the cl_flags field to be an upcall bitlock,
and use it to ensure that upcalls for the same client are serialized.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/nfs4recover.c | 29 +++++++++++++++++++++++++++++
fs/nfsd/state.h | 1 +
2 files changed, 30 insertions(+)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5b69f0684060..ae313861a6f5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1260,6 +1260,22 @@ nfsd4_umh_cltrack_init(struct net *net)
}

static void
+nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
+{
+ wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void
+nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
+{
+ smp_mb__before_atomic();
+ clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
+}
+
+static void
nfsd4_umh_cltrack_create(struct nfs4_client *clp)
{
char *hexid, *reclaim_complete, *grace_start;
@@ -1270,9 +1286,14 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp)
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return;
}
+
reclaim_complete = nfsd4_cltrack_reclaim_complete(clp);
grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
+ nfsd4_cltrack_upcall_lock(clp);
nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start);
+ nfsd4_cltrack_upcall_unlock(clp);
+
kfree(reclaim_complete);
kfree(grace_start);
kfree(hexid);
@@ -1288,7 +1309,11 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return;
}
+
+ nfsd4_cltrack_upcall_lock(clp);
nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL);
+ nfsd4_cltrack_upcall_unlock(clp);
+
kfree(hexid);
}

@@ -1304,7 +1329,11 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
return -ENOMEM;
}
legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
+
+ nfsd4_cltrack_upcall_lock(clp);
ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
+ nfsd4_cltrack_upcall_unlock(clp);
+
kfree(legacy);
kfree(hexid);
return ret;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 10bfda710f71..a0ed18f00dcb 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -306,6 +306,7 @@ struct nfs4_client {
#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
unsigned long cl_flags;
--
1.9.3

2014-09-08 14:47:01

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 1/8] lockd: move lockd's grace period handling into its own module

Currently, all of the grace period handling is part of lockd. Eventually
though we'd like to be able to build v4-only servers, at which point
we'll need to put all of this elsewhere.

Move the code itself into fs/nfs_common and have it build a grace.ko
module. Then, rejigger the Kconfig options so that both nfsd and lockd
enable it automatically.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/Kconfig | 6 ++-
fs/lockd/Makefile | 2 +-
fs/lockd/grace.c | 65 ----------------------------
fs/lockd/netns.h | 1 -
fs/lockd/svc.c | 1 -
fs/nfs_common/Makefile | 3 +-
fs/nfs_common/grace.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++
fs/nfsd/Kconfig | 1 +
include/linux/proc_fs.h | 2 +
9 files changed, 124 insertions(+), 70 deletions(-)
delete mode 100644 fs/lockd/grace.c
create mode 100644 fs/nfs_common/grace.c

diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..db5dc1598716 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS
source "fs/nfs/Kconfig"
source "fs/nfsd/Kconfig"

+config GRACE_PERIOD
+ tristate
+
config LOCKD
tristate
depends on FILE_LOCKING
+ select GRACE_PERIOD

config LOCKD_V4
bool
@@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT

config NFS_COMMON
bool
- depends on NFSD || NFS_FS
+ depends on NFSD || NFS_FS || LOCKD
default y

source "net/sunrpc/Kconfig"
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ca58d64374ca..6a0b351ce30e 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
obj-$(CONFIG_LOCKD) += lockd.o

lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
- svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
+ svcshare.o svcproc.o svcsubs.o mon.o xdr.o
lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
deleted file mode 100644
index 6d1ee7204c88..000000000000
--- a/fs/lockd/grace.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Common code for control of lockd and nfsv4 grace periods.
- */
-
-#include <linux/module.h>
-#include <linux/lockd/bind.h>
-#include <net/net_namespace.h>
-
-#include "netns.h"
-
-static DEFINE_SPINLOCK(grace_lock);
-
-/**
- * locks_start_grace
- * @lm: who this grace period is for
- *
- * A grace period is a period during which locks should not be given
- * out. Currently grace periods are only enforced by the two lock
- * managers (lockd and nfsd), using the locks_in_grace() function to
- * check when they are in a grace period.
- *
- * This function is called to start a grace period.
- */
-void locks_start_grace(struct net *net, struct lock_manager *lm)
-{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
-
- spin_lock(&grace_lock);
- list_add(&lm->list, &ln->grace_list);
- spin_unlock(&grace_lock);
-}
-EXPORT_SYMBOL_GPL(locks_start_grace);
-
-/**
- * locks_end_grace
- * @lm: who this grace period is for
- *
- * Call this function to state that the given lock manager is ready to
- * resume regular locking. The grace period will not end until all lock
- * managers that called locks_start_grace() also call locks_end_grace().
- * Note that callers count on it being safe to call this more than once,
- * and the second call should be a no-op.
- */
-void locks_end_grace(struct lock_manager *lm)
-{
- spin_lock(&grace_lock);
- list_del_init(&lm->list);
- spin_unlock(&grace_lock);
-}
-EXPORT_SYMBOL_GPL(locks_end_grace);
-
-/**
- * locks_in_grace
- *
- * Lock managers call this function to determine when it is OK for them
- * to answer ordinary lock requests, and when they should accept only
- * lock reclaims.
- */
-int locks_in_grace(struct net *net)
-{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
-
- return !list_empty(&ln->grace_list);
-}
-EXPORT_SYMBOL_GPL(locks_in_grace);
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5010b55628b4..097bfa3adb1c 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -11,7 +11,6 @@ struct lockd_net {

struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
- struct list_head grace_list;

spinlock_t nsm_clnt_lock;
unsigned int nsm_users;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 8f27c93f8d2e..3599c9ca28ae 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -583,7 +583,6 @@ static int lockd_init_net(struct net *net)
struct lockd_net *ln = net_generic(net, lockd_net_id);

INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
- INIT_LIST_HEAD(&ln->grace_list);
spin_lock_init(&ln->nsm_clnt_lock);
return 0;
}
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index f689ed82af3a..d153ca3ea577 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -3,5 +3,6 @@
#

obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
-
nfs_acl-objs := nfsacl.o
+
+obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
new file mode 100644
index 000000000000..ae6e58ea4de5
--- /dev/null
+++ b/fs/nfs_common/grace.c
@@ -0,0 +1,113 @@
+/*
+ * Common code for control of lockd and nfsv4 grace periods.
+ *
+ * Transplanted from lockd code
+ */
+
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/fs.h>
+
+static int grace_net_id;
+static DEFINE_SPINLOCK(grace_lock);
+
+/**
+ * locks_start_grace
+ * @net: net namespace that this lock manager belongs to
+ * @lm: who this grace period is for
+ *
+ * A grace period is a period during which locks should not be given
+ * out. Currently grace periods are only enforced by the two lock
+ * managers (lockd and nfsd), using the locks_in_grace() function to
+ * check when they are in a grace period.
+ *
+ * This function is called to start a grace period.
+ */
+void
+locks_start_grace(struct net *net, struct lock_manager *lm)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ spin_lock(&grace_lock);
+ list_add(&lm->list, grace_list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_start_grace);
+
+/**
+ * locks_end_grace
+ * @net: net namespace that this lock manager belongs to
+ * @lm: who this grace period is for
+ *
+ * Call this function to state that the given lock manager is ready to
+ * resume regular locking. The grace period will not end until all lock
+ * managers that called locks_start_grace() also call locks_end_grace().
+ * Note that callers count on it being safe to call this more than once,
+ * and the second call should be a no-op.
+ */
+void
+locks_end_grace(struct lock_manager *lm)
+{
+ spin_lock(&grace_lock);
+ list_del_init(&lm->list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_end_grace);
+
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+int
+locks_in_grace(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ return !list_empty(grace_list);
+}
+EXPORT_SYMBOL_GPL(locks_in_grace);
+
+static int __net_init
+grace_init_net(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ INIT_LIST_HEAD(grace_list);
+ return 0;
+}
+
+static void __net_exit
+grace_exit_net(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ BUG_ON(!list_empty(grace_list));
+}
+
+static struct pernet_operations grace_net_ops = {
+ .init = grace_init_net,
+ .exit = grace_exit_net,
+ .id = &grace_net_id,
+ .size = sizeof(struct list_head),
+};
+
+static int __init
+init_grace(void)
+{
+ return register_pernet_subsys(&grace_net_ops);
+}
+
+static void __exit
+exit_grace(void)
+{
+ unregister_pernet_subsys(&grace_net_ops);
+}
+
+MODULE_AUTHOR("Jeff Layton <[email protected]>");
+MODULE_LICENSE("GPL");
+module_init(init_grace)
+module_exit(exit_grace)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f994e750e0d1..4fa98764de21 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -71,6 +71,7 @@ config NFSD_V4
select FS_POSIX_ACL
select SUNRPC_GSS
select CRYPTO
+ select GRACE_PERIOD
help
This option enables support in your system's NFS server for
version 4 of the NFS protocol (RFC 3530).
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 9d117f61d976..b97bf2ef996e 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -74,6 +74,8 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p

#endif /* CONFIG_PROC_FS */

+struct net;
+
static inline struct proc_dir_entry *proc_net_mkdir(
struct net *net, const char *name, struct proc_dir_entry *parent)
{
--
1.9.3

2014-09-08 14:47:10

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 8/8] nfsd: reject reclaim request when client has already sent RECLAIM_COMPLETE

As stated in RFC 5661, section 18.51.3:

Once a RECLAIM_COMPLETE is done, there can be no further reclaim
operations for locks whose scope is defined as having completed
recovery. Once the client sends RECLAIM_COMPLETE, the server will
not allow the client to do subsequent reclaims of locking state for
that scope and, if these are attempted, will return
NFS4ERR_NO_GRACE.

Ensure that we enforce that requirement.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/nfs4state.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 951c882b1581..c4c8d0621dde 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5660,6 +5660,9 @@ nfs4_check_open_reclaim(clientid_t *clid,
if (status)
return nfserr_reclaim_bad;

+ if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
+ return nfserr_no_grace;
+
if (nfsd4_client_record_check(cstate->clp))
return nfserr_reclaim_bad;

--
1.9.3

2014-09-09 20:46:18

by J. Bruce Fields

[permalink] [raw]

Subject: Re: [PATCH v3 7/8] nfsd: set and test NFSD4_CLIENT_STABLE bit to reduce nfsdcltrack upcalls

On Mon, Sep 08, 2014 at 10:46:48AM -0400, Jeff Layton wrote:
> The nfsdcltrack upcall doesn't utilize the NFSD4_CLIENT_STABLE flag,
> which basically results in an upcall every time we call into the client
> tracking ops.
>
> Change it to set this bit on a successful "check" or "create" request,
> and clear it on a "remove" request. Also, check to see if that bit is
> set before upcalling on a "check" or "remove" request, and skip
> upcalling appropriately, depending on its state.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/nfs4recover.c | 22 ++++++++++++++++++----
> 1 file changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index ae313861a6f5..5697e9d2e875 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -1291,7 +1291,8 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp)
> grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
>
> nfsd4_cltrack_upcall_lock(clp);
> - nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start);
> + if (!nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start))
> + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);

Don't you also want to skip the upcall when STABLE is set?

(Not a big deal in the 4.1 case as it's only called once per client (in
RECLAIM_COMPLETE) but annoying in the 4.0 case where it's called on
every OPEN_CONFIRM.)

--b.

> nfsd4_cltrack_upcall_unlock(clp);
>
> kfree(reclaim_complete);
> @@ -1304,6 +1305,9 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
> {
> char *hexid;
>
> + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> + return;
> +
> hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
> if (!hexid) {
> dprintk("%s: can't allocate memory for upcall!\n", __func__);
> @@ -1311,7 +1315,9 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
> }
>
> nfsd4_cltrack_upcall_lock(clp);
> - nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL);
> + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
> + nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
> + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> nfsd4_cltrack_upcall_unlock(clp);
>
> kfree(hexid);
> @@ -1323,6 +1329,9 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
> int ret;
> char *hexid, *legacy;
>
> + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> + return 0;
> +
> hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
> if (!hexid) {
> dprintk("%s: can't allocate memory for upcall!\n", __func__);
> @@ -1331,9 +1340,14 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
> legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
>
> nfsd4_cltrack_upcall_lock(clp);
> - ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
> + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
> + ret = 0;
> + } else {
> + ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
> + if (!ret)
> + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> + }
> nfsd4_cltrack_upcall_unlock(clp);
> -
> kfree(legacy);
> kfree(hexid);
> return ret;
> --
> 1.9.3
>

2014-09-09 12:21:16

by Jeff Layton

[permalink] [raw]

Subject: Re: [PATCH v3 4/8] nfsd: add a v4_end_grace file to /proc/fs/nfsd

On Mon, 8 Sep 2014 10:46:45 -0400
Jeff Layton <[email protected]> wrote:

> Allow a privileged userland process to end the v4 grace period early.
> Any write to the file will cause the v4 grace period to be lifted.
> The basic idea with this will be to allow the userland client tracking
> program to lift the grace period once it knows that no more clients
> will be reclaiming state.
>

Oof...sorry!

The patch does what you requested, (accept only "Y", "y" or "1" as a
sign to lift the grace period), but I missed updating the patch
description above. If there's another respin needed for other reasons,
then I'll fix the text in it. Otherwise, let me know if you want me to
resend.

> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfsd/nfs4state.c | 2 +-
> fs/nfsd/nfsctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
> fs/nfsd/state.h | 3 +++
> 3 files changed, 49 insertions(+), 1 deletion(-)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 422b1df4bb00..ae380e90e372 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4109,7 +4109,7 @@ out:
> return status;
> }
>
> -static void
> +void
> nfsd4_end_grace(struct nfsd_net *nn)
> {
> /* do nothing if grace period already ended */
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 4e042105fb6e..ca73ca79a0ee 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -49,6 +49,7 @@ enum {
> NFSD_Leasetime,
> NFSD_Gracetime,
> NFSD_RecoveryDir,
> + NFSD_V4EndGrace,
> #endif
> };
>
> @@ -68,6 +69,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
> static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
> static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
> static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
> +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
> #endif
>
> static ssize_t (*write_op[])(struct file *, char *, size_t) = {
> @@ -84,6 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
> [NFSD_Leasetime] = write_leasetime,
> [NFSD_Gracetime] = write_gracetime,
> [NFSD_RecoveryDir] = write_recoverydir,
> + [NFSD_V4EndGrace] = write_v4_end_grace,
> #endif
> };
>
> @@ -1077,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
> return rv;
> }
>
> +/**
> + * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
> + *
> + * Input:
> + * buf: ignored
> + * size: zero
> + * OR
> + *
> + * Input:
> + * buf: any value
> + * size: non-zero length of C string in @buf
> + * Output:
> + * passed-in buffer filled with "Y" or "N" with a newline
> + * and NULL-terminated C string. This indicates whether
> + * the grace period has ended in the current net
> + * namespace. Return code is the size in bytes of the
> + * string. Writing a string that starts with 'Y', 'y', or
> + * '1' to the file will end the grace period for nfsd's v4
> + * lock manager.
> + */
> +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
> +{
> + struct net *net = file->f_dentry->d_sb->s_fs_info;
> + struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +
> + if (size > 0) {
> + switch(buf[0]) {
> + case 'Y':
> + case 'y':
> + case '1':
> + nfsd4_end_grace(nn);
> + break;
> + default:
> + return -EINVAL;
> + }
> + }
> +
> + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
> + nn->grace_ended ? 'Y' : 'N');
> +}
> +
> #endif
>
> /*----------------------------------------------------------------------------*/
> @@ -1110,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
> [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
> [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
> [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
> + [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
> #endif
> /* last one */ {""}
> };
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 5f8a500a58bf..10bfda710f71 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -544,6 +544,9 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
> struct nfsd_net *nn);
> extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
>
> +/* grace period management */
> +void nfsd4_end_grace(struct nfsd_net *nn);
> +
> /* nfs4recover operations */
> extern int nfsd4_client_tracking_init(struct net *net);
> extern void nfsd4_client_tracking_exit(struct net *net);

--
Jeff Layton <[email protected]>

2014-09-08 14:47:03

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 2/8] nfsd: remove redundant boot_time parm from grace_done client tracking op

Since it's stored in nfsd_net, we don't need to pass it in separately.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/nfs4recover.c | 17 ++++++++---------
fs/nfsd/nfs4state.c | 2 +-
fs/nfsd/state.h | 2 +-
3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..a0d2ba956a3f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops {
void (*create)(struct nfs4_client *);
void (*remove)(struct nfs4_client *);
int (*check)(struct nfs4_client *);
- void (*grace_done)(struct nfsd_net *, time_t);
+ void (*grace_done)(struct nfsd_net *);
};

/* Globals */
@@ -392,7 +392,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
}

static void
-nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
+nfsd4_recdir_purge_old(struct nfsd_net *nn)
{
int status;

@@ -1016,7 +1016,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
}

static void
-nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
+nfsd4_cld_grace_done(struct nfsd_net *nn)
{
int ret;
struct cld_upcall *cup;
@@ -1029,7 +1029,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
}

cup->cu_msg.cm_cmd = Cld_GraceDone;
- cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time;
+ cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
if (!ret)
ret = cup->cu_msg.cm_status;
@@ -1245,13 +1245,12 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
}

static void
-nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
- time_t boot_time)
+nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
{
char *legacy;
char timestr[22]; /* FIXME: better way to determine max size? */

- sprintf(timestr, "%ld", boot_time);
+ sprintf(timestr, "%ld", nn->boot_time);
legacy = nfsd4_cltrack_legacy_topdir();
nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
kfree(legacy);
@@ -1356,10 +1355,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
}

void
-nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
+nfsd4_record_grace_done(struct nfsd_net *nn)
{
if (nn->client_tracking_ops)
- nn->client_tracking_ops->grace_done(nn, boot_time);
+ nn->client_tracking_ops->grace_done(nn);
}

static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8597fbeea3bb..422b1df4bb00 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4118,7 +4118,7 @@ nfsd4_end_grace(struct nfsd_net *nn)

dprintk("NFSD: end of grace period\n");
nn->grace_ended = true;
- nfsd4_record_grace_done(nn, nn->boot_time);
+ nfsd4_record_grace_done(nn);
locks_end_grace(&nn->nfsd4_manager);
/*
* Now that every NFSv4 client has had the chance to recover and
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 64f291a25a8c..5f8a500a58bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -550,7 +550,7 @@ extern void nfsd4_client_tracking_exit(struct net *net);
extern void nfsd4_client_record_create(struct nfs4_client *clp);
extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);

/* nfs fault injection functions */
#ifdef CONFIG_NFSD_FAULT_INJECTION
--
1.9.3

2014-09-09 20:56:59

by Jeff Layton

[permalink] [raw]

Subject: Re: [PATCH v3 7/8] nfsd: set and test NFSD4_CLIENT_STABLE bit to reduce nfsdcltrack upcalls

On Tue, 9 Sep 2014 16:46:17 -0400
"J. Bruce Fields" <[email protected]> wrote:

> On Mon, Sep 08, 2014 at 10:46:48AM -0400, Jeff Layton wrote:
> > The nfsdcltrack upcall doesn't utilize the NFSD4_CLIENT_STABLE flag,
> > which basically results in an upcall every time we call into the client
> > tracking ops.
> >
> > Change it to set this bit on a successful "check" or "create" request,
> > and clear it on a "remove" request. Also, check to see if that bit is
> > set before upcalling on a "check" or "remove" request, and skip
> > upcalling appropriately, depending on its state.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfsd/nfs4recover.c | 22 ++++++++++++++++++----
> > 1 file changed, 18 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index ae313861a6f5..5697e9d2e875 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -1291,7 +1291,8 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp)
> > grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
> >
> > nfsd4_cltrack_upcall_lock(clp);
> > - nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start);
> > + if (!nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start))
> > + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
>
> Don't you also want to skip the upcall when STABLE is set?
>
> (Not a big deal in the 4.1 case as it's only called once per client (in
> RECLAIM_COMPLETE) but annoying in the 4.0 case where it's called on
> every OPEN_CONFIRM.)
>
> --b.
>

Ahh right, I was thinking that we'd only get one OPEN_CONFIRM per
client, but you have to do it for each openowner...

No, we can't do that. Consider the case where we might do some "check"
ops before the "create" operation (i.e. we need to do some reclaims).

The "check" will set the STABLE flag, and then the create operation
gets skipped. That means that the grace period won't get lifted early
even when the RECLAIM_COMPLETE comes in since the reclaim_complete
field in the DB didn't get updated.

We could (in principle) add a new flag that indicates whether a
"create" has already been done for a client, and use that to skip
subsequent "create" ops. Sound reasonable?

> > nfsd4_cltrack_upcall_unlock(clp);
> >
> > kfree(reclaim_complete);
> > @@ -1304,6 +1305,9 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
> > {
> > char *hexid;
> >
> > + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > + return;
> > +
> > hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
> > if (!hexid) {
> > dprintk("%s: can't allocate memory for upcall!\n", __func__);
> > @@ -1311,7 +1315,9 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
> > }
> >
> > nfsd4_cltrack_upcall_lock(clp);
> > - nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL);
> > + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
> > + nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
> > + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> > nfsd4_cltrack_upcall_unlock(clp);
> >
> > kfree(hexid);
> > @@ -1323,6 +1329,9 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
> > int ret;
> > char *hexid, *legacy;
> >
> > + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > + return 0;
> > +
> > hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
> > if (!hexid) {
> > dprintk("%s: can't allocate memory for upcall!\n", __func__);
> > @@ -1331,9 +1340,14 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
> > legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
> >
> > nfsd4_cltrack_upcall_lock(clp);
> > - ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
> > + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
> > + ret = 0;
> > + } else {
> > + ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
> > + if (!ret)
> > + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> > + }
> > nfsd4_cltrack_upcall_unlock(clp);
> > -
> > kfree(legacy);
> > kfree(hexid);
> > return ret;
> > --
> > 1.9.3
> >

--
Jeff Layton <[email protected]>

2014-09-09 21:19:52

by Jeff Layton

[permalink] [raw]

Subject: Re: [PATCH v3 7/8] nfsd: set and test NFSD4_CLIENT_STABLE bit to reduce nfsdcltrack upcalls

On Tue, 9 Sep 2014 16:56:51 -0400
Jeff Layton <[email protected]> wrote:

> On Tue, 9 Sep 2014 16:46:17 -0400
> "J. Bruce Fields" <[email protected]> wrote:
>
> > On Mon, Sep 08, 2014 at 10:46:48AM -0400, Jeff Layton wrote:
> > > The nfsdcltrack upcall doesn't utilize the NFSD4_CLIENT_STABLE flag,
> > > which basically results in an upcall every time we call into the client
> > > tracking ops.
> > >
> > > Change it to set this bit on a successful "check" or "create" request,
> > > and clear it on a "remove" request. Also, check to see if that bit is
> > > set before upcalling on a "check" or "remove" request, and skip
> > > upcalling appropriately, depending on its state.
> > >
> > > Signed-off-by: Jeff Layton <[email protected]>
> > > ---
> > > fs/nfsd/nfs4recover.c | 22 ++++++++++++++++++----
> > > 1 file changed, 18 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > > index ae313861a6f5..5697e9d2e875 100644
> > > --- a/fs/nfsd/nfs4recover.c
> > > +++ b/fs/nfsd/nfs4recover.c
> > > @@ -1291,7 +1291,8 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp)
> > > grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
> > >
> > > nfsd4_cltrack_upcall_lock(clp);
> > > - nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start);
> > > + if (!nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start))
> > > + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> >
> > Don't you also want to skip the upcall when STABLE is set?
> >
> > (Not a big deal in the 4.1 case as it's only called once per client (in
> > RECLAIM_COMPLETE) but annoying in the 4.0 case where it's called on
> > every OPEN_CONFIRM.)
> >
> > --b.
> >
>
> Ahh right, I was thinking that we'd only get one OPEN_CONFIRM per
> client, but you have to do it for each openowner...
>
> No, we can't do that. Consider the case where we might do some "check"
> ops before the "create" operation (i.e. we need to do some reclaims).
>
> The "check" will set the STABLE flag, and then the create operation
> gets skipped. That means that the grace period won't get lifted early
> even when the RECLAIM_COMPLETE comes in since the reclaim_complete
> field in the DB didn't get updated.
>
> We could (in principle) add a new flag that indicates whether a
> "create" has already been done for a client, and use that to skip
> subsequent "create" ops. Sound reasonable?
>

...or we could skip doing the upcall if the client's minorversion is 0
and the STABLE flag is set.

In the v4.0 case, there's really no difference between a "check" and
"create" operation, so there's no real benefit to doing a "create" if
you already did a "check".

> > > nfsd4_cltrack_upcall_unlock(clp);
> > >
> > > kfree(reclaim_complete);
> > > @@ -1304,6 +1305,9 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
> > > {
> > > char *hexid;
> > >
> > > + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > > + return;
> > > +
> > > hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
> > > if (!hexid) {
> > > dprintk("%s: can't allocate memory for upcall!\n", __func__);
> > > @@ -1311,7 +1315,9 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
> > > }
> > >
> > > nfsd4_cltrack_upcall_lock(clp);
> > > - nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL);
> > > + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
> > > + nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
> > > + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> > > nfsd4_cltrack_upcall_unlock(clp);
> > >
> > > kfree(hexid);
> > > @@ -1323,6 +1329,9 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
> > > int ret;
> > > char *hexid, *legacy;
> > >
> > > + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > > + return 0;
> > > +
> > > hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
> > > if (!hexid) {
> > > dprintk("%s: can't allocate memory for upcall!\n", __func__);
> > > @@ -1331,9 +1340,14 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
> > > legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
> > >
> > > nfsd4_cltrack_upcall_lock(clp);
> > > - ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
> > > + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
> > > + ret = 0;
> > > + } else {
> > > + ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
> > > + if (!ret)
> > > + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> > > + }
> > > nfsd4_cltrack_upcall_unlock(clp);
> > > -
> > > kfree(legacy);
> > > kfree(hexid);
> > > return ret;
> > > --
> > > 1.9.3
> > >
>
>

--
Jeff Layton <[email protected]>

2014-09-08 14:47:08

by Jeff Layton

[permalink] [raw]

Subject: [PATCH v3 5/8] nfsd: pass extra info in env vars to upcalls to allow for early grace period end

In order to support lifting the grace period early, we must tell
nfsdcltrack what sort of client the "create" upcall is for. We can't
reliably tell if a v4.0 client has completed reclaiming, so we can only
lift the grace period once all the v4.1+ clients have issued a
RECLAIM_COMPLETE and if there are no v4.0 clients.

Also, in order to lift the grace period, we have to tell userland when
the grace period started so that it can tell whether a RECLAIM_COMPLETE
has been issued for each client since then.

Since this is all optional info, we pass it along in environment
variables to the "init" and "create" upcalls. By doing this, we don't
need to revise the upcall format. The UMH upcall can simply make use of
this info if it happens to be present. If it's not then it can just
avoid lifting the grace period early.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfsd/nfs4recover.c | 90 ++++++++++++++++++++++++++++++++++++++++++++-------
fs/nfsd/nfs4state.c | 4 +--
2 files changed, 80 insertions(+), 14 deletions(-)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index a0d2ba956a3f..5b69f0684060 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1062,6 +1062,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable,

#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+#define RECLAIM_COMPLETE_ENV_PREFIX "NFSDCLTRACK_RECLAIM_COMPLETE="
+#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="

static char *
nfsd4_cltrack_legacy_topdir(void)
@@ -1126,10 +1128,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
return result;
}

+static char *
+nfsd4_cltrack_reclaim_complete(struct nfs4_client *clp)
+{
+ int copied;
+ size_t len;
+ char *result;
+
+ /* prefix + max width of integer string + terminating NULL */
+ len = strlen(RECLAIM_COMPLETE_ENV_PREFIX) + 10 + 1;
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, RECLAIM_COMPLETE_ENV_PREFIX "%c",
+ clp->cl_minorversion ? 'Y' : 'N');
+ if (copied >= len) {
+ /* just return nothing if output was truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
+static char *
+nfsd4_cltrack_grace_start(time_t grace_start)
+{
+ int copied;
+ size_t len;
+ char *result;
+
+ /* prefix + max width of int64_t string + terminating NULL */
+ len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
+ grace_start);
+ if (copied >= len) {
+ /* just return nothing if output was truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
static int
-nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
{
- char *envp[2];
+ char *envp[3];
char *argv[4];
int ret;

@@ -1140,10 +1192,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)

dprintk("%s: cmd: %s\n", __func__, cmd);
dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
- dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
+ dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
+ dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");

- envp[0] = legacy;
- envp[1] = NULL;
+ envp[0] = env0;
+ envp[1] = env1;
+ envp[2] = NULL;

argv[0] = (char *)cltrack_prog;
argv[1] = cmd;
@@ -1187,28 +1241,40 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
}

static int
-nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
+nfsd4_umh_cltrack_init(struct net *net)
{
+ int ret;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
/* XXX: The usermode helper s not working in container yet. */
if (net != &init_net) {
WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
"tracking in a container!\n");
return -EINVAL;
}
- return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
+
+ ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
+ kfree(grace_start);
+ return ret;
}

static void
nfsd4_umh_cltrack_create(struct nfs4_client *clp)
{
- char *hexid;
+ char *hexid, *reclaim_complete, *grace_start;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);

hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return;
}
- nfsd4_umh_cltrack_upcall("create", hexid, NULL);
+ reclaim_complete = nfsd4_cltrack_reclaim_complete(clp);
+ grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+ nfsd4_umh_cltrack_upcall("create", hexid, reclaim_complete, grace_start);
+ kfree(reclaim_complete);
+ kfree(grace_start);
kfree(hexid);
}

@@ -1222,7 +1288,7 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return;
}
- nfsd4_umh_cltrack_upcall("remove", hexid, NULL);
+ nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL);
kfree(hexid);
}

@@ -1238,7 +1304,7 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp)
return -ENOMEM;
}
legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
- ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy);
+ ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy, NULL);
kfree(legacy);
kfree(hexid);
return ret;
@@ -1252,7 +1318,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)

sprintf(timestr, "%ld", nn->boot_time);
legacy = nfsd4_cltrack_legacy_topdir();
- nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
+ nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
kfree(legacy);
}

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ae380e90e372..951c882b1581 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -6357,10 +6357,10 @@ nfs4_state_start_net(struct net *net)
ret = nfs4_state_create_net(net);
if (ret)
return ret;
- nfsd4_client_tracking_init(net);
nn->boot_time = get_seconds();
- locks_start_grace(net, &nn->nfsd4_manager);
nn->grace_ended = false;
+ locks_start_grace(net, &nn->nfsd4_manager);
+ nfsd4_client_tracking_init(net);
printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
nn->nfsd4_grace, net);
queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
--
1.9.3