2019-09-02 17:06:47

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v2 0/4] Handling NFSv3 I/O errors in knfsd

Recently, a number of changes went into the kernel to try to ensure
that I/O errors (specifically write errors) are reported to the
application once and only once. The vehicle for ensuring the errors
are reported is the struct file, which uses the 'f_wb_err' field to
track which errors have been reported.

The problem is that errors are mainly intended to be reported through
fsync(). If the client is doing synchronous writes, then all is well,
but if it is doing unstable writes, then the errors may not be
reported until the client calls COMMIT. If the file cache has
thrown out the struct file, due to memory pressure, or just because
the client took a long while between the last WRITE and the COMMIT,
then the error report may be lost, and the client may just think
its data is safely stored.

Note that the problem is compounded by the fact that NFSv3 is stateless,
so the server never knows that the client may have rebooted, so there
can be no guarantee that a COMMIT will ever be sent.

The following patch set attempts to remedy the situation using 2
strategies:

1) If the inode is dirty, then avoid garbage collecting the file
from the file cache.
2) If the file is closed, and we see that it would have reported
an error to COMMIT, then we bump the boot verifier in order to
ensure the client retransmits all its writes.

Note that if multiple clients were writing to the same file, then
we probably want to bump the boot verifier anyway, since only one
COMMIT will see the error report (because the cached file is also
shared).

So in order to implement the above strategy, we first have to do
the following: split up the file cache to act per net namespace,
since the boot verifier is per net namespace. Then add a helper
to update the boot verifier.

---
v2:
- Add patch to bump the boot verifier on all write/commit errors
- Fix initialisation of 'seq' in nfsd_copy_boot_verifier()

Trond Myklebust (4):
nfsd: nfsd_file cache entries should be per net namespace
nfsd: Support the server resetting the boot verifier
nfsd: Don't garbage collect files that might contain write errors
nfsd: Reset the boot verifier on all write I/O errors

fs/nfsd/export.c | 2 +-
fs/nfsd/filecache.c | 76 +++++++++++++++++++++++++++++++++++++--------
fs/nfsd/filecache.h | 3 +-
fs/nfsd/netns.h | 4 +++
fs/nfsd/nfs3xdr.c | 13 +++++---
fs/nfsd/nfs4proc.c | 14 +++------
fs/nfsd/nfsctl.c | 1 +
fs/nfsd/nfssvc.c | 32 ++++++++++++++++++-
fs/nfsd/vfs.c | 19 +++++++++---
9 files changed, 130 insertions(+), 34 deletions(-)

--
2.21.0


2019-09-02 17:06:47

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v2 1/4] nfsd: nfsd_file cache entries should be per net namespace

Ensure that we can safely clear out the file cache entries when the
nfs server is shut down on a container. Otherwise, the file cache
may end up pinning the mounts.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/export.c | 2 +-
fs/nfsd/filecache.c | 33 +++++++++++++++++++++------------
fs/nfsd/filecache.h | 3 ++-
fs/nfsd/nfssvc.c | 1 +
4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 052fac64b578..15422c951fd1 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -240,7 +240,7 @@ static void expkey_flush(void)
* destroyed while we're in the middle of flushing.
*/
mutex_lock(&nfsd_mutex);
- nfsd_file_cache_purge();
+ nfsd_file_cache_purge(current->nsproxy->net_ns);
mutex_unlock(&nfsd_mutex);
}

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index a2fcb251d2f6..d229fd3c825d 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -17,6 +17,7 @@
#include "vfs.h"
#include "nfsd.h"
#include "nfsfh.h"
+#include "netns.h"
#include "filecache.h"
#include "trace.h"

@@ -168,7 +169,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
}

static struct nfsd_file *
-nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval)
+nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
+ struct net *net)
{
struct nfsd_file *nf;

@@ -178,6 +180,7 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval)
INIT_LIST_HEAD(&nf->nf_lru);
nf->nf_file = NULL;
nf->nf_cred = get_current_cred();
+ nf->nf_net = net;
nf->nf_flags = 0;
nf->nf_inode = inode;
nf->nf_hashval = hashval;
@@ -608,10 +611,11 @@ nfsd_file_cache_init(void)
* Note this can deadlock with nfsd_file_lru_cb.
*/
void
-nfsd_file_cache_purge(void)
+nfsd_file_cache_purge(struct net *net)
{
unsigned int i;
struct nfsd_file *nf;
+ struct hlist_node *next;
LIST_HEAD(dispose);
bool del;

@@ -619,10 +623,12 @@ nfsd_file_cache_purge(void)
return;

for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- spin_lock(&nfsd_file_hashtbl[i].nfb_lock);
- while(!hlist_empty(&nfsd_file_hashtbl[i].nfb_head)) {
- nf = hlist_entry(nfsd_file_hashtbl[i].nfb_head.first,
- struct nfsd_file, nf_node);
+ struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+
+ spin_lock(&nfb->nfb_lock);
+ hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+ if (net && nf->nf_net != net)
+ continue;
del = nfsd_file_unhash_and_release_locked(nf, &dispose);

/*
@@ -631,7 +637,7 @@ nfsd_file_cache_purge(void)
*/
WARN_ON_ONCE(!del);
}
- spin_unlock(&nfsd_file_hashtbl[i].nfb_lock);
+ spin_unlock(&nfb->nfb_lock);
nfsd_file_dispose_list(&dispose);
}
}
@@ -650,7 +656,7 @@ nfsd_file_cache_shutdown(void)
* calling nfsd_file_cache_purge
*/
cancel_delayed_work_sync(&nfsd_filecache_laundrette);
- nfsd_file_cache_purge();
+ nfsd_file_cache_purge(NULL);
list_lru_destroy(&nfsd_file_lru);
rcu_barrier();
fsnotify_put_group(nfsd_file_fsnotify_group);
@@ -686,7 +692,7 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2)

static struct nfsd_file *
nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
- unsigned int hashval)
+ unsigned int hashval, struct net *net)
{
struct nfsd_file *nf;
unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
@@ -697,6 +703,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
continue;
if (nf->nf_inode != inode)
continue;
+ if (nf->nf_net != net)
+ continue;
if (!nfsd_match_cred(nf->nf_cred, current_cred()))
continue;
if (nfsd_file_get(nf) != NULL)
@@ -739,6 +747,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
__be32 status;
+ struct net *net = SVC_NET(rqstp);
struct nfsd_file *nf, *new;
struct inode *inode;
unsigned int hashval;
@@ -753,12 +762,12 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
retry:
rcu_read_lock();
- nf = nfsd_file_find_locked(inode, may_flags, hashval);
+ nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
rcu_read_unlock();
if (nf)
goto wait_for_construction;

- new = nfsd_file_alloc(inode, may_flags, hashval);
+ new = nfsd_file_alloc(inode, may_flags, hashval, net);
if (!new) {
trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
NULL, nfserr_jukebox);
@@ -766,7 +775,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
}

spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- nf = nfsd_file_find_locked(inode, may_flags, hashval);
+ nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
if (nf == NULL)
goto open_file;
spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 0c0c67166b87..851d9abf54c2 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -34,6 +34,7 @@ struct nfsd_file {
struct rcu_head nf_rcu;
struct file *nf_file;
const struct cred *nf_cred;
+ struct net *nf_net;
#define NFSD_FILE_HASHED (0)
#define NFSD_FILE_PENDING (1)
#define NFSD_FILE_BREAK_READ (2)
@@ -48,7 +49,7 @@ struct nfsd_file {
};

int nfsd_file_cache_init(void);
-void nfsd_file_cache_purge(void);
+void nfsd_file_cache_purge(struct net *);
void nfsd_file_cache_shutdown(void);
void nfsd_file_put(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d02712ca2685..b944553c6927 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -387,6 +387,7 @@ static void nfsd_shutdown_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);

+ nfsd_file_cache_purge(net);
nfs4_state_shutdown_net(net);
if (nn->lockd_up) {
lockd_down(net);
--
2.21.0

2019-09-02 17:07:18

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v2 2/4] nfsd: Support the server resetting the boot verifier

Add support to allow the server to reset the boot verifier in order to
force clients to resend I/O after a timeout failure.

Signed-off-by: Trond Myklebust <[email protected]>
Signed-off-by: Lance Shelton <[email protected]>
---
fs/nfsd/netns.h | 4 ++++
fs/nfsd/nfs3xdr.c | 13 +++++++++----
fs/nfsd/nfs4proc.c | 14 ++++----------
fs/nfsd/nfsctl.c | 1 +
fs/nfsd/nfssvc.c | 31 ++++++++++++++++++++++++++++++-
5 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index bdfe5bcb3dcd..9a4ef815fb8c 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -104,6 +104,7 @@ struct nfsd_net {

/* Time of server startup */
struct timespec64 nfssvc_boot;
+ seqlock_t boot_lock;

/*
* Max number of connections this nfsd container will allow. Defaults
@@ -179,4 +180,7 @@ struct nfsd_net {
extern void nfsd_netns_free_versions(struct nfsd_net *nn);

extern unsigned int nfsd_net_id;
+
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn);
+void nfsd_reset_boot_verifier(struct nfsd_net *nn);
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index fcf31822c74c..86e5658651f1 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -27,6 +27,7 @@ static u32 nfs3_ftypes[] = {
NF3SOCK, NF3BAD, NF3LNK, NF3BAD,
};

+
/*
* XDR functions for basic NFS types
*/
@@ -751,14 +752,16 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_writeres *resp = rqstp->rq_resp;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 verf[2];

p = encode_wcc_data(rqstp, p, &resp->fh);
if (resp->status == 0) {
*p++ = htonl(resp->count);
*p++ = htonl(resp->committed);
/* unique identifier, y2038 overflow can be ignored */
- *p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
- *p++ = htonl(nn->nfssvc_boot.tv_nsec);
+ nfsd_copy_boot_verifier(verf, nn);
+ *p++ = verf[0];
+ *p++ = verf[1];
}
return xdr_ressize_check(rqstp, p);
}
@@ -1125,13 +1128,15 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_commitres *resp = rqstp->rq_resp;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 verf[2];

p = encode_wcc_data(rqstp, p, &resp->fh);
/* Write verifier */
if (resp->status == 0) {
/* unique identifier, y2038 overflow can be ignored */
- *p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
- *p++ = htonl(nn->nfssvc_boot.tv_nsec);
+ nfsd_copy_boot_verifier(verf, nn);
+ *p++ = verf[0];
+ *p++ = verf[1];
}
return xdr_ressize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index cb51893ec1cd..4e3e77b76411 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -568,17 +568,11 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,

static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
{
- __be32 verf[2];
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ __be32 *verf = (__be32 *)verifier->data;

- /*
- * This is opaque to client, so no need to byte-swap. Use
- * __force to keep sparse happy. y2038 time_t overflow is
- * irrelevant in this usage.
- */
- verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
- verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
- memcpy(verifier->data, verf, sizeof(verifier->data));
+ BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
+
+ nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id));
}

static __be32
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3cf4f6aa48d6..33cbe2e5d937 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1477,6 +1477,7 @@ static __net_init int nfsd_init_net(struct net *net)

atomic_set(&nn->ntf_refcnt, 0);
init_waitqueue_head(&nn->ntf_wq);
+ seqlock_init(&nn->boot_lock);

mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
if (IS_ERR(mnt)) {
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index b944553c6927..3caaf5675259 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -344,6 +344,35 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn)
return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST);
}

+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
+{
+ int seq = 0;
+
+ do {
+ read_seqbegin_or_lock(&nn->boot_lock, &seq);
+ /*
+ * This is opaque to client, so no need to byte-swap. Use
+ * __force to keep sparse happy. y2038 time_t overflow is
+ * irrelevant in this usage
+ */
+ verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+ verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
+ } while (need_seqretry(&nn->boot_lock, seq));
+ done_seqretry(&nn->boot_lock, seq);
+}
+
+void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn)
+{
+ ktime_get_real_ts64(&nn->nfssvc_boot);
+}
+
+void nfsd_reset_boot_verifier(struct nfsd_net *nn)
+{
+ write_seqlock(&nn->boot_lock);
+ nfsd_reset_boot_verifier_locked(nn);
+ write_sequnlock(&nn->boot_lock);
+}
+
static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -596,7 +625,7 @@ int nfsd_create_serv(struct net *net)
#endif
}
atomic_inc(&nn->ntf_refcnt);
- ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */
+ nfsd_reset_boot_verifier(nn);
return 0;
}

--
2.21.0

2019-09-10 13:52:40

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v2 0/4] Handling NFSv3 I/O errors in knfsd

Looks OK to me; applying for 5.4.

Any ideas for easy ways to test this? Maybe I should look at
Documentation/fault-injection/fault-injection.txt.

--b.

On Mon, Sep 02, 2019 at 01:02:54PM -0400, Trond Myklebust wrote:
> Recently, a number of changes went into the kernel to try to ensure
> that I/O errors (specifically write errors) are reported to the
> application once and only once. The vehicle for ensuring the errors
> are reported is the struct file, which uses the 'f_wb_err' field to
> track which errors have been reported.
>
> The problem is that errors are mainly intended to be reported through
> fsync(). If the client is doing synchronous writes, then all is well,
> but if it is doing unstable writes, then the errors may not be
> reported until the client calls COMMIT. If the file cache has
> thrown out the struct file, due to memory pressure, or just because
> the client took a long while between the last WRITE and the COMMIT,
> then the error report may be lost, and the client may just think
> its data is safely stored.
>
> Note that the problem is compounded by the fact that NFSv3 is stateless,
> so the server never knows that the client may have rebooted, so there
> can be no guarantee that a COMMIT will ever be sent.
>
> The following patch set attempts to remedy the situation using 2
> strategies:
>
> 1) If the inode is dirty, then avoid garbage collecting the file
> from the file cache.
> 2) If the file is closed, and we see that it would have reported
> an error to COMMIT, then we bump the boot verifier in order to
> ensure the client retransmits all its writes.
>
> Note that if multiple clients were writing to the same file, then
> we probably want to bump the boot verifier anyway, since only one
> COMMIT will see the error report (because the cached file is also
> shared).
>
> So in order to implement the above strategy, we first have to do
> the following: split up the file cache to act per net namespace,
> since the boot verifier is per net namespace. Then add a helper
> to update the boot verifier.
>
> ---
> v2:
> - Add patch to bump the boot verifier on all write/commit errors
> - Fix initialisation of 'seq' in nfsd_copy_boot_verifier()
>
> Trond Myklebust (4):
> nfsd: nfsd_file cache entries should be per net namespace
> nfsd: Support the server resetting the boot verifier
> nfsd: Don't garbage collect files that might contain write errors
> nfsd: Reset the boot verifier on all write I/O errors
>
> fs/nfsd/export.c | 2 +-
> fs/nfsd/filecache.c | 76 +++++++++++++++++++++++++++++++++++++--------
> fs/nfsd/filecache.h | 3 +-
> fs/nfsd/netns.h | 4 +++
> fs/nfsd/nfs3xdr.c | 13 +++++---
> fs/nfsd/nfs4proc.c | 14 +++------
> fs/nfsd/nfsctl.c | 1 +
> fs/nfsd/nfssvc.c | 32 ++++++++++++++++++-
> fs/nfsd/vfs.c | 19 +++++++++---
> 9 files changed, 130 insertions(+), 34 deletions(-)
>
> --
> 2.21.0