Recently, a number of changes went into the kernel to try to ensure
that I/O errors (specifically write errors) are reported to the
application once and only once. The vehicle for ensuring the errors
are reported is the struct file, which uses the 'f_wb_err' field to
track which errors have been reported.
The problem is that errors are mainly intended to be reported through
fsync(). If the client is doing synchronous writes, then all is well,
but if it is doing unstable writes, then the errors may not be
reported until the client calls COMMIT. If the file cache has
thrown out the struct file, due to memory pressure, or just because
the client took a long while between the last WRITE and the COMMIT,
then the error report may be lost, and the client may just think
its data is safely stored.
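As an illustration of the mechanism (this sketch is not part of the
patch set, and the helper name is made up), the pending error can be
sampled from a cached struct file with the existing
filemap_check_wb_err() helper, comparing the mapping's error cursor
against the point last reported through that particular file:

#include <linux/fs.h>

/*
 * Illustrative only: return a negative errno if a writeback error
 * has occurred on this file's mapping since the last error reported
 * through this struct file (tracked in file->f_wb_err), or 0 if not.
 */
static int sample_unreported_write_error(struct file *file)
{
	return filemap_check_wb_err(file->f_mapping,
				    READ_ONCE(file->f_wb_err));
}

If the struct file is evicted from the cache before such a check runs,
the f_wb_err cursor is lost along with it, which is exactly the failure
mode described above.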
Note that the problem is compounded by the fact that NFSv3 is stateless,
so the server never knows that the client may have rebooted, so there
can be no guarantee that a COMMIT will ever be sent.
The following patch set attempts to remedy the situation using 2
strategies:
1) If the inode is dirty, then avoid garbage collecting the file
from the file cache.
2) If the file is closed, and we see that it would have reported
an error to COMMIT, then we bump the boot verifier in order to
ensure the client retransmits all its writes.
Note that if multiple clients were writing to the same file, then
we probably want to bump the boot verifier anyway, since only one
COMMIT will see the error report (because the cached file is also
shared).
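For background, the reason bumping the boot verifier forces a resend is
the standard NFSv3 write/commit verifier rule: the client compares the
verifier returned by COMMIT with the one it saw on its unstable WRITEs
and retransmits on a mismatch. A minimal, purely illustrative sketch of
that client-side check (hypothetical names, plain userspace C; the
patches below only change what the server puts in the verifier):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define NFS3_WRITEVERFSIZE 8	/* size of the NFSv3 write verifier */

/*
 * An NFSv3 client records the verifier returned with its unstable
 * WRITEs. If a later COMMIT returns a different verifier, the server
 * may have lost the data (e.g. it "rebooted"), so the client must
 * resend those writes.
 */
static bool must_resend_unstable_writes(const uint8_t seen_at_write[NFS3_WRITEVERFSIZE],
					const uint8_t from_commit[NFS3_WRITEVERFSIZE])
{
	return memcmp(seen_at_write, from_commit, NFS3_WRITEVERFSIZE) != 0;
}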
So in order to implement the above strategy, we first have to do
the following: split up the file cache to act per net namespace,
since the boot verifier is per net namespace. Then add a helper
to update the boot verifier.
Trond Myklebust (3):
nfsd: nfsd_file cache entries should be per net namespace
nfsd: Support the server resetting the boot verifier
nfsd: Don't garbage collect files that might contain write errors
fs/nfsd/export.c | 2 +-
fs/nfsd/filecache.c | 76 +++++++++++++++++++++++++++++++++++++--------
fs/nfsd/filecache.h | 3 +-
fs/nfsd/netns.h | 4 +++
fs/nfsd/nfs3xdr.c | 13 +++++---
fs/nfsd/nfs4proc.c | 14 +++------
fs/nfsd/nfsctl.c | 1 +
fs/nfsd/nfssvc.c | 32 ++++++++++++++++++-
8 files changed, 115 insertions(+), 30 deletions(-)
--
2.21.0
Ensure that we can safely clear out the file cache entries when the
nfs server is shut down on a container. Otherwise, the file cache
may end up pinning the mounts.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/export.c | 2 +-
fs/nfsd/filecache.c | 33 +++++++++++++++++++++------------
fs/nfsd/filecache.h | 3 ++-
fs/nfsd/nfssvc.c | 1 +
4 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 052fac64b578..15422c951fd1 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -240,7 +240,7 @@ static void expkey_flush(void)
* destroyed while we're in the middle of flushing.
*/
mutex_lock(&nfsd_mutex);
- nfsd_file_cache_purge();
+ nfsd_file_cache_purge(current->nsproxy->net_ns);
mutex_unlock(&nfsd_mutex);
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index a2fcb251d2f6..d229fd3c825d 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -17,6 +17,7 @@
#include "vfs.h"
#include "nfsd.h"
#include "nfsfh.h"
+#include "netns.h"
#include "filecache.h"
#include "trace.h"
@@ -168,7 +169,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
}
static struct nfsd_file *
-nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval)
+nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
+ struct net *net)
{
struct nfsd_file *nf;
@@ -178,6 +180,7 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval)
INIT_LIST_HEAD(&nf->nf_lru);
nf->nf_file = NULL;
nf->nf_cred = get_current_cred();
+ nf->nf_net = net;
nf->nf_flags = 0;
nf->nf_inode = inode;
nf->nf_hashval = hashval;
@@ -608,10 +611,11 @@ nfsd_file_cache_init(void)
* Note this can deadlock with nfsd_file_lru_cb.
*/
void
-nfsd_file_cache_purge(void)
+nfsd_file_cache_purge(struct net *net)
{
unsigned int i;
struct nfsd_file *nf;
+ struct hlist_node *next;
LIST_HEAD(dispose);
bool del;
@@ -619,10 +623,12 @@ nfsd_file_cache_purge(void)
return;
for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- spin_lock(&nfsd_file_hashtbl[i].nfb_lock);
- while(!hlist_empty(&nfsd_file_hashtbl[i].nfb_head)) {
- nf = hlist_entry(nfsd_file_hashtbl[i].nfb_head.first,
- struct nfsd_file, nf_node);
+ struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+
+ spin_lock(&nfb->nfb_lock);
+ hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+ if (net && nf->nf_net != net)
+ continue;
del = nfsd_file_unhash_and_release_locked(nf, &dispose);
/*
@@ -631,7 +637,7 @@ nfsd_file_cache_purge(void)
*/
WARN_ON_ONCE(!del);
}
- spin_unlock(&nfsd_file_hashtbl[i].nfb_lock);
+ spin_unlock(&nfb->nfb_lock);
nfsd_file_dispose_list(&dispose);
}
}
@@ -650,7 +656,7 @@ nfsd_file_cache_shutdown(void)
* calling nfsd_file_cache_purge
*/
cancel_delayed_work_sync(&nfsd_filecache_laundrette);
- nfsd_file_cache_purge();
+ nfsd_file_cache_purge(NULL);
list_lru_destroy(&nfsd_file_lru);
rcu_barrier();
fsnotify_put_group(nfsd_file_fsnotify_group);
@@ -686,7 +692,7 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2)
static struct nfsd_file *
nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
- unsigned int hashval)
+ unsigned int hashval, struct net *net)
{
struct nfsd_file *nf;
unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
@@ -697,6 +703,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
continue;
if (nf->nf_inode != inode)
continue;
+ if (nf->nf_net != net)
+ continue;
if (!nfsd_match_cred(nf->nf_cred, current_cred()))
continue;
if (nfsd_file_get(nf) != NULL)
@@ -739,6 +747,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
__be32 status;
+ struct net *net = SVC_NET(rqstp);
struct nfsd_file *nf, *new;
struct inode *inode;
unsigned int hashval;
@@ -753,12 +762,12 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
retry:
rcu_read_lock();
- nf = nfsd_file_find_locked(inode, may_flags, hashval);
+ nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
rcu_read_unlock();
if (nf)
goto wait_for_construction;
- new = nfsd_file_alloc(inode, may_flags, hashval);
+ new = nfsd_file_alloc(inode, may_flags, hashval, net);
if (!new) {
trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
NULL, nfserr_jukebox);
@@ -766,7 +775,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- nf = nfsd_file_find_locked(inode, may_flags, hashval);
+ nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
if (nf == NULL)
goto open_file;
spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 0c0c67166b87..851d9abf54c2 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -34,6 +34,7 @@ struct nfsd_file {
struct rcu_head nf_rcu;
struct file *nf_file;
const struct cred *nf_cred;
+ struct net *nf_net;
#define NFSD_FILE_HASHED (0)
#define NFSD_FILE_PENDING (1)
#define NFSD_FILE_BREAK_READ (2)
@@ -48,7 +49,7 @@ struct nfsd_file {
};
int nfsd_file_cache_init(void);
-void nfsd_file_cache_purge(void);
+void nfsd_file_cache_purge(struct net *);
void nfsd_file_cache_shutdown(void);
void nfsd_file_put(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d02712ca2685..b944553c6927 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -387,6 +387,7 @@ static void nfsd_shutdown_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ nfsd_file_cache_purge(net);
nfs4_state_shutdown_net(net);
if (nn->lockd_up) {
lockd_down(net);
--
2.21.0
Add support to allow the server to reset the boot verifier in order to
force clients to resend I/O after a timeout failure.
Signed-off-by: Trond Myklebust <[email protected]>
Signed-off-by: Lance Shelton <[email protected]>
---
fs/nfsd/netns.h | 4 ++++
fs/nfsd/nfs3xdr.c | 13 +++++++++----
fs/nfsd/nfs4proc.c | 14 ++++----------
fs/nfsd/nfsctl.c | 1 +
fs/nfsd/nfssvc.c | 31 ++++++++++++++++++++++++++++++-
5 files changed, 48 insertions(+), 15 deletions(-)
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index bdfe5bcb3dcd..9a4ef815fb8c 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -104,6 +104,7 @@ struct nfsd_net {
/* Time of server startup */
struct timespec64 nfssvc_boot;
+ seqlock_t boot_lock;
/*
* Max number of connections this nfsd container will allow. Defaults
@@ -179,4 +180,7 @@ struct nfsd_net {
extern void nfsd_netns_free_versions(struct nfsd_net *nn);
extern unsigned int nfsd_net_id;
+
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn);
+void nfsd_reset_boot_verifier(struct nfsd_net *nn);
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index fcf31822c74c..86e5658651f1 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -27,6 +27,7 @@ static u32 nfs3_ftypes[] = {
NF3SOCK, NF3BAD, NF3LNK, NF3BAD,
};
+
/*
* XDR functions for basic NFS types
*/
@@ -751,14 +752,16 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_writeres *resp = rqstp->rq_resp;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 verf[2];
p = encode_wcc_data(rqstp, p, &resp->fh);
if (resp->status == 0) {
*p++ = htonl(resp->count);
*p++ = htonl(resp->committed);
/* unique identifier, y2038 overflow can be ignored */
- *p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
- *p++ = htonl(nn->nfssvc_boot.tv_nsec);
+ nfsd_copy_boot_verifier(verf, nn);
+ *p++ = verf[0];
+ *p++ = verf[1];
}
return xdr_ressize_check(rqstp, p);
}
@@ -1125,13 +1128,15 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_commitres *resp = rqstp->rq_resp;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 verf[2];
p = encode_wcc_data(rqstp, p, &resp->fh);
/* Write verifier */
if (resp->status == 0) {
/* unique identifier, y2038 overflow can be ignored */
- *p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
- *p++ = htonl(nn->nfssvc_boot.tv_nsec);
+ nfsd_copy_boot_verifier(verf, nn);
+ *p++ = verf[0];
+ *p++ = verf[1];
}
return xdr_ressize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index cb51893ec1cd..4e3e77b76411 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -568,17 +568,11 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
{
- __be32 verf[2];
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ __be32 *verf = (__be32 *)verifier->data;
- /*
- * This is opaque to client, so no need to byte-swap. Use
- * __force to keep sparse happy. y2038 time_t overflow is
- * irrelevant in this usage.
- */
- verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
- verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
- memcpy(verifier->data, verf, sizeof(verifier->data));
+ BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
+
+ nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id));
}
static __be32
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 13c548733860..c3ac1e000c4a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1478,6 +1478,7 @@ static __net_init int nfsd_init_net(struct net *net)
atomic_set(&nn->ntf_refcnt, 0);
init_waitqueue_head(&nn->ntf_wq);
+ seqlock_init(&nn->boot_lock);
mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
if (IS_ERR(mnt)) {
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index b944553c6927..a597fc34bc40 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -344,6 +344,35 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn)
return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST);
}
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
+{
+ int seq;
+
+ do {
+ read_seqbegin_or_lock(&nn->boot_lock, &seq);
+ /*
+ * This is opaque to client, so no need to byte-swap. Use
+ * __force to keep sparse happy. y2038 time_t overflow is
+ * irrelevant in this usage
+ */
+ verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+ verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
+ } while (need_seqretry(&nn->boot_lock, seq));
+ done_seqretry(&nn->boot_lock, seq);
+}
+
+void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn)
+{
+ ktime_get_real_ts64(&nn->nfssvc_boot);
+}
+
+void nfsd_reset_boot_verifier(struct nfsd_net *nn)
+{
+ write_seqlock(&nn->boot_lock);
+ nfsd_reset_boot_verifier_locked(nn);
+ write_sequnlock(&nn->boot_lock);
+}
+
static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -596,7 +625,7 @@ int nfsd_create_serv(struct net *net)
#endif
}
atomic_inc(&nn->ntf_refcnt);
- ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */
+ nfsd_reset_boot_verifier(nn);
return 0;
}
--
2.21.0
If a file may contain unstable writes that can error out, then we want
to avoid garbage collecting the struct nfsd_file that may be
tracking those errors.
So in the garbage collector, we try to avoid collecting files that aren't
clean. Furthermore, if a write error is being tracked on the file, we
avoid immediately kicking off the garbage collector when the reference
count drops to zero.
If the file is unhashed while an error is pending, then declare a
reboot, to ensure the client resends any unstable writes.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfsd/filecache.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 42 insertions(+), 1 deletion(-)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index d229fd3c825d..92d0998824a0 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -216,6 +216,36 @@ nfsd_file_free(struct nfsd_file *nf)
return flush;
}
+static bool
+nfsd_file_check_writeback(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+ struct address_space *mapping;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return false;
+ mapping = file->f_mapping;
+ return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
+ mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
+}
+
+static int
+nfsd_file_check_write_error(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return 0;
+ return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
+}
+
+static bool
+nfsd_file_in_use(struct nfsd_file *nf)
+{
+ return nfsd_file_check_writeback(nf) ||
+ nfsd_file_check_write_error(nf);
+}
+
static void
nfsd_file_do_unhash(struct nfsd_file *nf)
{
@@ -223,6 +253,8 @@ nfsd_file_do_unhash(struct nfsd_file *nf)
trace_nfsd_file_unhash(nf);
+ if (nfsd_file_check_write_error(nf))
+ nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
--nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
hlist_del_rcu(&nf->nf_node);
if (!list_empty(&nf->nf_lru))
@@ -277,9 +309,10 @@ void
nfsd_file_put(struct nfsd_file *nf)
{
bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
+ bool unused = !nfsd_file_in_use(nf);
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (nfsd_file_put_noref(nf) == 1 && is_hashed)
+ if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused)
nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH);
}
@@ -345,6 +378,14 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
*/
if (atomic_read(&nf->nf_ref) > 1)
goto out_skip;
+
+ /*
+ * Don't throw out files that are still undergoing I/O or
+ * that have uncleared errors pending.
+ */
+ if (nfsd_file_check_writeback(nf))
+ goto out_skip;
+
if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
goto out_rescan;
--
2.21.0
On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust wrote:
> Recently, a number of changes went into the kernel to try to ensure
> that I/O errors (specifically write errors) are reported to the
> application once and only once. The vehicle for ensuring the errors
> are reported is the struct file, which uses the 'f_wb_err' field to
> track which errors have been reported.
>
> The problem is that errors are mainly intended to be reported through
> fsync(). If the client is doing synchronous writes, then all is well,
> but if it is doing unstable writes, then the errors may not be
> reported until the client calls COMMIT. If the file cache has
> thrown out the struct file, due to memory pressure, or just because
> the client took a long while between the last WRITE and the COMMIT,
> then the error report may be lost, and the client may just think
> its data is safely stored.
These were lost before the file caching patches as well, right? Or is
there some regression?
> Note that the problem is compounded by the fact that NFSv3 is stateless,
> so the server never knows that the client may have rebooted, so there
> can be no guarantee that a COMMIT will ever be sent.
>
> The following patch set attempts to remedy the situation using 2
> strategies:
>
> 1) If the inode is dirty, then avoid garbage collecting the file
> from the file cache.
> 2) If the file is closed, and we see that it would have reported
> an error to COMMIT, then we bump the boot verifier in order to
> ensure the client retransmits all its writes.
Sounds sensible to me.
> Note that if multiple clients were writing to the same file, then
> we probably want to bump the boot verifier anyway, since only one
> COMMIT will see the error report (because the cached file is also
> shared).
I'm confused by the "probably should". So that's future work? I guess
it'd mean some additional work to identify that case. You can't really
even distinguish clients in the NFSv3 case, but I suppose you could use
IP address or TCP connection as an approximation.
--b.
> So in order to implement the above strategy, we first have to do
> the following: split up the file cache to act per net namespace,
> since the boot verifier is per net namespace. Then add a helper
> to update the boot verifier.
>
> Trond Myklebust (3):
> nfsd: nfsd_file cache entries should be per net namespace
> nfsd: Support the server resetting the boot verifier
> nfsd: Don't garbage collect files that might contain write errors
>
> fs/nfsd/export.c | 2 +-
> fs/nfsd/filecache.c | 76 +++++++++++++++++++++++++++++++++++++--------
> fs/nfsd/filecache.h | 3 +-
> fs/nfsd/netns.h | 4 +++
> fs/nfsd/nfs3xdr.c | 13 +++++---
> fs/nfsd/nfs4proc.c | 14 +++------
> fs/nfsd/nfsctl.c | 1 +
> fs/nfsd/nfssvc.c | 32 ++++++++++++++++++-
> 8 files changed, 115 insertions(+), 30 deletions(-)
>
> --
> 2.21.0
On Mon, 2019-08-26 at 16:51 -0400, J. Bruce Fields wrote:
> On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust wrote:
> > Recently, a number of changes went into the kernel to try to ensure
> > that I/O errors (specifically write errors) are reported to the
> > application once and only once. The vehicle for ensuring the errors
> > are reported is the struct file, which uses the 'f_wb_err' field to
> > track which errors have been reported.
> >
> > The problem is that errors are mainly intended to be reported
> > through
> > fsync(). If the client is doing synchronous writes, then all is
> > well,
> > but if it is doing unstable writes, then the errors may not be
> > reported until the client calls COMMIT. If the file cache has
> > thrown out the struct file, due to memory pressure, or just because
> > the client took a long while between the last WRITE and the COMMIT,
> > then the error report may be lost, and the client may just think
> > its data is safely stored.
>
> These were lost before the file caching patches as well, right? Or
> is
> there some regression?
Correct. This is not a regression, but an attempt to fix a problem that
has existed for some time now.
>
> > Note that the problem is compounded by the fact that NFSv3 is
> > stateless,
> > so the server never knows that the client may have rebooted, so
> > there
> > can be no guarantee that a COMMIT will ever be sent.
> >
> > The following patch set attempts to remedy the situation using 2
> > strategies:
> >
> > 1) If the inode is dirty, then avoid garbage collecting the file
> > from the file cache.
> > 2) If the file is closed, and we see that it would have reported
> > an error to COMMIT, then we bump the boot verifier in order to
> > ensure the client retransmits all its writes.
>
> Sounds sensible to me.
>
> > Note that if multiple clients were writing to the same file, then
> > we probably want to bump the boot verifier anyway, since only one
> > COMMIT will see the error report (because the cached file is also
> > shared).
>
> I'm confused by the "probably should". So that's future work? I
> guess
> it'd mean some additional work to identify that case. You can't
> really
> even distinguish clients in the NFSv3 case, but I suppose you could
> use
> IP address or TCP connection as an approximation.
I'm suggesting we should do this too, but I haven't done so yet in
these patches. I'd like to hear other opinions (particularly from you,
Chuck and Jeff).
> --b.
>
> > So in order to implement the above strategy, we first have to do
> > the following: split up the file cache to act per net namespace,
> > since the boot verifier is per net namespace. Then add a helper
> > to update the boot verifier.
> >
> > Trond Myklebust (3):
> > nfsd: nfsd_file cache entries should be per net namespace
> > nfsd: Support the server resetting the boot verifier
> > nfsd: Don't garbage collect files that might contain write errors
> >
> > fs/nfsd/export.c | 2 +-
> > fs/nfsd/filecache.c | 76 +++++++++++++++++++++++++++++++++++++--
> > ------
> > fs/nfsd/filecache.h | 3 +-
> > fs/nfsd/netns.h | 4 +++
> > fs/nfsd/nfs3xdr.c | 13 +++++---
> > fs/nfsd/nfs4proc.c | 14 +++------
> > fs/nfsd/nfsctl.c | 1 +
> > fs/nfsd/nfssvc.c | 32 ++++++++++++++++++-
> > 8 files changed, 115 insertions(+), 30 deletions(-)
> >
> > --
> > 2.21.0
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Mon, Aug 26, 2019 at 09:02:31PM +0000, Trond Myklebust wrote:
> On Mon, 2019-08-26 at 16:51 -0400, J. Bruce Fields wrote:
> > On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust wrote:
> > > Recently, a number of changes went into the kernel to try to
> > > ensure that I/O errors (specifically write errors) are reported to
> > > the application once and only once. The vehicle for ensuring the
> > > errors are reported is the struct file, which uses the 'f_wb_err'
> > > field to track which errors have been reported.
> > >
> > > The problem is that errors are mainly intended to be reported
> > > through fsync(). If the client is doing synchronous writes, then
> > > all is well, but if it is doing unstable writes, then the errors
> > > may not be reported until the client calls COMMIT. If the file
> > > cache has thrown out the struct file, due to memory pressure, or
> > > just because the client took a long while between the last WRITE
> > > and the COMMIT, then the error report may be lost, and the client
> > > may just think its data is safely stored.
> >
> > These were lost before the file caching patches as well, right? Or
> > is there some regression?
>
> Correct. This is not a regression, but an attempt to fix a problem
> that has existed for some time now.
>
> >
> > > Note that the problem is compounded by the fact that NFSv3 is
> > > stateless, so the server never knows that the client may have
> > > rebooted, so there can be no guarantee that a COMMIT will ever be
> > > sent.
> > >
> > > The following patch set attempts to remedy the situation using 2
> > > strategies:
> > >
> > > 1) If the inode is dirty, then avoid garbage collecting the file
> > > from the file cache. 2) If the file is closed, and we see that it
> > > would have reported an error to COMMIT, then we bump the boot
> > > verifier in order to ensure the client retransmits all its writes.
> >
> > Sounds sensible to me.
> >
> > > Note that if multiple clients were writing to the same file, then
> > > we probably want to bump the boot verifier anyway, since only one
> > > COMMIT will see the error report (because the cached file is also
> > > shared).
> >
> > I'm confused by the "probably should". So that's future work? I
> > guess it'd mean some additional work to identify that case. You
> > can't really even distinguish clients in the NFSv3 case, but I
> > suppose you could use IP address or TCP connection as an
> > approximation.
>
> I'm suggesting we should do this too, but I haven't done so yet in
> these patches. I'd like to hear other opinions (particularly from you,
> Chuck and Jeff).
Does this process actually converge, or do we end up with all the
clients retrying the writes and, again, only one of them getting the
error?
I wonder what the typical errors are, anyway.
--b.
On Mon, 2019-08-26 at 20:48 -0400, [email protected] wrote:
> On Mon, Aug 26, 2019 at 09:02:31PM +0000, Trond Myklebust wrote:
> > On Mon, 2019-08-26 at 16:51 -0400, J. Bruce Fields wrote:
> > > On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust wrote:
> > > > Recently, a number of changes went into the kernel to try to
> > > > ensure that I/O errors (specifically write errors) are reported
> > > > to
> > > > the application once and only once. The vehicle for ensuring
> > > > the
> > > > errors are reported is the struct file, which uses the
> > > > 'f_wb_err'
> > > > field to track which errors have been reported.
> > > >
> > > > The problem is that errors are mainly intended to be reported
> > > > through fsync(). If the client is doing synchronous writes,
> > > > then
> > > > all is well, but if it is doing unstable writes, then the
> > > > errors
> > > > may not be reported until the client calls COMMIT. If the file
> > > > cache has thrown out the struct file, due to memory pressure,
> > > > or
> > > > just because the client took a long while between the last
> > > > WRITE
> > > > and the COMMIT, then the error report may be lost, and the
> > > > client
> > > > may just think its data is safely stored.
> > >
> > > These were lost before the file caching patches as well,
> > > right? Or
> > > is there some regression?
> >
> > Correct. This is not a regression, but an attempt to fix a problem
> > that has existed for some time now.
> >
> > > > Note that the problem is compounded by the fact that NFSv3 is
> > > > stateless, so the server never knows that the client may have
> > > > rebooted, so there can be no guarantee that a COMMIT will ever
> > > > be
> > > > sent.
> > > >
> > > > The following patch set attempts to remedy the situation using
> > > > 2
> > > > strategies:
> > > >
> > > > 1) If the inode is dirty, then avoid garbage collecting the
> > > > file
> > > > from the file cache. 2) If the file is closed, and we see that
> > > > it
> > > > would have reported an error to COMMIT, then we bump the boot
> > > > verifier in order to ensure the client retransmits all its
> > > > writes.
> > >
> > > Sounds sensible to me.
> > >
> > > > Note that if multiple clients were writing to the same file,
> > > > then
> > > > we probably want to bump the boot verifier anyway, since only
> > > > one
> > > > COMMIT will see the error report (because the cached file is
> > > > also
> > > > shared).
> > >
> > > I'm confused by the "probably should". So that's future work? I
> > > guess it'd mean some additional work to identify that case. You
> > > can't really even distinguish clients in the NFSv3 case, but I
> > > suppose you could use IP address or TCP connection as an
> > > approximation.
> >
> > I'm suggesting we should do this too, but I haven't done so yet in
> > these patches. I'd like to hear other opinions (particularly from
> > you,
> > Chuck and Jeff).
>
> Does this process actually converge, or do we end up with all the
> clients retrying the writes and, again, only one of them getting the
> error?
The client that gets the error should stop retrying if the error is
fatal.
> I wonder what the typical errors are, anyway.
I would expect ENOSPC, and EIO to be the most common. The former if
delayed allocation and/or snapshots result in writes failing after
writing to the page cache. The latter if we hit a disk outage or other
such problem.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Tue, Aug 27, 2019 at 12:56:07AM +0000, Trond Myklebust wrote:
> On Mon, 2019-08-26 at 20:48 -0400, [email protected] wrote:
> > On Mon, Aug 26, 2019 at 09:02:31PM +0000, Trond Myklebust wrote:
> > > On Mon, 2019-08-26 at 16:51 -0400, J. Bruce Fields wrote:
> > > > On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust wrote:
> > > > > Note that if multiple clients were writing to the same file,
> > > > > then we probably want to bump the boot verifier anyway, since
> > > > > only one COMMIT will see the error report (because the cached
> > > > > file is also shared).
> > > >
> > > > I'm confused by the "probably should". So that's future work?
> > > > I guess it'd mean some additional work to identify that case.
> > > > You can't really even distinguish clients in the NFSv3 case, but
> > > > I suppose you could use IP address or TCP connection as an
> > > > approximation.
> > >
> > > I'm suggesting we should do this too, but I haven't done so yet in
> > > these patches. I'd like to hear other opinions (particularly from
> > > you, Chuck and Jeff).
> >
> > Does this process actually converge, or do we end up with all the
> > clients retrying the writes and, again, only one of them getting the
> > error?
>
> The client that gets the error should stop retrying if the error is
> fatal.
Have clients historically been good about that? I just wonder whether
it's a concern that boot-verifier-bumping could magnify the impact of
clients that are overly persistent about retrying IO errors.
> > I wonder what the typical errors are, anyway.
>
> I would expect ENOSPC, and EIO to be the most common. The former if
> delayed allocation and/or snapshots result in writes failing after
> writing to the page cache. The latter if we hit a disk outage or other
> such problem.
Makes sense.
--b.
On Mon, 2019-08-26 at 21:13 -0400, [email protected] wrote:
> On Tue, Aug 27, 2019 at 12:56:07AM +0000, Trond Myklebust wrote:
> > On Mon, 2019-08-26 at 20:48 -0400, [email protected] wrote:
> > > On Mon, Aug 26, 2019 at 09:02:31PM +0000, Trond Myklebust wrote:
> > > > On Mon, 2019-08-26 at 16:51 -0400, J. Bruce Fields wrote:
> > > > > On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust
> > > > > wrote:
> > > > > > Note that if multiple clients were writing to the same
> > > > > > file,
> > > > > > then we probably want to bump the boot verifier anyway,
> > > > > > since
> > > > > > only one COMMIT will see the error report (because the
> > > > > > cached
> > > > > > file is also shared).
> > > > >
> > > > > I'm confused by the "probably should". So that's future
> > > > > work?
> > > > > I guess it'd mean some additional work to identify that case.
> > > > > You can't really even distinguish clients in the NFSv3 case,
> > > > > but
> > > > > I suppose you could use IP address or TCP connection as an
> > > > > approximation.
> > > >
> > > > I'm suggesting we should do this too, but I haven't done so yet
> > > > in
> > > > these patches. I'd like to hear other opinions (particularly
> > > > from
> > > > you, Chuck and Jeff).
> > >
> > > Does this process actually converge, or do we end up with all the
> > > clients retrying the writes and, again, only one of them getting
> > > the
> > > error?
> >
> > The client that gets the error should stop retrying if the error is
> > fatal.
>
> Have clients historically been good about that? I just wonder
> whether
> it's a concern that boot-verifier-bumping could magnify the impact of
> clients that are overly persistent about retrying IO errors.
>
Clients have always been required to handle I/O errors, yes, and this
isn't just a Linux server thing. All other servers that support
unstable writes impose the same requirement on the client to check the
return value of COMMIT and to handle any errors.
> > > I wonder what the typical errors are, anyway.
> >
> > I would expect ENOSPC, and EIO to be the most common. The former if
> > delayed allocation and/or snapshots result in writes failing after
> > writing to the page cache. The latter if we hit a disk outage or
> > other
> > such problem.
>
> Makes sense.
>
> --b.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
Hi Trond,
I love your patch! Perhaps something to improve:
[auto build test WARNING on nfsd/nfsd-next]
[cannot apply to v5.3-rc6 next-20190826]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Trond-Myklebust/Handling-NFSv3-I-O-errors-in-knfsd/20190827-144324
base: git://linux-nfs.org/~bfields/linux.git nfsd-next
config: sparc64-allmodconfig (attached as .config)
compiler: sparc64-linux-gcc (GCC) 7.4.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.4.0 make.cross ARCH=sparc64
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
In file included from include/linux/time.h:6:0,
from include/linux/ktime.h:24,
from include/linux/timer.h:6,
from include/linux/workqueue.h:9,
from include/linux/rhashtable-types.h:15,
from include/linux/ipc.h:7,
from include/uapi/linux/sem.h:5,
from include/linux/sem.h:5,
from include/linux/sched.h:15,
from include/linux/sched/signal.h:7,
from fs/nfsd/nfssvc.c:10:
fs/nfsd/nfssvc.c: In function 'nfsd_copy_boot_verifier':
>> include/linux/seqlock.h:528:13: warning: 'seq' may be used uninitialized in this function [-Wmaybe-uninitialized]
if (!(*seq & 1)) /* Even */
~~~~~~^~~~
fs/nfsd/nfssvc.c:349:6: note: 'seq' was declared here
int seq;
^~~
--
In file included from include/linux/time.h:6:0,
from include/linux/ktime.h:24,
from include/linux/timer.h:6,
from include/linux/workqueue.h:9,
from include/linux/rhashtable-types.h:15,
from include/linux/ipc.h:7,
from include/uapi/linux/sem.h:5,
from include/linux/sem.h:5,
from include/linux/sched.h:15,
from include/linux/sched/signal.h:7,
from fs//nfsd/nfssvc.c:10:
fs//nfsd/nfssvc.c: In function 'nfsd_copy_boot_verifier':
>> include/linux/seqlock.h:528:13: warning: 'seq' may be used uninitialized in this function [-Wmaybe-uninitialized]
if (!(*seq & 1)) /* Even */
~~~~~~^~~~
fs//nfsd/nfssvc.c:349:6: note: 'seq' was declared here
int seq;
^~~
vim +/seq +528 include/linux/seqlock.h
1370e97bb2eb1e Waiman Long 2013-09-12 515
2bc74feba12fbf Al Viro 2013-10-25 516 /**
2bc74feba12fbf Al Viro 2013-10-25 517 * read_seqbegin_or_lock - begin a sequence number check or locking block
2bc74feba12fbf Al Viro 2013-10-25 518 * @lock: sequence lock
2bc74feba12fbf Al Viro 2013-10-25 519 * @seq : sequence number to be checked
2bc74feba12fbf Al Viro 2013-10-25 520 *
2bc74feba12fbf Al Viro 2013-10-25 521 * First try it once optimistically without taking the lock. If that fails,
2bc74feba12fbf Al Viro 2013-10-25 522 * take the lock. The sequence number is also used as a marker for deciding
2bc74feba12fbf Al Viro 2013-10-25 523 * whether to be a reader (even) or writer (odd).
2bc74feba12fbf Al Viro 2013-10-25 524 * N.B. seq must be initialized to an even number to begin with.
2bc74feba12fbf Al Viro 2013-10-25 525 */
2bc74feba12fbf Al Viro 2013-10-25 526 static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
2bc74feba12fbf Al Viro 2013-10-25 527 {
2bc74feba12fbf Al Viro 2013-10-25 @528 if (!(*seq & 1)) /* Even */
2bc74feba12fbf Al Viro 2013-10-25 529 *seq = read_seqbegin(lock);
2bc74feba12fbf Al Viro 2013-10-25 530 else /* Odd */
2bc74feba12fbf Al Viro 2013-10-25 531 read_seqlock_excl(lock);
2bc74feba12fbf Al Viro 2013-10-25 532 }
2bc74feba12fbf Al Viro 2013-10-25 533
:::::: The code at line 528 was first introduced by commit
:::::: 2bc74feba12fbf052ec97aee8624c9f13593a9ac take read_seqbegin_or_lock() and friends to seqlock.h
:::::: TO: Al Viro <[email protected]>
:::::: CC: Al Viro <[email protected]>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
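One possible way to address the warning is to initialize the sequence
cookie to an even value, as the read_seqbegin_or_lock() kerneldoc quoted
above requires. The sketch below keeps the rest of the function from the
patch unchanged and is offered only as an example, not as the fix that
was ultimately applied:

void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
{
	int seq = 0;	/* must start even for the lockless first pass */

	do {
		read_seqbegin_or_lock(&nn->boot_lock, &seq);
		/* opaque to the client, so no byte-swapping needed */
		verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
		verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
	} while (need_seqretry(&nn->boot_lock, seq));
	done_seqretry(&nn->boot_lock, seq);
}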
> On Aug 26, 2019, at 5:02 PM, Trond Myklebust <[email protected]> wrote:
>
> On Mon, 2019-08-26 at 16:51 -0400, J. Bruce Fields wrote:
>> On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust wrote:
>>> Recently, a number of changes went into the kernel to try to ensure
>>> that I/O errors (specifically write errors) are reported to the
>>> application once and only once. The vehicle for ensuring the errors
>>> are reported is the struct file, which uses the 'f_wb_err' field to
>>> track which errors have been reported.
>>>
>>> The problem is that errors are mainly intended to be reported
>>> through
>>> fsync(). If the client is doing synchronous writes, then all is
>>> well,
>>> but if it is doing unstable writes, then the errors may not be
>>> reported until the client calls COMMIT. If the file cache has
>>> thrown out the struct file, due to memory pressure, or just because
>>> the client took a long while between the last WRITE and the COMMIT,
>>> then the error report may be lost, and the client may just think
>>> its data is safely stored.
>>
>> These were lost before the file caching patches as well, right? Or
>> is
>> there some regression?
>
> Correct. This is not a regression, but an attempt to fix a problem that
> has existed for some time now.
>
>>
>>> Note that the problem is compounded by the fact that NFSv3 is
>>> stateless,
>>> so the server never knows that the client may have rebooted, so
>>> there
>>> can be no guarantee that a COMMIT will ever be sent.
>>>
>>> The following patch set attempts to remedy the situation using 2
>>> strategies:
>>>
>>> 1) If the inode is dirty, then avoid garbage collecting the file
>>> from the file cache.
>>> 2) If the file is closed, and we see that it would have reported
>>> an error to COMMIT, then we bump the boot verifier in order to
>>> ensure the client retransmits all its writes.
>>
>> Sounds sensible to me.
>>
>>> Note that if multiple clients were writing to the same file, then
>>> we probably want to bump the boot verifier anyway, since only one
>>> COMMIT will see the error report (because the cached file is also
>>> shared).
>>
>> I'm confused by the "probably should". So that's future work? I
>> guess
>> it'd mean some additional work to identify that case. You can't
>> really
>> even distinguish clients in the NFSv3 case, but I suppose you could
>> use
>> IP address or TCP connection as an approximation.
>
> I'm suggesting we should do this too, but I haven't done so yet in
> these patches. I'd like to hear other opinions (particularly from you,
> Chuck and Jeff).
The strategy of handling these errors more carefully seems good.
Bumping the write/commit verifier so the client writes again to
retrieve the latent error is clever!
It's not clear to me though that the NFSv3 protocol can deal with
the multi-client write scenario, since it is stateless. We are now
making it stateful in some sense by preserving error state on the
server across NFS requests, without having any sense of an open
file in the protocol itself.
Would an "approximation" without open state be good enough? I
assume you are doing this to more fully support the FlexFiles
layout type. Do you have any analysis or thought about this next
step?
I also echo Bruce's concern about whether the client implementations
are up to snuff. There could be long-standing bugs or their protocol
implementation could be missing parts. This is more curiosity than
an objection, but maybe noting which client implementations you've
tested with would be good.
>> --b.
>>
>>> So in order to implement the above strategy, we first have to do
>>> the following: split up the file cache to act per net namespace,
>>> since the boot verifier is per net namespace. Then add a helper
>>> to update the boot verifier.
>>>
>>> Trond Myklebust (3):
>>> nfsd: nfsd_file cache entries should be per net namespace
>>> nfsd: Support the server resetting the boot verifier
>>> nfsd: Don't garbage collect files that might contain write errors
>>>
>>> fs/nfsd/export.c | 2 +-
>>> fs/nfsd/filecache.c | 76 +++++++++++++++++++++++++++++++++++++--
>>> ------
>>> fs/nfsd/filecache.h | 3 +-
>>> fs/nfsd/netns.h | 4 +++
>>> fs/nfsd/nfs3xdr.c | 13 +++++---
>>> fs/nfsd/nfs4proc.c | 14 +++------
>>> fs/nfsd/nfsctl.c | 1 +
>>> fs/nfsd/nfssvc.c | 32 ++++++++++++++++++-
>>> 8 files changed, 115 insertions(+), 30 deletions(-)
>>>
>>> --
>>> 2.21.0
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> [email protected]
--
Chuck Lever
On Tue, 2019-08-27 at 09:59 -0400, Chuck Lever wrote:
> > On Aug 26, 2019, at 5:02 PM, Trond Myklebust <
> > [email protected]> wrote:
> >
> > On Mon, 2019-08-26 at 16:51 -0400, J. Bruce Fields wrote:
> > > On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust wrote:
> > > > Recently, a number of changes went into the kernel to try to
> > > > ensure
> > > > that I/O errors (specifically write errors) are reported to the
> > > > application once and only once. The vehicle for ensuring the
> > > > errors
> > > > are reported is the struct file, which uses the 'f_wb_err'
> > > > field to
> > > > track which errors have been reported.
> > > >
> > > > The problem is that errors are mainly intended to be reported
> > > > through
> > > > fsync(). If the client is doing synchronous writes, then all is
> > > > well,
> > > > but if it is doing unstable writes, then the errors may not be
> > > > reported until the client calls COMMIT. If the file cache has
> > > > thrown out the struct file, due to memory pressure, or just
> > > > because
> > > > the client took a long while between the last WRITE and the
> > > > COMMIT,
> > > > then the error report may be lost, and the client may just
> > > > think
> > > > its data is safely stored.
> > >
> > > These were lost before the file caching patches as well,
> > > right? Or
> > > is
> > > there some regression?
> >
> > Correct. This is not a regression, but an attempt to fix a problem
> > that
> > has existed for some time now.
> >
> > > > Note that the problem is compounded by the fact that NFSv3 is
> > > > stateless,
> > > > so the server never knows that the client may have rebooted, so
> > > > there
> > > > can be no guarantee that a COMMIT will ever be sent.
> > > >
> > > > The following patch set attempts to remedy the situation using
> > > > 2
> > > > strategies:
> > > >
> > > > 1) If the inode is dirty, then avoid garbage collecting the
> > > > file
> > > > from the file cache.
> > > > 2) If the file is closed, and we see that it would have
> > > > reported
> > > > an error to COMMIT, then we bump the boot verifier in order
> > > > to
> > > > ensure the client retransmits all its writes.
> > >
> > > Sounds sensible to me.
> > >
> > > > Note that if multiple clients were writing to the same file,
> > > > then
> > > > we probably want to bump the boot verifier anyway, since only
> > > > one
> > > > COMMIT will see the error report (because the cached file is
> > > > also
> > > > shared).
> > >
> > > I'm confused by the "probably should". So that's future work? I
> > > guess
> > > it'd mean some additional work to identify that case. You can't
> > > really
> > > even distinguish clients in the NFSv3 case, but I suppose you
> > > could
> > > use
> > > IP address or TCP connection as an approximation.
> >
> > I'm suggesting we should do this too, but I haven't done so yet in
> > these patches. I'd like to hear other opinions (particularly from
> > you,
> > Chuck and Jeff).
>
> The strategy of handling these errors more carefully seems good.
> Bumping the write/commit verifier so the client writes again to
> retrieve the latent error is clever!
>
> It's not clear to me though that the NFSv3 protocol can deal with
> the multi-client write scenario, since it is stateless. We are now
> making it stateful in some sense by preserving error state on the
> server across NFS requests, without having any sense of an open
> file in the protocol itself.
>
> Would an "approximation" without open state be good enough? I
> assume you are doing this to more fully support the FlexFiles
> layout type. Do you have any analysis or thought about this next
> step?
No, see above. This has nothing to do with the flex files layout type.
It is about ensuring the integrity of _all_ unstable writes. Any client
can hit writeback errors that only manifest themselves when the data is
flushed to disk.
> I also echo Bruce's concern about whether the client implementations
> are up to snuff. There could be long-standing bugs or their protocol
> implementation could be missing parts. This is more curiosity than
> an objection, but maybe noting which client implementations you've
> tested with would be good.
So what exactly is the concern? How do you and Bruce expect this might
make things worse?
As far as I can see, if a client doesn't handle errors in COMMIT, then
it is screwed either way, since we can already end up reporting such
errors today through the fsync() mechanism. If the client ignores the
error, it ends up silently corrupting the file. If it retries the
writes as unstable, then it loops until the error is cleared. The
current patchset does not affect either case for this single client:
the declaration of a reboot just ensures that silent corruption does
not occur, and if the client handles the error correctly once it
catches it, then great, otherwise no extra harm occurs.
If we want to fix the case of multiple writers (i.e. the case where
someone is using NLM locking to mediate between writers, or they are
using O_DIRECT and some external synchronisation mechanism) then we
need to do more (hence the proposal that we extend the current patches
to also declare a reboot on any failure of COMMIT).
In that case, the broken client, that loops forever, is going to cause
contention with all the other clients anyway, since it will never
complete its I/O (or it will silently corrupt the file if it doesn't
see the error). By declaring a reboot we don't make this case worse as
far as I can see: we still end up looping forever, but the file doesn't
get silently corrupted.
The one problem is that the looping forever client can cause other
clients to loop forever on their otherwise successful writes on other
files. That's bad, but again, that's due to client behaviour that is
toxic even today.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Tue, Aug 27, 2019 at 09:59:25AM -0400, Chuck Lever wrote:
> The strategy of handling these errors more carefully seems good.
> Bumping the write/commit verifier so the client writes again to
> retrieve the latent error is clever!
>
> It's not clear to me though that the NFSv3 protocol can deal with
> the multi-client write scenario, since it is stateless. We are now
> making it stateful in some sense by preserving error state on the
> server across NFS requests, without having any sense of an open
> file in the protocol itself.
>
> Would an "approximation" without open state be good enough?
I figure there's a correct-but-slow approach which is to increment the
write verifier every time there's a write error. Does that work? If
write errors are rare enough, maybe it doesn't matter that that's slow?
Then there's various approximations you could use (IP address?) to guess
when only one client's accessing the file. You save forcing all the
clients to resend writes at the expense of some complexity and
correctness--e.g., multiple clients behind NAT might not all get write
errors.
Am I thinking about this right?
--b.
On Tue, Aug 27, 2019 at 02:53:01PM +0000, Trond Myklebust wrote:
> The one problem is that the looping forever client can cause other
> clients to loop forever on their otherwise successful writes on other
> files.
Yeah, that's the case I was wondering about.
> That's bad, but again, that's due to client behaviour that is
> toxic even today.
So my worry was that if write errors are rare and the consequences of
the single client looping forever are relatively mild, then there might
be deployed clients that get away with that behavior.
But maybe the behavior's a lot more "toxic" than I imagined, hence
unlikely to be very common.
--b.
On Tue, Aug 27, 2019 at 10:58:19AM -0400, [email protected] wrote:
> On Tue, Aug 27, 2019 at 02:53:01PM +0000, Trond Myklebust wrote:
> > The one problem is that the looping forever client can cause other
> > clients to loop forever on their otherwise successful writes on other
> > files.
>
> Yeah, that's the case I was wondering about.
>
> > That's bad, but again, that's due to client behaviour that is
> > toxic even today.
>
> So my worry was that if write errors are rare and the consequences of
> the single client looping forever are relatively mild, then there might
> be deployed clients that get away with that behavior.
>
> But maybe the behavior's a lot more "toxic" than I imagined, hence
> unlikely to be very common.
(And, to be clear, I like the idea, just making sure I'm not overlooking
any problems....)
--b.
On Tue, Aug 27, 2019 at 02:59:34PM +0000, Trond Myklebust wrote:
> On Tue, 2019-08-27 at 10:54 -0400, Bruce Fields wrote:
> > On Tue, Aug 27, 2019 at 09:59:25AM -0400, Chuck Lever wrote:
> > > The strategy of handling these errors more carefully seems good.
> > > Bumping the write/commit verifier so the client writes again to
> > > retrieve the latent error is clever!
> > >
> > > It's not clear to me though that the NFSv3 protocol can deal with
> > > the multi-client write scenario, since it is stateless. We are now
> > > making it stateful in some sense by preserving error state on the
> > > server across NFS requests, without having any sense of an open
> > > file in the protocol itself.
> > >
> > > Would an "approximation" without open state be good enough?
> >
> > I figure there's a correct-but-slow approach which is to increment
> > the
> > write verifier every time there's a write error. Does that work? If
> > write errors are rare enough, maybe it doesn't matter that that's
> > slow?
>
> How is that different from changing the boot verifier?
> Are you
> proposing to track write verifiers on a per-client basis? That seems
> onerous too.
Sorry, I thought "write verifier" and "boot verifier" were synonyms,
and, no, wasn't suggesting tracking it per-file.
--b.
> > Then there's various approximations you could use (IP address?) to
> > guess
> > when only one client's accessing the file. You save forcing all the
> > clients to resend writes at the expense of some complexity and
> > correctness--e.g., multiple clients behind NAT might not all get
> > write
> > errors.
> >
> > Am I thinking about this right?
>
> I agree that there are multiple problems with tracking on a per-client
> basis.
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> [email protected]
>
>
On Tue, 2019-08-27 at 10:54 -0400, Bruce Fields wrote:
> On Tue, Aug 27, 2019 at 09:59:25AM -0400, Chuck Lever wrote:
> > The strategy of handling these errors more carefully seems good.
> > Bumping the write/commit verifier so the client writes again to
> > retrieve the latent error is clever!
> >
> > It's not clear to me though that the NFSv3 protocol can deal with
> > the multi-client write scenario, since it is stateless. We are now
> > making it stateful in some sense by preserving error state on the
> > server across NFS requests, without having any sense of an open
> > file in the protocol itself.
> >
> > Would an "approximation" without open state be good enough?
>
> I figure there's a correct-but-slow approach which is to increment
> the
> write verifier every time there's a write error. Does that work? If
> write errors are rare enough, maybe it doesn't matter that that's
> slow?
How is that different from changing the boot verifier? Are you
proposing to track write verifiers on a per-client basis? That seems
onerous too.
> Then there's various approximations you could use (IP address?) to
> guess
> when only one client's accessing the file. You save forcing all the
> clients to resend writes at the expense of some complexity and
> correctness--e.g., multiple clients behind NAT might not all get
> write
> errors.
>
> Am I thinking about this right?
I agree that there are multiple problems with tracking on a per-client
basis.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Tue, 2019-08-27 at 10:59 -0400, [email protected] wrote:
> On Tue, Aug 27, 2019 at 10:58:19AM -0400, [email protected] wrote:
> > On Tue, Aug 27, 2019 at 02:53:01PM +0000, Trond Myklebust wrote:
> > > The one problem is that the looping forever client can cause
> > > other
> > > clients to loop forever on their otherwise successful writes on
> > > other
> > > files.
> >
> > Yeah, that's the case I was wondering about.
> >
> > > That's bad, but again, that's due to client behaviour that is
> > > toxic even today.
> >
> > So my worry was that if write errors are rare and the consequences
> > of
> > the single client looping forever are relatively mild, then there
> > might
> > be deployed clients that get away with that behavior.
> >
> > But maybe the behavior's a lot more "toxic" than I imagined, hence
> > unlikely to be very common.
>
> (And, to be clear, I like the idea, just making sure I'm not
> overlooking
> any problems....)
>
I'm open to other suggestions, but I'm having trouble finding one that
scales correctly (i.e. does not require per-client tracking), prevents
silent corruption (where clients end up missing errors), and does not
rely on optional features that may not be implemented by all NFSv3
clients (e.g. per-file write verifiers are not implemented by *BSD).
That said, it seems to me that to do nothing should not be an option,
as that would imply tolerating silent corruption of file data.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Tue, 2019-08-27 at 09:59 -0400, Chuck Lever wrote:
> > On Aug 26, 2019, at 5:02 PM, Trond Myklebust <[email protected]> wrote:
> >
> > On Mon, 2019-08-26 at 16:51 -0400, J. Bruce Fields wrote:
> > > On Mon, Aug 26, 2019 at 12:50:18PM -0400, Trond Myklebust wrote:
> > > > Recently, a number of changes went into the kernel to try to ensure
> > > > that I/O errors (specifically write errors) are reported to the
> > > > application once and only once. The vehicle for ensuring the errors
> > > > are reported is the struct file, which uses the 'f_wb_err' field to
> > > > track which errors have been reported.
> > > >
> > > > The problem is that errors are mainly intended to be reported
> > > > through
> > > > fsync(). If the client is doing synchronous writes, then all is
> > > > well,
> > > > but if it is doing unstable writes, then the errors may not be
> > > > reported until the client calls COMMIT. If the file cache has
> > > > thrown out the struct file, due to memory pressure, or just because
> > > > the client took a long while between the last WRITE and the COMMIT,
> > > > then the error report may be lost, and the client may just think
> > > > its data is safely stored.
> > >
> > > These were lost before the file caching patches as well, right? Or
> > > is
> > > there some regression?
> >
> > Correct. This is not a regression, but an attempt to fix a problem that
> > has existed for some time now.
> >
> > > > Note that the problem is compounded by the fact that NFSv3 is
> > > > stateless,
> > > > so the server never knows that the client may have rebooted, so
> > > > there
> > > > can be no guarantee that a COMMIT will ever be sent.
> > > >
> > > > The following patch set attempts to remedy the situation using 2
> > > > strategies:
> > > >
> > > > 1) If the inode is dirty, then avoid garbage collecting the file
> > > > from the file cache.
> > > > 2) If the file is closed, and we see that it would have reported
> > > > an error to COMMIT, then we bump the boot verifier in order to
> > > > ensure the client retransmits all its writes.
> > >
> > > Sounds sensible to me.
> > >
> > > > Note that if multiple clients were writing to the same file, then
> > > > we probably want to bump the boot verifier anyway, since only one
> > > > COMMIT will see the error report (because the cached file is also
> > > > shared).
> > >
> > > I'm confused by the "probably should". So that's future work? I
> > > guess
> > > it'd mean some additional work to identify that case. You can't
> > > really
> > > even distinguish clients in the NFSv3 case, but I suppose you could
> > > use
> > > IP address or TCP connection as an approximation.
> >
> > I'm suggesting we should do this too, but I haven't done so yet in
> > these patches. I'd like to hear other opinions (particularly from you,
> > Chuck and Jeff).
>
> The strategy of handling these errors more carefully seems good.
> Bumping the write/commit verifier so the client writes again to
> retrieve the latent error is clever!
>
Yes, this would seem to neatly solve a whole class of related problems
in these sorts of scenarios. I also think we ought to bump the verifier
whenever nfsd sees a writeback error; you make a great point that,
today, only the first COMMIT after an error will ever report it. Fixing
that would be a nice goal.
We should note though that the verifier is a per net-namespace value, so
if you get a transient writeback error on one inode, any inode that is
currently dirty will probably end up having its writes retransmitted
once you bump the verifier.
That seems like an acceptable thing to do in these scenarios, but we may
need to consider making the verifier more granular if that turns out to
cause a lot of re-write activity that isn't necessary.
> It's not clear to me though that the NFSv3 protocol can deal with
> the multi-client write scenario, since it is stateless. We are now
> making it stateful in some sense by preserving error state on the
> server across NFS requests, without having any sense of an open
> file in the protocol itself.
>
I think it's worthwhile to do the best we can here. WRITE/COMMIT are
inherently stateful to some degree in that they involve a verifier.
Trond's proposal is just utilizing that fact to ensure that we deliver
writeback errors more widely. I like it!
> Would an "approximation" without open state be good enough? I
> assume you are doing this to more fully support the FlexFiles
> layout type. Do you have any analysis or thought about this next
> step?
>
> I also echo Bruce's concern about whether the client implementations
> are up to snuff. There could be long-standing bugs or their protocol
> implementation could be missing parts. This is more curiosity than
> an objection, but maybe noting which client implementations you've
> tested with would be good.
>
>
> > > --b.
> > >
> > > > So in order to implement the above strategy, we first have to do
> > > > the following: split up the file cache to act per net namespace,
> > > > since the boot verifier is per net namespace. Then add a helper
> > > > to update the boot verifier.
> > > >
> > > > Trond Myklebust (3):
> > > > nfsd: nfsd_file cache entries should be per net namespace
> > > > nfsd: Support the server resetting the boot verifier
> > > > nfsd: Don't garbage collect files that might contain write errors
> > > >
> > > > fs/nfsd/export.c | 2 +-
> > > > fs/nfsd/filecache.c | 76 +++++++++++++++++++++++++++++++++++++--
> > > > ------
> > > > fs/nfsd/filecache.h | 3 +-
> > > > fs/nfsd/netns.h | 4 +++
> > > > fs/nfsd/nfs3xdr.c | 13 +++++---
> > > > fs/nfsd/nfs4proc.c | 14 +++------
> > > > fs/nfsd/nfsctl.c | 1 +
> > > > fs/nfsd/nfssvc.c | 32 ++++++++++++++++++-
> > > > 8 files changed, 115 insertions(+), 30 deletions(-)
> > > >
> > > > --
> > > > 2.21.0
> > --
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > [email protected]
>
> --
> Chuck Lever
>
>
>
--
Jeff Layton <[email protected]>
> On Aug 27, 2019, at 11:15 AM, Trond Myklebust <[email protected]> wrote:
>
> On Tue, 2019-08-27 at 10:59 -0400, [email protected] wrote:
>> On Tue, Aug 27, 2019 at 10:58:19AM -0400, [email protected] wrote:
>>> On Tue, Aug 27, 2019 at 02:53:01PM +0000, Trond Myklebust wrote:
>>>> The one problem is that the looping forever client can cause
>>>> other
>>>> clients to loop forever on their otherwise successful writes on
>>>> other
>>>> files.
>>>
>>> Yeah, that's the case I was wondering about.
>>>
>>>> That's bad, but again, that's due to client behaviour that is
>>>> toxic even today.
>>>
>>> So my worry was that if write errors are rare and the consequences
>>> of
>>> the single client looping forever are relatively mild, then there
>>> might
>>> be deployed clients that get away with that behavior.
>>>
>>> But maybe the behavior's a lot more "toxic" than I imagined, hence
>>> unlikely to be very common.
>>
>> (And, to be clear, I like the idea, just making sure I'm not
>> overlooking
>> any problems....)
>>
> I'm open to other suggestions, but I'm having trouble finding one that
> can scale correctly (i.e. not require per-client tracking), prevent
> silent corruption (by causing clients to miss errors), while not
> relying on optional features that may not be implemented by all NFSv3
> clients (e.g. per-file write verifiers are not implemented by *BSD).
>
> That said, it seems to me that to do nothing should not be an option,
> as that would imply tolerating silent corruption of file data.
Agreed, we should move forward. I'm not saying "do nothing"; I'm
just trying to understand what is improved and what is still left
to do (maybe nothing).
--
Chuck Lever
On Tue, Aug 27, 2019 at 03:15:35PM +0000, Trond Myklebust wrote:
> I'm open to other suggestions, but I'm having trouble finding one that
> can scale correctly (i.e. not require per-client tracking), prevent
> silent corruption (by causing clients to miss errors), while not
> relying on optional features that may not be implemented by all NFSv3
> clients (e.g. per-file write verifiers are not implemented by *BSD).
>
> That said, it seems to me that to do nothing should not be an option,
> as that would imply tolerating silent corruption of file data.
So should we increment the boot verifier every time we discover an error
on an asynchronous write?
--b.
On Wed, 2019-08-28 at 09:48 -0400, [email protected] wrote:
> On Tue, Aug 27, 2019 at 03:15:35PM +0000, Trond Myklebust wrote:
> > I'm open to other suggestions, but I'm having trouble finding one that
> > can scale correctly (i.e. not require per-client tracking), prevent
> > silent corruption (by causing clients to miss errors), while not
> > relying on optional features that may not be implemented by all NFSv3
> > clients (e.g. per-file write verifiers are not implemented by *BSD).
> >
> > That said, it seems to me that to do nothing should not be an option,
> > as that would imply tolerating silent corruption of file data.
>
> So should we increment the boot verifier every time we discover an error
> on an asynchronous write?
>
I think so. Otherwise, only one client will ever see that error.
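As a rough sketch of what that could look like at the point where
COMMIT discovers the error (nfsd_bump_writeverf_sketch() is the
illustrative per-net-namespace helper sketched earlier in the thread;
none of these names are from the actual patches):

#include <linux/fs.h>

/* Sketch of a COMMIT-side call site. */
static int commit_sketch(struct nfsd_net_sketch *nn, struct file *file,
                         loff_t start, loff_t end)
{
        int err = vfs_fsync_range(file, start, end, 0);

        if (err) {
                /*
                 * The client that sent this COMMIT gets the error reply.
                 * Bumping the verifier makes every other client holding
                 * uncommitted writes resend them, so the error is not
                 * silently consumed by a single caller.
                 */
                nfsd_bump_writeverf_sketch(nn);
        }
        return err;
}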
--
Jeff Layton <[email protected]>
> On Aug 28, 2019, at 9:51 AM, Jeff Layton <[email protected]> wrote:
>
> On Wed, 2019-08-28 at 09:48 -0400, [email protected] wrote:
>> On Tue, Aug 27, 2019 at 03:15:35PM +0000, Trond Myklebust wrote:
>>> I'm open to other suggestions, but I'm having trouble finding one that
>>> can scale correctly (i.e. not require per-client tracking), prevent
>>> silent corruption (by causing clients to miss errors), while not
>>> relying on optional features that may not be implemented by all NFSv3
>>> clients (e.g. per-file write verifiers are not implemented by *BSD).
>>>
>>> That said, it seems to me that to do nothing should not be an option,
>>> as that would imply tolerating silent corruption of file data.
>>
>> So should we increment the boot verifier every time we discover an error
>> on an asynchronous write?
>>
>
> I think so. Otherwise, only one client will ever see that error.
+1
I'm not familiar with the details of how the Linux NFS server
implements the boot verifier: Will a verifier bump be effective
for all file systems that server exports? If so, is that an
acceptable cost?
--
Chuck Lever
On Wed, Aug 28, 2019 at 09:57:25AM -0400, Chuck Lever wrote:
>
>
> > On Aug 28, 2019, at 9:51 AM, Jeff Layton <[email protected]> wrote:
> >
> > On Wed, 2019-08-28 at 09:48 -0400, [email protected] wrote:
> >> On Tue, Aug 27, 2019 at 03:15:35PM +0000, Trond Myklebust wrote:
> >>> I'm open to other suggestions, but I'm having trouble finding one that
> >>> can scale correctly (i.e. not require per-client tracking), prevent
> >>> silent corruption (by causing clients to miss errors), while not
> >>> relying on optional features that may not be implemented by all NFSv3
> >>> clients (e.g. per-file write verifiers are not implemented by *BSD).
> >>>
> >>> That said, it seems to me that to do nothing should not be an option,
> >>> as that would imply tolerating silent corruption of file data.
> >>
> >> So should we increment the boot verifier every time we discover an error
> >> on an asynchronous write?
> >>
> >
> > I think so. Otherwise, only one client will ever see that error.
>
> +1
>
> I'm not familiar with the details of how the Linux NFS server
> implements the boot verifier: Will a verifier bump be effective
> for all file systems that server exports?
Yes. It will be per network namespace, but that's the only limit.
> If so, is that an acceptable cost?
It means clients will resend all their uncommitted writes. That could
certainly make write errors more expensive. But maybe you've already
got bigger problems if you've got a full or failing disk?
--b.
> On Aug 28, 2019, at 10:00 AM, J. Bruce Fields <[email protected]> wrote:
>
> On Wed, Aug 28, 2019 at 09:57:25AM -0400, Chuck Lever wrote:
>>
>>
>>> On Aug 28, 2019, at 9:51 AM, Jeff Layton <[email protected]> wrote:
>>>
>>> On Wed, 2019-08-28 at 09:48 -0400, [email protected] wrote:
>>>> On Tue, Aug 27, 2019 at 03:15:35PM +0000, Trond Myklebust wrote:
>>>>> I'm open to other suggestions, but I'm having trouble finding one that
>>>>> can scale correctly (i.e. not require per-client tracking), prevent
>>>>> silent corruption (by causing clients to miss errors), while not
>>>>> relying on optional features that may not be implemented by all NFSv3
>>>>> clients (e.g. per-file write verifiers are not implemented by *BSD).
>>>>>
>>>>> That said, it seems to me that to do nothing should not be an option,
>>>>> as that would imply tolerating silent corruption of file data.
>>>>
>>>> So should we increment the boot verifier every time we discover an error
>>>> on an asynchronous write?
>>>>
>>>
>>> I think so. Otherwise, only one client will ever see that error.
>>
>> +1
>>
>> I'm not familiar with the details of how the Linux NFS server
>> implements the boot verifier: Will a verifier bump be effective
>> for all file systems that server exports?
>
> Yes. It will be per network namespace, but that's the only limit.
>
>> If so, is that an acceptable cost?
>
> It means clients will resend all their uncommitted writes. That could
> certainly make write errors more expensive. But maybe you've already
> got bigger problems if you've got a full or failing disk?
One full filesystem will impact the behavior of all other exported
filesystems. That might be surprising behavior to a server administrator.
I don't have any suggestions other than maintaining a separate verifier
for each exported filesystem in each net namespace.
--
Chuck Lever
On Wed, 2019-08-28 at 10:03 -0400, Chuck Lever wrote:
> > On Aug 28, 2019, at 10:00 AM, J. Bruce Fields <[email protected]> wrote:
> >
> > On Wed, Aug 28, 2019 at 09:57:25AM -0400, Chuck Lever wrote:
> > >
> > > > On Aug 28, 2019, at 9:51 AM, Jeff Layton <[email protected]> wrote:
> > > >
> > > > On Wed, 2019-08-28 at 09:48 -0400, [email protected] wrote:
> > > > > On Tue, Aug 27, 2019 at 03:15:35PM +0000, Trond Myklebust wrote:
> > > > > > I'm open to other suggestions, but I'm having trouble finding one that
> > > > > > can scale correctly (i.e. not require per-client tracking), prevent
> > > > > > silent corruption (by causing clients to miss errors), while not
> > > > > > relying on optional features that may not be implemented by all NFSv3
> > > > > > clients (e.g. per-file write verifiers are not implemented by *BSD).
> > > > > >
> > > > > > That said, it seems to me that to do nothing should not be an option,
> > > > > > as that would imply tolerating silent corruption of file data.
> > > > >
> > > > > So should we increment the boot verifier every time we discover an error
> > > > > on an asynchronous write?
> > > > >
> > > >
> > > > I think so. Otherwise, only one client will ever see that error.
> > >
> > > +1
> > >
> > > I'm not familiar with the details of how the Linux NFS server
> > > implements the boot verifier: Will a verifier bump be effective
> > > for all file systems that server exports?
> >
> > Yes. It will be per network namespace, but that's the only limit.
> >
> > > If so, is that an acceptable cost?
> >
> > It means clients will resend all their uncommitted writes. That could
> > certainly make write errors more expensive. But maybe you've already
> > got bigger problems if you've got a full or failing disk?
>
> One full filesystem will impact the behavior of all other exported
> filesystems. That might be surprising behavior to a server administrator.
> I don't have any suggestions other than maintaining a separate verifier
> for each exported filesystem in each net namespace.
>
>
Yeah, it's not pretty, but I think the alternative is worse. Most admins
would take rotten performance over data corruption.
For the most part, these sorts of errors tend to be rare. If it turns
out to be a problem we could consider moving the verifier into
svc_export or something?
--
Jeff Layton <[email protected]>
> On Aug 28, 2019, at 10:16 AM, Jeff Layton <[email protected]> wrote:
>
> On Wed, 2019-08-28 at 10:03 -0400, Chuck Lever wrote:
>>> On Aug 28, 2019, at 10:00 AM, J. Bruce Fields <[email protected]> wrote:
>>>
>>> On Wed, Aug 28, 2019 at 09:57:25AM -0400, Chuck Lever wrote:
>>>>
>>>>> On Aug 28, 2019, at 9:51 AM, Jeff Layton <[email protected]> wrote:
>>>>>
>>>>> On Wed, 2019-08-28 at 09:48 -0400, [email protected] wrote:
>>>>>> On Tue, Aug 27, 2019 at 03:15:35PM +0000, Trond Myklebust wrote:
>>>>>>> I'm open to other suggestions, but I'm having trouble finding one that
>>>>>>> can scale correctly (i.e. not require per-client tracking), prevent
>>>>>>> silent corruption (by causing clients to miss errors), while not
>>>>>>> relying on optional features that may not be implemented by all NFSv3
>>>>>>> clients (e.g. per-file write verifiers are not implemented by *BSD).
>>>>>>>
>>>>>>> That said, it seems to me that to do nothing should not be an option,
>>>>>>> as that would imply tolerating silent corruption of file data.
>>>>>>
>>>>>> So should we increment the boot verifier every time we discover an error
>>>>>> on an asynchronous write?
>>>>>>
>>>>>
>>>>> I think so. Otherwise, only one client will ever see that error.
>>>>
>>>> +1
>>>>
>>>> I'm not familiar with the details of how the Linux NFS server
>>>> implements the boot verifier: Will a verifier bump be effective
>>>> for all file systems that server exports?
>>>
>>> Yes. It will be per network namespace, but that's the only limit.
>>>
>>>> If so, is that an acceptable cost?
>>>
>>> It means clients will resend all their uncommitted writes. That could
>>> certainly make write errors more expensive. But maybe you've already
>>> got bigger problems if you've got a full or failing disk?
>>
>> One full filesystem will impact the behavior of all other exported
>> filesystems. That might be surprising behavior to a server administrator.
>> I don't have any suggestions other than maintaining a separate verifier
>> for each exported filesystem in each net namespace.
>>
>>
>
> Yeah, it's not pretty, but I think the alternative is worse. Most admins
> would take rotten performance over data corruption.
Again, I'm not saying we should do nothing. It doesn't seem
like a per-export verifier would be much more work than a
per-net-namespace verifier.
> For the most part, these sorts of errors tend to be rare.
EIO is certainly going to be rare, agreed. ENOSPC might not be.
> If it turns
> out to be a problem we could consider moving the verifier into
> svc_export or something?
--
Chuck Lever
On Wed, Aug 28, 2019 at 10:16:09AM -0400, Jeff Layton wrote:
> For the most part, these sorts of errors tend to be rare. If it turns
> out to be a problem we could consider moving the verifier into
> svc_export or something?
As Trond says, this isn't really a server implementation issue, it's a
protocol issue. If a client detects when to resend writes by storing a
single verifier per server, then returning different verifiers from
writes to different exports will have it resending every time it writes
to one export then another.
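A simplified client-side sketch of that failure mode (not any real
client's code, just an illustration of a client that remembers a single
verifier per server):

#include <linux/string.h>
#include <linux/types.h>

/* A naive client that keeps one write verifier for the whole server. */
struct naive_client_sketch {
        __be32  last_verf[2];
        bool    have_verf;
};

/* Called with the verifier carried in each WRITE/COMMIT reply.
 * Returns true if the client must resend its uncommitted writes. */
static bool client_must_resend(struct naive_client_sketch *clp,
                               const __be32 *verf)
{
        bool changed = clp->have_verf &&
                       memcmp(clp->last_verf, verf, sizeof(clp->last_verf));

        memcpy(clp->last_verf, verf, sizeof(clp->last_verf));
        clp->have_verf = true;
        /*
         * If the server returned a different verifier per export, a
         * client alternating writes between two exports would see
         * "changed" on every reply and resend forever.
         */
        return changed;
}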
--b.
On Wed, Aug 28, 2019 at 10:40:31AM -0400, J. Bruce Fields wrote:
> On Wed, Aug 28, 2019 at 10:16:09AM -0400, Jeff Layton wrote:
> > For the most part, these sorts of errors tend to be rare. If it turns
> > out to be a problem we could consider moving the verifier into
> > svc_export or something?
>
> As Trond says, this isn't really a server implementation issue, it's a
> protocol issue. If a client detects when to resend writes by storing a
> single verifier per server, then returning different verifiers from
> writes to different exports will have it resending every time it writes
> to one export then another.
The other way to limit this is by trying to detect the single-writer
case, since it's probably also rare for multiple clients to write to the
same file.
Are there any common cases where two clients share the same TCP
connection? Maybe that would be a conservative enough test?
And you wouldn't have to track every connection that's ever sent a WRITE
to a given file. Just remember the first one, with a flag to track
whether anyone else has ever written to it.
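Something like the following might capture that idea (purely a sketch;
the struct and fields are invented here, not taken from nfsd):

#include <linux/types.h>

/* Per-file tracking: remember the first writer's identity (however we
 * choose to key it: connection, or (address, port)), plus a flag that
 * records whether anyone else has ever written. */
struct file_writers_sketch {
        const void      *first_writer;
        bool            multiple_writers;
};

static void note_writer(struct file_writers_sketch *fw, const void *who)
{
        if (!fw->first_writer)
                fw->first_writer = who;
        else if (fw->first_writer != who)
                fw->multiple_writers = true;
}

/* On a writeback error: only escalate to a verifier bump if more than
 * one writer may be holding uncommitted data for this file. */
static bool error_needs_verifier_bump(const struct file_writers_sketch *fw)
{
        return fw->multiple_writers;
}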
??
--b.
> On Aug 28, 2019, at 10:48 AM, Bruce Fields <[email protected]> wrote:
>
> On Wed, Aug 28, 2019 at 10:40:31AM -0400, J. Bruce Fields wrote:
>> On Wed, Aug 28, 2019 at 10:16:09AM -0400, Jeff Layton wrote:
>>> For the most part, these sorts of errors tend to be rare. If it turns
>>> out to be a problem we could consider moving the verifier into
>>> svc_export or something?
>>
>> As Trond says, this isn't really a server implementation issue, it's a
>> protocol issue. If a client detects when to resend writes by storing a
>> single verifier per server, then returning different verifiers from
>> writes to different exports will have it resending every time it writes
>> to one export then another.
>
> The other way to limit this is by trying to detect the single-writer
> case, since it's probably also rare for multiple clients to write to the
> same file.
>
> Are there any common cases where two clients share the same TCP
> connection? Maybe that would be a conservative enough test?
What happens if a client reconnects? Does that bump the verifier?
If it reconnects from the same source port, can the server tell
that the connection is different?
> And you wouldn't have to track every connection that's ever sent a WRITE
> to a given file. Just remember the first one, with a flag to track
> whether anyone else has ever written to it.
>
> ??
>
> --b.
--
Chuck Lever
On Wed, 2019-08-28 at 10:40 -0400, J. Bruce Fields wrote:
> On Wed, Aug 28, 2019 at 10:16:09AM -0400, Jeff Layton wrote:
> > For the most part, these sorts of errors tend to be rare. If it turns
> > out to be a problem we could consider moving the verifier into
> > svc_export or something?
>
> As Trond says, this isn't really a server implementation issue, it's a
> protocol issue. If a client detects when to resend writes by storing a
> single verifier per server, then returning different verifiers from
> writes to different exports will have it resending every time it writes
> to one export then another.
>
Huh. I've always considered the verifier to be a per-inode quantity that
we just happened to fill with a global value, but the spec _is_ a bit
vague in this regard.
I think modern Linux clients track this on a per-inode basis, but I'm
not sure about really old Linux or other clients. It'd be good to know
about BSD and Solaris, in particular.
--
Jeff Layton <[email protected]>
J. Bruce Fields wrote:
>On Wed, Aug 28, 2019 at 10:16:09AM -0400, Jeff Layton wrote:
>> For the most part, these sorts of errors tend to be rare. If it turns
>> out to be a problem we could consider moving the verifier into
>> svc_export or something?
>
>As Trond says, this isn't really a server implementation issue, it's a
>protocol issue. If a client detects when to resend writes by storing a
>single verifier per server, then returning different verifiers from
>writes to different exports will have it resending every time it writes
>to one export then another.
Well, here's what RFC-1813 says about the write verifier:
This cookie must be consistent during a single instance
of the NFS version 3 protocol service and must be
unique between instances of the NFS version 3 protocol
server, where uncommitted data may be lost.
You could debate what "service" means in the above, but I'd say it isn't "per file".
However, since there is no way for an NFSv3 client to know what a "server" is,
the FreeBSD client (and maybe the other *BSD, although I haven't been involved
in them for a long time) stores the write verifier "per mount".
--> So, for the FreeBSD client, it is at the granularity of what the NFSv3 client sees as
a single file system. (Typically a single file system on the server unless the knfsd
plays the game of coalescing multiple file systems by uniquifying the I-node#, etc.)
It is conceivable that some other NFSv3 client might assume
"same server IP address --> same server" and store it "per server IP", but I have
no idea if any such client exists.
rick
On Wed, 2019-08-28 at 15:12 +0000, Rick Macklem wrote:
> J. Bruce Fields wrote:
> > On Wed, Aug 28, 2019 at 10:16:09AM -0400, Jeff Layton wrote:
> > > For the most part, these sorts of errors tend to be rare. If it
> > > turns
> > > out to be a problem we could consider moving the verifier into
> > > svc_export or something?
> >
> > As Trond says, this isn't really a server implementation issue,
> > it's a
> > protocol issue. If a client detects when to resend writes by
> > storing a
> > single verifier per server, then returning different verifiers from
> > writes to different exports will have it resending every time it
> > writes
> > to one export then another.
>
> Well, here's what RFC-1813 says about the write verifier:
> This cookie must be consistent during a single instance
> of the NFS version 3 protocol service and must be
> unique between instances of the NFS version 3 protocol
> server, where uncommitted data may be lost.
> You could debate what "service" means in the above, but I'd say it
> isn't "per file".
>
> However, since there is no way for an NFSv3 client to know what a
> "server" is,
> the FreeBSD client (and maybe the other *BSD, although I haven't been
> involved
> in them for a long time) stores the write verifier "per mount".
> --> So, for the FreeBSD client, it is at the granularity of what the
> NFSv3 client sees as
> a single file system. (Typically a single file system on the
> server unless the knfsd
> plays the game of coalescing multiple file systems by
> uniquifying the I-node#, etc.)
>
> It is conceivable that some other NFSv3 client might assume
> "same server IP address --> same server" and store it "per server
> IP", but I have
> no idea if any such client exists.
>
The patchsets we've been discussing should all be compatible with
FreeBSD, as they implement a per-server boot verifier (in the sense
that different containers, representing different knfsd servers on
different networks, are allowed to have different boot verifiers, but
each server applies its boot verifier globally to all exported
filesystems).
We could make these boot verifiers per-filesystem to reduce the
frequency of these 'server reboots', but the expectation is that
filesystem errors on COMMIT are rare.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Wed, Aug 28, 2019 at 03:12:26PM +0000, Rick Macklem wrote:
> J. Bruce Fields wrote:
> >On Wed, Aug 28, 2019 at 10:16:09AM -0400, Jeff Layton wrote:
> >> For the most part, these sorts of errors tend to be rare. If it turns
> >> out to be a problem we could consider moving the verifier into
> >> svc_export or something?
> >
> >As Trond says, this isn't really a server implementation issue, it's a
> >protocol issue. If a client detects when to resend writes by storing a
> >single verifier per server, then returning different verifiers from
> >writes to different exports will have it resending every time it writes
> >to one export then another.
>
> Well, here's what RFC-1813 says about the write verifier:
> This cookie must be consistent during a single instance
> of the NFS version 3 protocol service and must be
> unique between instances of the NFS version 3 protocol
> server, where uncommitted data may be lost.
> You could debate what "service" means in the above, but I'd say it isn't "per file".
>
> However, since there is no way for an NFSv3 client to know what a "server" is,
> the FreeBSD client (and maybe the other *BSD, although I haven't been involved
> in them for a long time) stores the write verifier "per mount".
> --> So, for the FreeBSD client, it is at the granularity of what the NFSv3 client sees as
> a single file system. (Typically a single file system on the server unless the knfsd
> plays the game of coalescing multiple file systems by uniquifying the I-node#, etc.)
Oh, well, that would be nice if we could at least count on per-filesystem.
> It is conceivable that some other NFSv3 client might assume "same
> server IP address --> same server" and store it "per server IP", but I
> have no idea if any such client exists.
Hm.
--b.
On Wed, Aug 28, 2019 at 10:50:30AM -0400, Chuck Lever wrote:
>
>
> > On Aug 28, 2019, at 10:48 AM, Bruce Fields <[email protected]> wrote:
> >
> > On Wed, Aug 28, 2019 at 10:40:31AM -0400, J. Bruce Fields wrote:
> >> On Wed, Aug 28, 2019 at 10:16:09AM -0400, Jeff Layton wrote:
> >>> For the most part, these sorts of errors tend to be rare. If it turns
> >>> out to be a problem we could consider moving the verifier into
> >>> svc_export or something?
> >>
> >> As Trond says, this isn't really a server implementation issue, it's a
> >> protocol issue. If a client detects when to resend writes by storing a
> >> single verifier per server, then returning different verifiers from
> >> writes to different exports will have it resending every time it writes
> >> to one export then another.
> >
> > The other way to limit this is by trying to detect the single-writer
> > case, since it's probably also rare for multiple clients to write to the
> > same file.
> >
> > Are there any common cases where two clients share the same TCP
> > connection? Maybe that would be a conservative enough test?
>
> What happens if a client reconnects? Does that bump the verifier?
>
> If it reconnects from the same source port, can the server tell
> that the connection is different?
I assume it creates a new socket structure.
So, you could key off either that or the (address, port) depending on
whether you'd rather risk unnecessarily incrementing the verifier after
a reconnection and write error, or risk failing to increment due to port
reuse.
There's some precedent for taking the latter choice in the DRC, but the
DRC also includes some other data (xid, hash) to minimize the chance of
a collision.
If you took the former choice you wouldn't literally want to key off a
pointer to the socket as it'd cause complications when freeing the
socket. But maybe you could add some kind of birth time or generation
number to struct svc_sock to help differentiate?
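For example (entirely hypothetical fields and helpers; struct svc_sock
has no such generation counter today), a sketch of that idea:

#include <linux/atomic.h>
#include <linux/types.h>

/* Hand out a generation number when each socket is created, and have
 * writers remember the number instead of a pointer to the socket. */
static atomic64_t conn_generation_sketch = ATOMIC64_INIT(0);

struct conn_id_sketch {
        u64     gen;
};

static void conn_id_init_sketch(struct conn_id_sketch *id)
{
        id->gen = atomic64_inc_return(&conn_generation_sketch);
}

static bool same_connection_sketch(const struct conn_id_sketch *a,
                                   const struct conn_id_sketch *b)
{
        /* Equal generations mean the same accepted connection, even if
         * a later reconnect reuses the same (address, port). */
        return a->gen == b->gen;
}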
--b.
>
>
> > And you wouldn't have to track every connection that's ever sent a WRITE
> > to a given file. Just remember the first one, with a flag to track
> > whether anyone else has ever written to it.
> >
> > ??
> >
> > --b.
>
> --
> Chuck Lever
>
>