2021-08-26 04:29:05

by NeilBrown

[permalink] [raw]
Subject: [PATCH] NFSD: drop support for ancient file-handles


File-handles not in the "new" or "version 1" format have not been handed
out for new mounts since Linux 2.4 which was released 20 years ago.
I think it is safe to say that no such file handles are still in use,
and that we can drop support for them.

This patch also moves the nfsfh.h from the include/uapi directory into
fs/nfsd. I can find no evidence of it being used anywhere outside the
kernel. Certainly nfs-utils and wireshark do not use it.

fh_base and fh_pad are occasionally used to refer to the whole
filehandle. These are replaced with "fh_raw" which is hopefully more
meaningful.

Signed-off-by: NeilBrown <[email protected]>
---

I found
https://www.spinics.net/lists/linux-nfs/msg43280.html
"Re: [PATCH] nfsd: clean up fh_auth usage"
from 2014 where moving nfsfh.h out of uapi was considered but not
actioned. Christoph said he would "do some research if the
uapi <linux/nfsd/*.h> headers are used anywhere at all". I can find no
report on the result of that research. My own research turned up
nothing.

Thanks,
NeilBrown


fs/nfsd/lockd.c | 2 +-
fs/nfsd/nfs3xdr.c | 4 +-
fs/nfsd/nfs4callback.c | 2 +-
fs/nfsd/nfs4proc.c | 2 +-
fs/nfsd/nfs4state.c | 4 +-
fs/nfsd/nfs4xdr.c | 4 +-
fs/nfsd/nfsctl.c | 6 +-
fs/nfsd/nfsfh.c | 177 +++++++++++---------------------
fs/nfsd/nfsfh.h | 55 +++++++++-
fs/nfsd/nfsxdr.c | 4 +-
include/uapi/linux/nfsd/nfsfh.h | 116 ---------------------
11 files changed, 126 insertions(+), 250 deletions(-)
delete mode 100644 include/uapi/linux/nfsd/nfsfh.h

diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 3f5b3d7b62b7..74d1630e7994 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -33,7 +33,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
/* must initialize before using! but maxsize doesn't matter */
fh_init(&fh,0);
fh.fh_handle.fh_size = f->size;
- memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
+ memcpy((char*)&fh.fh_handle.fh_raw, f->data, f->size);
fh.fh_export = NULL;

nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 0a5ebc52e6a9..3d37923afb06 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -92,7 +92,7 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp)
return false;
fh_init(fhp, NFS3_FHSIZE);
fhp->fh_handle.fh_size = size;
- memcpy(&fhp->fh_handle.fh_base, p, size);
+ memcpy(&fhp->fh_handle.fh_raw, p, size);

return true;
}
@@ -131,7 +131,7 @@ svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
*p++ = cpu_to_be32(size);
if (size)
p[XDR_QUADLEN(size) - 1] = 0;
- memcpy(p, &fhp->fh_handle.fh_base, size);
+ memcpy(p, &fhp->fh_handle.fh_raw, size);

return true;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 0f8b10f363e7..11f8715d92d6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -121,7 +121,7 @@ static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)

BUG_ON(length > NFS4_FHSIZE);
p = xdr_reserve_space(xdr, 4 + length);
- xdr_encode_opaque(p, &fh->fh_base, length);
+ xdr_encode_opaque(p, &fh->fh_raw, length);
}

/*
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 486c5dba4b65..4872b9519a72 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -519,7 +519,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,

fh_put(&cstate->current_fh);
cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
- memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
+ memcpy(&cstate->current_fh.fh_handle.fh_raw, putfh->pf_fhval,
putfh->pf_fhlen);
ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fa67ecd5fe63..d66b4be99063 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1010,7 +1010,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
}
spin_unlock(&blocked_delegations_lock);
}
- hash = jhash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_raw, fh->fh_size, 0);
if (test_bit(hash&255, bd->set[0]) &&
test_bit((hash>>8)&255, bd->set[0]) &&
test_bit((hash>>16)&255, bd->set[0]))
@@ -1029,7 +1029,7 @@ static void block_delegations(struct knfsd_fh *fh)
u32 hash;
struct bloom_pair *bd = &blocked_delegations;

- hash = jhash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_raw, fh->fh_size, 0);

spin_lock(&blocked_delegations_lock);
__set_bit(hash&255, bd->set[bd->new]);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7abeccb975b2..a54b2845473b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3110,7 +3110,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
if (!p)
goto out_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
fhp->fh_handle.fh_size);
}
if (bmval0 & FATTR4_WORD0_FILEID) {
@@ -3667,7 +3667,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
p = xdr_reserve_space(xdr, len + 4);
if (!p)
return nfserr_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
return 0;
}

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c2c3d9077dc5..449b57e5e328 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -395,12 +395,12 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
auth_domain_put(dom);
if (len)
return len;
-
+
mesg = buf;
len = SIMPLE_TRANSACTION_LIMIT;
- qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
+ qword_addhex(&mesg, &len, (char*)&fh.fh_raw, fh.fh_size);
mesg[-1] = '\n';
- return mesg - buf;
+ return mesg - buf;
}

/*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c475d2271f9c..7695c0f1eefe 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -154,11 +154,12 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
- struct fid *fid = NULL, sfid;
+ struct fid *fid = NULL;
struct svc_export *exp;
struct dentry *dentry;
int fileid_type;
int data_left = fh->fh_size/4;
+ int len;
__be32 error;

error = nfserr_stale;
@@ -167,48 +168,35 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rqstp->rq_vers == 4 && fh->fh_size == 0)
return nfserr_nofilehandle;

- if (fh->fh_version == 1) {
- int len;
-
- if (--data_left < 0)
- return error;
- if (fh->fh_auth_type != 0)
- return error;
- len = key_len(fh->fh_fsid_type) / 4;
- if (len == 0)
- return error;
- if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
- /* deprecated, convert to type 3 */
- len = key_len(FSID_ENCODE_DEV)/4;
- fh->fh_fsid_type = FSID_ENCODE_DEV;
- /*
- * struct knfsd_fh uses host-endian fields, which are
- * sometimes used to hold net-endian values. This
- * confuses sparse, so we must use __force here to
- * keep it from complaining.
- */
- fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
- ntohl((__force __be32)fh->fh_fsid[1])));
- fh->fh_fsid[1] = fh->fh_fsid[2];
- }
- data_left -= len;
- if (data_left < 0)
- return error;
- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
- fid = (struct fid *)(fh->fh_fsid + len);
- } else {
- __u32 tfh[2];
- dev_t xdev;
- ino_t xino;
-
- if (fh->fh_size != NFS_FHSIZE)
- return error;
- /* assume old filehandle format */
- xdev = old_decode_dev(fh->ofh_xdev);
- xino = u32_to_ino_t(fh->ofh_xino);
- mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
- exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
+ if (fh->fh_version != 1)
+ return error;
+
+ if (--data_left < 0)
+ return error;
+ if (fh->fh_auth_type != 0)
+ return error;
+ len = key_len(fh->fh_fsid_type) / 4;
+ if (len == 0)
+ return error;
+ if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+ /* deprecated, convert to type 3 */
+ len = key_len(FSID_ENCODE_DEV)/4;
+ fh->fh_fsid_type = FSID_ENCODE_DEV;
+ /*
+ * struct knfsd_fh uses host-endian fields, which are
+ * sometimes used to hold net-endian values. This
+ * confuses sparse, so we must use __force here to
+ * keep it from complaining.
+ */
+ fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+ ntohl((__force __be32)fh->fh_fsid[1])));
+ fh->fh_fsid[1] = fh->fh_fsid[2];
}
+ data_left -= len;
+ if (data_left < 0)
+ return error;
+ exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+ fid = (struct fid *)(fh->fh_fsid + len);

error = nfserr_stale;
if (IS_ERR(exp)) {
@@ -253,18 +241,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rqstp->rq_vers > 2)
error = nfserr_badhandle;

- if (fh->fh_version != 1) {
- sfid.i32.ino = fh->ofh_ino;
- sfid.i32.gen = fh->ofh_generation;
- sfid.i32.parent_ino = fh->ofh_dirino;
- fid = &sfid;
- data_left = 3;
- if (fh->ofh_dirino == 0)
- fileid_type = FILEID_INO32_GEN;
- else
- fileid_type = FILEID_INO32_GEN_PARENT;
- } else
- fileid_type = fh->fh_fileid_type;
+ fileid_type = fh->fh_fileid_type;

if (fileid_type == FILEID_ROOT)
dentry = dget(exp->ex_path.dentry);
@@ -452,20 +429,6 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
}
}

-/*
- * for composing old style file handles
- */
-static inline void _fh_update_old(struct dentry *dentry,
- struct svc_export *exp,
- struct knfsd_fh *fh)
-{
- fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino);
- fh->ofh_generation = d_inode(dentry)->i_generation;
- if (d_is_dir(dentry) ||
- (exp->ex_flags & NFSEXP_NOSUBTREECHECK))
- fh->ofh_dirino = 0;
-}
-
static bool is_root_export(struct svc_export *exp)
{
return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
@@ -600,35 +563,21 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_dentry = dget(dentry); /* our internal copy */
fhp->fh_export = exp_get(exp);

- if (fhp->fh_handle.fh_version == 0xca) {
- /* old style filehandle please */
- memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
- fhp->fh_handle.fh_size = NFS_FHSIZE;
- fhp->fh_handle.ofh_dcookie = 0xfeebbaca;
- fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev);
- fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
- fhp->fh_handle.ofh_xino =
- ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
- fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
- if (inode)
- _fh_update_old(dentry, exp, &fhp->fh_handle);
- } else {
- fhp->fh_handle.fh_size =
- key_len(fhp->fh_handle.fh_fsid_type) + 4;
- fhp->fh_handle.fh_auth_type = 0;
-
- mk_fsid(fhp->fh_handle.fh_fsid_type,
- fhp->fh_handle.fh_fsid,
- ex_dev,
- d_inode(exp->ex_path.dentry)->i_ino,
- exp->ex_fsid, exp->ex_uuid);
-
- if (inode)
- _fh_update(fhp, exp, dentry);
- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
- fh_put(fhp);
- return nfserr_opnotsupp;
- }
+ fhp->fh_handle.fh_size =
+ key_len(fhp->fh_handle.fh_fsid_type) + 4;
+ fhp->fh_handle.fh_auth_type = 0;
+
+ mk_fsid(fhp->fh_handle.fh_fsid_type,
+ fhp->fh_handle.fh_fsid,
+ ex_dev,
+ d_inode(exp->ex_path.dentry)->i_ino,
+ exp->ex_fsid, exp->ex_uuid);
+
+ if (inode)
+ _fh_update(fhp, exp, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
+ fh_put(fhp);
+ return nfserr_opnotsupp;
}

return 0;
@@ -649,23 +598,19 @@ fh_update(struct svc_fh *fhp)
dentry = fhp->fh_dentry;
if (d_really_is_negative(dentry))
goto out_negative;
- if (fhp->fh_handle.fh_version != 1) {
- _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
- } else {
- if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
- return 0;
+ if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
+ return 0;

- _fh_update(fhp, fhp->fh_export, dentry);
- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
- return nfserr_opnotsupp;
- }
+ _fh_update(fhp, fhp->fh_export, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
+ return nfserr_opnotsupp;
return 0;
out_bad:
printk(KERN_ERR "fh_update: fh not verified!\n");
return nfserr_serverfault;
out_negative:
printk(KERN_ERR "fh_update: %pd2 still negative!\n",
- dentry);
+ dentry);
return nfserr_serverfault;
}

@@ -702,12 +647,12 @@ char * SVCFH_fmt(struct svc_fh *fhp)
static char buf[80];
sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x",
fh->fh_size,
- fh->fh_base.fh_pad[0],
- fh->fh_base.fh_pad[1],
- fh->fh_base.fh_pad[2],
- fh->fh_base.fh_pad[3],
- fh->fh_base.fh_pad[4],
- fh->fh_base.fh_pad[5]);
+ fh->fh_raw[0],
+ fh->fh_raw[1],
+ fh->fh_raw[2],
+ fh->fh_raw[3],
+ fh->fh_raw[4],
+ fh->fh_raw[5]);
return buf;
}

diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 6106697adc04..f36234c474dc 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -10,9 +10,56 @@

#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
-#include <uapi/linux/nfsd/nfsfh.h>
#include <linux/iversion.h>
#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+/*
+ * The file handle starts with a sequence of four-byte words.
+ * The first word contains a version number (1) and three descriptor bytes
+ * that tell how the remaining 3 variable length fields should be handled.
+ * These three bytes are auth_type, fsid_type and fileid_type.
+ *
+ * All four-byte values are in host-byte-order.
+ *
+ * The auth_type field is deprecated and must be set to 0.
+ *
+ * The fsid_type identifies how the filesystem (or export point) is
+ * encoded.
+ * Current values:
+ * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
+ * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
+ * says we mustn't. We must break it up and reassemble.
+ * 1 - 4 byte user specified identifier
+ * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
+ * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
+ * 4 - 4 byte inode number and 4 byte uuid
+ * 5 - 8 byte uuid
+ * 6 - 16 byte uuid
+ * 7 - 8 byte inode number and 16 byte uuid
+ *
+ * The fileid_type identified how the file within the filesystem is encoded.
+ * The values for this field are filesystem specific, exccept that
+ * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
+ * in include/linux/exportfs.h for currently registered values.
+ */
+
+struct knfsd_fh {
+ unsigned int fh_size; /*
+ * Points to the current size while
+ * building a new file handle.
+ */
+ union {
+ __u32 fh_raw[NFS4_FHSIZE/4];
+ struct {
+ __u8 fh_version; /* == 1 */
+ __u8 fh_auth_type; /* deprecated */
+ __u8 fh_fsid_type;
+ __u8 fh_fileid_type;
+ __u32 fh_fsid[]; /* flexible-array member */
+ };
+ };
+};

static inline __u32 ino_t_to_u32(ino_t ino)
{
@@ -188,7 +235,7 @@ static inline void
fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
{
dst->fh_size = src->fh_size;
- memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
+ memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
}

static __inline__ struct svc_fh *
@@ -203,7 +250,7 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
{
if (fh1->fh_size != fh2->fh_size)
return false;
- if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
+ if (memcmp(fh1->fh_raw, fh2->fh_raw, fh1->fh_size) != 0)
return false;
return true;
}
@@ -227,7 +274,7 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
*/
static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
- return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+ return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_raw, fh->fh_size);
}
#else
static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index a06c05fe3b42..082449c7d0db 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -64,7 +64,7 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp)
if (!p)
return false;
fh_init(fhp, NFS_FHSIZE);
- memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
+ memcpy(&fhp->fh_handle.fh_raw, p, NFS_FHSIZE);
fhp->fh_handle.fh_size = NFS_FHSIZE;

return true;
@@ -78,7 +78,7 @@ svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp)
p = xdr_reserve_space(xdr, NFS_FHSIZE);
if (!p)
return false;
- memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
+ memcpy(p, &fhp->fh_handle.fh_raw, NFS_FHSIZE);

return true;
}
diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h
deleted file mode 100644
index 427294dd56a1..000000000000
--- a/include/uapi/linux/nfsd/nfsfh.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * This file describes the layout of the file handles as passed
- * over the wire.
- *
- * Copyright (C) 1995, 1996, 1997 Olaf Kirch <[email protected]>
- */
-
-#ifndef _UAPI_LINUX_NFSD_FH_H
-#define _UAPI_LINUX_NFSD_FH_H
-
-#include <linux/types.h>
-#include <linux/nfs.h>
-#include <linux/nfs2.h>
-#include <linux/nfs3.h>
-#include <linux/nfs4.h>
-
-/*
- * This is the old "dentry style" Linux NFSv2 file handle.
- *
- * The xino and xdev fields are currently used to transport the
- * ino/dev of the exported inode.
- */
-struct nfs_fhbase_old {
- __u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
- __u32 fb_ino; /* our inode number */
- __u32 fb_dirino; /* dir inode number, 0 for directories */
- __u32 fb_dev; /* our device */
- __u32 fb_xdev;
- __u32 fb_xino;
- __u32 fb_generation;
-};
-
-/*
- * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
- * by Neil Brown <[email protected]> - March 2000
- *
- * The file handle starts with a sequence of four-byte words.
- * The first word contains a version number (1) and three descriptor bytes
- * that tell how the remaining 3 variable length fields should be handled.
- * These three bytes are auth_type, fsid_type and fileid_type.
- *
- * All four-byte values are in host-byte-order.
- *
- * The auth_type field is deprecated and must be set to 0.
- *
- * The fsid_type identifies how the filesystem (or export point) is
- * encoded.
- * Current values:
- * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
- * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
- * says we mustn't. We must break it up and reassemble.
- * 1 - 4 byte user specified identifier
- * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
- * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
- * 4 - 4 byte inode number and 4 byte uuid
- * 5 - 8 byte uuid
- * 6 - 16 byte uuid
- * 7 - 8 byte inode number and 16 byte uuid
- *
- * The fileid_type identified how the file within the filesystem is encoded.
- * The values for this field are filesystem specific, exccept that
- * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
- * in include/linux/exportfs.h for currently registered values.
- */
-struct nfs_fhbase_new {
- union {
- struct {
- __u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
- __u8 fb_auth_type_aux;
- __u8 fb_fsid_type_aux;
- __u8 fb_fileid_type_aux;
- __u32 fb_auth[1];
- /* __u32 fb_fsid[0]; floating */
- /* __u32 fb_fileid[0]; floating */
- };
- struct {
- __u8 fb_version; /* == 1, even => nfs_fhbase_old */
- __u8 fb_auth_type;
- __u8 fb_fsid_type;
- __u8 fb_fileid_type;
- __u32 fb_auth_flex[]; /* flexible-array member */
- };
- };
-};
-
-struct knfsd_fh {
- unsigned int fh_size; /* significant for NFSv3.
- * Points to the current size while building
- * a new file handle
- */
- union {
- struct nfs_fhbase_old fh_old;
- __u32 fh_pad[NFS4_FHSIZE/4];
- struct nfs_fhbase_new fh_new;
- } fh_base;
-};
-
-#define ofh_dcookie fh_base.fh_old.fb_dcookie
-#define ofh_ino fh_base.fh_old.fb_ino
-#define ofh_dirino fh_base.fh_old.fb_dirino
-#define ofh_dev fh_base.fh_old.fb_dev
-#define ofh_xdev fh_base.fh_old.fb_xdev
-#define ofh_xino fh_base.fh_old.fb_xino
-#define ofh_generation fh_base.fh_old.fb_generation
-
-#define fh_version fh_base.fh_new.fb_version
-#define fh_fsid_type fh_base.fh_new.fb_fsid_type
-#define fh_auth_type fh_base.fh_new.fb_auth_type
-#define fh_fileid_type fh_base.fh_new.fb_fileid_type
-#define fh_fsid fh_base.fh_new.fb_auth_flex
-
-/* Do not use, provided for userspace compatiblity. */
-#define fh_auth fh_base.fh_new.fb_auth
-
-#endif /* _UAPI_LINUX_NFSD_FH_H */
--
2.32.0



2021-08-26 06:04:19

by NeilBrown

[permalink] [raw]
Subject: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export


[[ Hi Bruce and Chuck,
I've rebased this patch on the earlier patch I sent which allows
me to use the name "fh_flags". I've also added a missing #include.
I've added the 'Acked-by' which Joesf provided earlier for the
btrfs-part. I don't have an 'ack' for the stat.h part, but no-one
has complained wither.
I think it is as ready as it can be, and am keen to know what you
think.
I'm not *very* keen on testing s_magic in nfsd code (though we
already have a couple of such tests in nfs3proc.c), but it does have
the advantage of ensuring that no other filesystem can use this
functionality without landing a patch in fs/nfsd/.

Thanks for any review that you can provide,
NeilBrown
]]

BTRFS does not provide unique inode numbers across a filesystem.
It only provides unique inode numbers within a subvolume and
uses synthetic device numbers for different subvolumes to ensure
uniqueness for device+inode.

nfsd cannot use these varying synthetic device numbers. If nfsd were to
synthesise different stable filesystem ids to give to the client, that
would cause subvolumes to appear in the mount table on the client, even
though they don't appear in the mount table on the server. Also, NFSv3
doesn't support changing the filesystem id without a new explicit mount
on the client (this is partially supported in practice, but violates the
protocol specification and has problems in some edge cases).

So currently, the roots of all subvolumes report the same inode number
in the same filesystem to NFS clients and tools like 'find' notice that
a directory has the same identity as an ancestor, and so refuse to
enter that directory.

This patch allows btrfs (or any filesystem) to provide a 64bit number
that can be xored with the inode number to make the number more unique.
Rather than the client being certain to see duplicates, with this patch
it is possible but extremely rare.

The number that btrfs provides is a swab64() version of the subvolume
identifier. This has most entropy in the high bits (the low bits of the
subvolume identifer), while the inode has most entropy in the low bits.
The result will always be unique within a subvolume, and will almost
always be unique across the filesystem.

If an upgrade of the NFS server caused all inode numbers in an exportfs
BTRFS filesystem to appear to the client to change, the client may not
handle this well. The Linux client will cause any open files to become
'stale'. If the mount point changed inode number, the whole mount would
become inaccessible.

To avoid this, an unused byte in the filehandle (fh_auth) has been
repurposed as "fh_flags". The new behaviour of uniquifying inode number
is only activated when a new flag is set.

NFSD will only set this flag in filehandles it reports if the filehandle
of the parent (provided by the client) contains the bit, or if
- the filehandle for the parent is not provided or is for a different
export and
- the filehandle refers to a BTRFS filesystem.

Thus if you have a BTRFS filesystem originally mounted from a server
without this patch, the flag will never be set and the current behaviour
will continue. Only once you re-mount the filesystem (or the filesystem
is re-auto-mounted) will the inode numbers change. When that happens,
it is likely that the filesystem st_dev number seen on the client will
change anyway.

Acked-by: Josef Bacik <[email protected]> (for BTFS change)
Signed-off-by: NeilBrown <[email protected]>
---
fs/btrfs/inode.c | 4 ++++
fs/nfsd/nfs3xdr.c | 15 ++++++++++++++-
fs/nfsd/nfs4xdr.c | 7 ++++---
fs/nfsd/nfsfh.c | 14 ++++++++++++--
fs/nfsd/nfsfh.h | 34 +++++++++++++++++++++++++++++++---
fs/nfsd/xdr3.h | 2 ++
include/linux/stat.h | 18 ++++++++++++++++++
7 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06f9f167222b..d35e2a30f25f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9195,6 +9195,10 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
generic_fillattr(&init_user_ns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;

+ if (BTRFS_I(inode)->root->root_key.objectid != BTRFS_FS_TREE_OBJECTID)
+ stat->ino_uniquifier =
+ swab64(BTRFS_I(inode)->root->root_key.objectid);
+
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 3d37923afb06..5e2d5c352ecd 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -340,6 +340,7 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
{
struct user_namespace *userns = nfsd_user_namespace(rqstp);
__be32 *p;
+ u64 ino;
u64 fsid;

p = xdr_reserve_space(xdr, XDR_UNIT * 21);
@@ -377,7 +378,8 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
p = xdr_encode_hyper(p, fsid);

/* fileid */
- p = xdr_encode_hyper(p, stat->ino);
+ ino = nfsd_uniquify_ino(fhp, stat);
+ p = xdr_encode_hyper(p, ino);

p = encode_nfstime3(p, &stat->atime);
p = encode_nfstime3(p, &stat->mtime);
@@ -1151,6 +1153,17 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name,
if (xdr_stream_encode_item_present(xdr) < 0)
return false;
/* fileid */
+ if (!resp->dir_have_uniquifier) {
+ struct kstat stat;
+ if (fh_getattr(&resp->fh, &stat) == nfs_ok)
+ resp->dir_ino_uniquifier =
+ nfsd_ino_uniquifier(&resp->fh, &stat);
+ else
+ resp->dir_ino_uniquifier = 0;
+ resp->dir_have_uniquifier = true;
+ }
+ if (resp->dir_ino_uniquifier != ino)
+ ino ^= resp->dir_ino_uniquifier;
if (xdr_stream_encode_u64(xdr, ino) < 0)
return false;
/* name */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a54b2845473b..6e31f6286e0b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3114,10 +3114,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
fhp->fh_handle.fh_size);
}
if (bmval0 & FATTR4_WORD0_FILEID) {
+ u64 ino = nfsd_uniquify_ino(fhp, &stat);
p = xdr_reserve_space(xdr, 8);
if (!p)
goto out_resource;
- p = xdr_encode_hyper(p, stat.ino);
+ p = xdr_encode_hyper(p, ino);
}
if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
p = xdr_reserve_space(xdr, 8);
@@ -3274,7 +3275,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,

p = xdr_reserve_space(xdr, 8);
if (!p)
- goto out_resource;
+ goto out_resource;
/*
* Get parent's attributes if not ignoring crossmount
* and this is the root of a cross-mounted filesystem.
@@ -3284,7 +3285,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
err = get_parent_attributes(exp, &parent_stat);
if (err)
goto out_nfserr;
- ino = parent_stat.ino;
+ ino = nfsd_uniquify_ino(fhp, &parent_stat);
}
p = xdr_encode_hyper(p, ino);
}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 7695c0f1eefe..18bb139f8bfe 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -9,6 +9,7 @@
*/

#include <linux/exportfs.h>
+#include <linux/magic.h>

#include <linux/sunrpc/svcauth_gss.h>
#include "nfsd.h"
@@ -173,7 +174,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)

if (--data_left < 0)
return error;
- if (fh->fh_auth_type != 0)
+ if ((fh->fh_flags & ~NFSD_FH_FLAG_ALL) != 0)
return error;
len = key_len(fh->fh_fsid_type) / 4;
if (len == 0)
@@ -532,6 +533,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,

struct inode * inode = d_inode(dentry);
dev_t ex_dev = exp_sb(exp)->s_dev;
+ u8 flags = 0;

dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
MAJOR(ex_dev), MINOR(ex_dev),
@@ -548,6 +550,14 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
/* If we have a ref_fh, then copy the fh_no_wcc setting from it. */
fhp->fh_no_wcc = ref_fh ? ref_fh->fh_no_wcc : false;

+ if (ref_fh && ref_fh->fh_export == exp) {
+ flags = ref_fh->fh_handle.fh_flags;
+ } else {
+ /* Set flags as needed */
+ if (exp->ex_path.mnt->mnt_sb->s_magic == BTRFS_SUPER_MAGIC)
+ flags |= NFSD_FH_FLAG_INO_UNIQUIFY;
+ }
+
if (ref_fh == fhp)
fh_put(ref_fh);

@@ -565,7 +575,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,

fhp->fh_handle.fh_size =
key_len(fhp->fh_handle.fh_fsid_type) + 4;
- fhp->fh_handle.fh_auth_type = 0;
+ fhp->fh_handle.fh_flags = flags;

mk_fsid(fhp->fh_handle.fh_fsid_type,
fhp->fh_handle.fh_fsid,
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index f36234c474dc..bbc7ddd34143 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -18,11 +18,17 @@
* The file handle starts with a sequence of four-byte words.
* The first word contains a version number (1) and three descriptor bytes
* that tell how the remaining 3 variable length fields should be handled.
- * These three bytes are auth_type, fsid_type and fileid_type.
+ * These three bytes are flags, fsid_type and fileid_type.
*
* All four-byte values are in host-byte-order.
*
- * The auth_type field is deprecated and must be set to 0.
+ * The flags field (previously auth_type) can be used when nfsd behaviour
+ * needs to change in a non-compatible way, usually for some specific
+ * filesystem. Flags should only be set in filehandles for filesystems which
+ * need them.
+ * Current values:
+ * 1 - BTRFS only. Cause stat->ino_uniquifier to be used to improve inode
+ * number uniqueness.
*
* The fsid_type identifies how the filesystem (or export point) is
* encoded.
@@ -53,7 +59,7 @@ struct knfsd_fh {
__u32 fh_raw[NFS4_FHSIZE/4];
struct {
__u8 fh_version; /* == 1 */
- __u8 fh_auth_type; /* deprecated */
+ __u8 fh_flags;
__u8 fh_fsid_type;
__u8 fh_fileid_type;
__u32 fh_fsid[]; /* flexible-array member */
@@ -131,6 +137,28 @@ enum fsid_source {
};
extern enum fsid_source fsid_source(const struct svc_fh *fhp);

+enum nfsd_fh_flags {
+ NFSD_FH_FLAG_INO_UNIQUIFY = 1, /* BTRFS only */
+
+ NFSD_FH_FLAG_ALL = 1
+};
+
+static inline u64 nfsd_ino_uniquifier(const struct svc_fh *fhp,
+ const struct kstat *stat)
+{
+ if (fhp->fh_handle.fh_flags & NFSD_FH_FLAG_INO_UNIQUIFY)
+ return stat->ino_uniquifier;
+ return 0;
+}
+
+static inline u64 nfsd_uniquify_ino(const struct svc_fh *fhp,
+ const struct kstat *stat)
+{
+ u64 u = nfsd_ino_uniquifier(fhp, stat);
+ if (u != stat->ino)
+ return stat->ino ^ u;
+ return stat->ino;
+}

/*
* This might look a little large to "inline" but in all calls except
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 933008382bbe..d9b6c8314bbb 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -179,6 +179,8 @@ struct nfsd3_readdirres {
struct xdr_buf dirlist;
struct svc_fh scratch;
struct readdir_cd common;
+ u64 dir_ino_uniquifier;
+ bool dir_have_uniquifier;
unsigned int cookie_offset;
struct svc_rqst * rqstp;

diff --git a/include/linux/stat.h b/include/linux/stat.h
index fff27e603814..0f3f74d302f8 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -46,6 +46,24 @@ struct kstat {
struct timespec64 btime; /* File creation time */
u64 blocks;
u64 mnt_id;
+ /*
+ * BTRFS does not provide unique inode numbers within a filesystem,
+ * depending on a synthetic 'dev' to provide uniqueness.
+ * NFSd cannot make use of this 'dev' number so clients often see
+ * duplicate inode numbers.
+ * For BTRFS, 'ino' is unlikely to use the high bits until the filesystem
+ * has created a great many inodes.
+ * It puts another number in ino_uniquifier which:
+ * - has most entropy in the high bits
+ * - is different precisely when 'dev' is different
+ * - is stable across unmount/remount
+ * NFSd can xor this with 'ino' to get a substantially more unique
+ * number for reporting to the client.
+ * The ino_uniquifier for a directory can reasonably be applied
+ * to inode numbers reported by the readdir filldir callback.
+ * It is NOT currently exported to user-space.
+ */
+ u64 ino_uniquifier;
};

#endif
--
2.32.0



2021-08-26 14:11:15

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH] NFSD: drop support for ancient file-handles



> On Aug 26, 2021, at 12:28 AM, NeilBrown <[email protected]> wrote:
>
>
> File-handles not in the "new" or "version 1" format have not been handed
> out for new mounts since Linux 2.4 which was released 20 years ago.
> I think it is safe to say that no such file handles are still in use,
> and that we can drop support for them.
>
> This patch also moves the nfsfh.h from the include/uapi directory into
> fs/nfsd. I can find no evidence of it being used anywhere outside the
> kernel. Certainly nfs-utils and wireshark do not use it.
>
> fh_base and fh_pad are occasionally used to refer to the whole
> filehandle. These are replaced with "fh_raw" which is hopefully more
> meaningful.
>
> Signed-off-by: NeilBrown <[email protected]>
> ---
>
> I found
> https://www.spinics.net/lists/linux-nfs/msg43280.html
> "Re: [PATCH] nfsd: clean up fh_auth usage"
> from 2014 where moving nfsfh.h out of uapi was considered but not
> actioned. Christoph said he would "do some research if the
> uapi <linux/nfsd/*.h> headers are used anywhere at all". I can find no
> report on the result of that research. My own research turned up
> nothing.
>
> Thanks,
> NeilBrown

Hi Neil-

I have no philosophical objection to this clean up, but I'm
concerned a bit about timing. It's a large patch, and 5.15
should be opening on Sunday. I would prefer this to go into
5.16, if that's OK with you?


> fs/nfsd/lockd.c | 2 +-
> fs/nfsd/nfs3xdr.c | 4 +-
> fs/nfsd/nfs4callback.c | 2 +-
> fs/nfsd/nfs4proc.c | 2 +-
> fs/nfsd/nfs4state.c | 4 +-
> fs/nfsd/nfs4xdr.c | 4 +-
> fs/nfsd/nfsctl.c | 6 +-
> fs/nfsd/nfsfh.c | 177 +++++++++++---------------------
> fs/nfsd/nfsfh.h | 55 +++++++++-
> fs/nfsd/nfsxdr.c | 4 +-
> include/uapi/linux/nfsd/nfsfh.h | 116 ---------------------
> 11 files changed, 126 insertions(+), 250 deletions(-)
> delete mode 100644 include/uapi/linux/nfsd/nfsfh.h
>
> diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
> index 3f5b3d7b62b7..74d1630e7994 100644
> --- a/fs/nfsd/lockd.c
> +++ b/fs/nfsd/lockd.c
> @@ -33,7 +33,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
> /* must initialize before using! but maxsize doesn't matter */
> fh_init(&fh,0);
> fh.fh_handle.fh_size = f->size;
> - memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
> + memcpy((char*)&fh.fh_handle.fh_raw, f->data, f->size);
> fh.fh_export = NULL;
>
> nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 0a5ebc52e6a9..3d37923afb06 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -92,7 +92,7 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp)
> return false;
> fh_init(fhp, NFS3_FHSIZE);
> fhp->fh_handle.fh_size = size;
> - memcpy(&fhp->fh_handle.fh_base, p, size);
> + memcpy(&fhp->fh_handle.fh_raw, p, size);
>
> return true;
> }
> @@ -131,7 +131,7 @@ svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
> *p++ = cpu_to_be32(size);
> if (size)
> p[XDR_QUADLEN(size) - 1] = 0;
> - memcpy(p, &fhp->fh_handle.fh_base, size);
> + memcpy(p, &fhp->fh_handle.fh_raw, size);
>
> return true;
> }
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index 0f8b10f363e7..11f8715d92d6 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -121,7 +121,7 @@ static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
>
> BUG_ON(length > NFS4_FHSIZE);
> p = xdr_reserve_space(xdr, 4 + length);
> - xdr_encode_opaque(p, &fh->fh_base, length);
> + xdr_encode_opaque(p, &fh->fh_raw, length);
> }
>
> /*
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 486c5dba4b65..4872b9519a72 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -519,7 +519,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>
> fh_put(&cstate->current_fh);
> cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
> - memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
> + memcpy(&cstate->current_fh.fh_handle.fh_raw, putfh->pf_fhval,
> putfh->pf_fhlen);
> ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
> #ifdef CONFIG_NFSD_V4_2_INTER_SSC
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index fa67ecd5fe63..d66b4be99063 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -1010,7 +1010,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
> }
> spin_unlock(&blocked_delegations_lock);
> }
> - hash = jhash(&fh->fh_base, fh->fh_size, 0);
> + hash = jhash(&fh->fh_raw, fh->fh_size, 0);
> if (test_bit(hash&255, bd->set[0]) &&
> test_bit((hash>>8)&255, bd->set[0]) &&
> test_bit((hash>>16)&255, bd->set[0]))
> @@ -1029,7 +1029,7 @@ static void block_delegations(struct knfsd_fh *fh)
> u32 hash;
> struct bloom_pair *bd = &blocked_delegations;
>
> - hash = jhash(&fh->fh_base, fh->fh_size, 0);
> + hash = jhash(&fh->fh_raw, fh->fh_size, 0);
>
> spin_lock(&blocked_delegations_lock);
> __set_bit(hash&255, bd->set[bd->new]);
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 7abeccb975b2..a54b2845473b 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -3110,7 +3110,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
> if (!p)
> goto out_resource;
> - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
> + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
> fhp->fh_handle.fh_size);
> }
> if (bmval0 & FATTR4_WORD0_FILEID) {
> @@ -3667,7 +3667,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
> p = xdr_reserve_space(xdr, len + 4);
> if (!p)
> return nfserr_resource;
> - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
> + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
> return 0;
> }
>
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index c2c3d9077dc5..449b57e5e328 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -395,12 +395,12 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
> auth_domain_put(dom);
> if (len)
> return len;
> -
> +
> mesg = buf;
> len = SIMPLE_TRANSACTION_LIMIT;
> - qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
> + qword_addhex(&mesg, &len, (char*)&fh.fh_raw, fh.fh_size);
> mesg[-1] = '\n';
> - return mesg - buf;
> + return mesg - buf;
> }
>
> /*
> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> index c475d2271f9c..7695c0f1eefe 100644
> --- a/fs/nfsd/nfsfh.c
> +++ b/fs/nfsd/nfsfh.c
> @@ -154,11 +154,12 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
> static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
> {
> struct knfsd_fh *fh = &fhp->fh_handle;
> - struct fid *fid = NULL, sfid;
> + struct fid *fid = NULL;
> struct svc_export *exp;
> struct dentry *dentry;
> int fileid_type;
> int data_left = fh->fh_size/4;
> + int len;
> __be32 error;
>
> error = nfserr_stale;
> @@ -167,48 +168,35 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
> if (rqstp->rq_vers == 4 && fh->fh_size == 0)
> return nfserr_nofilehandle;
>
> - if (fh->fh_version == 1) {
> - int len;
> -
> - if (--data_left < 0)
> - return error;
> - if (fh->fh_auth_type != 0)
> - return error;
> - len = key_len(fh->fh_fsid_type) / 4;
> - if (len == 0)
> - return error;
> - if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
> - /* deprecated, convert to type 3 */
> - len = key_len(FSID_ENCODE_DEV)/4;
> - fh->fh_fsid_type = FSID_ENCODE_DEV;
> - /*
> - * struct knfsd_fh uses host-endian fields, which are
> - * sometimes used to hold net-endian values. This
> - * confuses sparse, so we must use __force here to
> - * keep it from complaining.
> - */
> - fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
> - ntohl((__force __be32)fh->fh_fsid[1])));
> - fh->fh_fsid[1] = fh->fh_fsid[2];
> - }
> - data_left -= len;
> - if (data_left < 0)
> - return error;
> - exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
> - fid = (struct fid *)(fh->fh_fsid + len);
> - } else {
> - __u32 tfh[2];
> - dev_t xdev;
> - ino_t xino;
> -
> - if (fh->fh_size != NFS_FHSIZE)
> - return error;
> - /* assume old filehandle format */
> - xdev = old_decode_dev(fh->ofh_xdev);
> - xino = u32_to_ino_t(fh->ofh_xino);
> - mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
> - exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
> + if (fh->fh_version != 1)
> + return error;
> +
> + if (--data_left < 0)
> + return error;
> + if (fh->fh_auth_type != 0)
> + return error;
> + len = key_len(fh->fh_fsid_type) / 4;
> + if (len == 0)
> + return error;
> + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
> + /* deprecated, convert to type 3 */
> + len = key_len(FSID_ENCODE_DEV)/4;
> + fh->fh_fsid_type = FSID_ENCODE_DEV;
> + /*
> + * struct knfsd_fh uses host-endian fields, which are
> + * sometimes used to hold net-endian values. This
> + * confuses sparse, so we must use __force here to
> + * keep it from complaining.
> + */
> + fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
> + ntohl((__force __be32)fh->fh_fsid[1])));
> + fh->fh_fsid[1] = fh->fh_fsid[2];
> }
> + data_left -= len;
> + if (data_left < 0)
> + return error;
> + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
> + fid = (struct fid *)(fh->fh_fsid + len);
>
> error = nfserr_stale;
> if (IS_ERR(exp)) {
> @@ -253,18 +241,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
> if (rqstp->rq_vers > 2)
> error = nfserr_badhandle;
>
> - if (fh->fh_version != 1) {
> - sfid.i32.ino = fh->ofh_ino;
> - sfid.i32.gen = fh->ofh_generation;
> - sfid.i32.parent_ino = fh->ofh_dirino;
> - fid = &sfid;
> - data_left = 3;
> - if (fh->ofh_dirino == 0)
> - fileid_type = FILEID_INO32_GEN;
> - else
> - fileid_type = FILEID_INO32_GEN_PARENT;
> - } else
> - fileid_type = fh->fh_fileid_type;
> + fileid_type = fh->fh_fileid_type;
>
> if (fileid_type == FILEID_ROOT)
> dentry = dget(exp->ex_path.dentry);
> @@ -452,20 +429,6 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
> }
> }
>
> -/*
> - * for composing old style file handles
> - */
> -static inline void _fh_update_old(struct dentry *dentry,
> - struct svc_export *exp,
> - struct knfsd_fh *fh)
> -{
> - fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino);
> - fh->ofh_generation = d_inode(dentry)->i_generation;
> - if (d_is_dir(dentry) ||
> - (exp->ex_flags & NFSEXP_NOSUBTREECHECK))
> - fh->ofh_dirino = 0;
> -}
> -
> static bool is_root_export(struct svc_export *exp)
> {
> return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
> @@ -600,35 +563,21 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
> fhp->fh_dentry = dget(dentry); /* our internal copy */
> fhp->fh_export = exp_get(exp);
>
> - if (fhp->fh_handle.fh_version == 0xca) {
> - /* old style filehandle please */
> - memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
> - fhp->fh_handle.fh_size = NFS_FHSIZE;
> - fhp->fh_handle.ofh_dcookie = 0xfeebbaca;
> - fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev);
> - fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
> - fhp->fh_handle.ofh_xino =
> - ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
> - fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
> - if (inode)
> - _fh_update_old(dentry, exp, &fhp->fh_handle);
> - } else {
> - fhp->fh_handle.fh_size =
> - key_len(fhp->fh_handle.fh_fsid_type) + 4;
> - fhp->fh_handle.fh_auth_type = 0;
> -
> - mk_fsid(fhp->fh_handle.fh_fsid_type,
> - fhp->fh_handle.fh_fsid,
> - ex_dev,
> - d_inode(exp->ex_path.dentry)->i_ino,
> - exp->ex_fsid, exp->ex_uuid);
> -
> - if (inode)
> - _fh_update(fhp, exp, dentry);
> - if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
> - fh_put(fhp);
> - return nfserr_opnotsupp;
> - }
> + fhp->fh_handle.fh_size =
> + key_len(fhp->fh_handle.fh_fsid_type) + 4;
> + fhp->fh_handle.fh_auth_type = 0;
> +
> + mk_fsid(fhp->fh_handle.fh_fsid_type,
> + fhp->fh_handle.fh_fsid,
> + ex_dev,
> + d_inode(exp->ex_path.dentry)->i_ino,
> + exp->ex_fsid, exp->ex_uuid);
> +
> + if (inode)
> + _fh_update(fhp, exp, dentry);
> + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
> + fh_put(fhp);
> + return nfserr_opnotsupp;
> }
>
> return 0;
> @@ -649,23 +598,19 @@ fh_update(struct svc_fh *fhp)
> dentry = fhp->fh_dentry;
> if (d_really_is_negative(dentry))
> goto out_negative;
> - if (fhp->fh_handle.fh_version != 1) {
> - _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
> - } else {
> - if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
> - return 0;
> + if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
> + return 0;
>
> - _fh_update(fhp, fhp->fh_export, dentry);
> - if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
> - return nfserr_opnotsupp;
> - }
> + _fh_update(fhp, fhp->fh_export, dentry);
> + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
> + return nfserr_opnotsupp;
> return 0;
> out_bad:
> printk(KERN_ERR "fh_update: fh not verified!\n");
> return nfserr_serverfault;
> out_negative:
> printk(KERN_ERR "fh_update: %pd2 still negative!\n",
> - dentry);
> + dentry);
> return nfserr_serverfault;
> }
>
> @@ -702,12 +647,12 @@ char * SVCFH_fmt(struct svc_fh *fhp)
> static char buf[80];
> sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x",
> fh->fh_size,
> - fh->fh_base.fh_pad[0],
> - fh->fh_base.fh_pad[1],
> - fh->fh_base.fh_pad[2],
> - fh->fh_base.fh_pad[3],
> - fh->fh_base.fh_pad[4],
> - fh->fh_base.fh_pad[5]);
> + fh->fh_raw[0],
> + fh->fh_raw[1],
> + fh->fh_raw[2],
> + fh->fh_raw[3],
> + fh->fh_raw[4],
> + fh->fh_raw[5]);
> return buf;
> }
>
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index 6106697adc04..f36234c474dc 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -10,9 +10,56 @@
>
> #include <linux/crc32.h>
> #include <linux/sunrpc/svc.h>
> -#include <uapi/linux/nfsd/nfsfh.h>
> #include <linux/iversion.h>
> #include <linux/exportfs.h>
> +#include <linux/nfs4.h>
> +
> +/*
> + * The file handle starts with a sequence of four-byte words.
> + * The first word contains a version number (1) and three descriptor bytes
> + * that tell how the remaining 3 variable length fields should be handled.
> + * These three bytes are auth_type, fsid_type and fileid_type.
> + *
> + * All four-byte values are in host-byte-order.
> + *
> + * The auth_type field is deprecated and must be set to 0.
> + *
> + * The fsid_type identifies how the filesystem (or export point) is
> + * encoded.
> + * Current values:
> + * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
> + * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
> + * says we mustn't. We must break it up and reassemble.
> + * 1 - 4 byte user specified identifier
> + * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
> + * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
> + * 4 - 4 byte inode number and 4 byte uuid
> + * 5 - 8 byte uuid
> + * 6 - 16 byte uuid
> + * 7 - 8 byte inode number and 16 byte uuid
> + *
> + * The fileid_type identified how the file within the filesystem is encoded.
> + * The values for this field are filesystem specific, exccept that
> + * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
> + * in include/linux/exportfs.h for currently registered values.
> + */
> +
> +struct knfsd_fh {
> + unsigned int fh_size; /*
> + * Points to the current size while
> + * building a new file handle.
> + */
> + union {
> + __u32 fh_raw[NFS4_FHSIZE/4];
> + struct {
> + __u8 fh_version; /* == 1 */
> + __u8 fh_auth_type; /* deprecated */
> + __u8 fh_fsid_type;
> + __u8 fh_fileid_type;
> + __u32 fh_fsid[]; /* flexible-array member */
> + };
> + };
> +};
>
> static inline __u32 ino_t_to_u32(ino_t ino)
> {
> @@ -188,7 +235,7 @@ static inline void
> fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
> {
> dst->fh_size = src->fh_size;
> - memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
> + memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
> }
>
> static __inline__ struct svc_fh *
> @@ -203,7 +250,7 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
> {
> if (fh1->fh_size != fh2->fh_size)
> return false;
> - if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
> + if (memcmp(fh1->fh_raw, fh2->fh_raw, fh1->fh_size) != 0)
> return false;
> return true;
> }
> @@ -227,7 +274,7 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
> */
> static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
> {
> - return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
> + return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_raw, fh->fh_size);
> }
> #else
> static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
> diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
> index a06c05fe3b42..082449c7d0db 100644
> --- a/fs/nfsd/nfsxdr.c
> +++ b/fs/nfsd/nfsxdr.c
> @@ -64,7 +64,7 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp)
> if (!p)
> return false;
> fh_init(fhp, NFS_FHSIZE);
> - memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
> + memcpy(&fhp->fh_handle.fh_raw, p, NFS_FHSIZE);
> fhp->fh_handle.fh_size = NFS_FHSIZE;
>
> return true;
> @@ -78,7 +78,7 @@ svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp)
> p = xdr_reserve_space(xdr, NFS_FHSIZE);
> if (!p)
> return false;
> - memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
> + memcpy(p, &fhp->fh_handle.fh_raw, NFS_FHSIZE);
>
> return true;
> }
> diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h
> deleted file mode 100644
> index 427294dd56a1..000000000000
> --- a/include/uapi/linux/nfsd/nfsfh.h
> +++ /dev/null
> @@ -1,116 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> -/*
> - * This file describes the layout of the file handles as passed
> - * over the wire.
> - *
> - * Copyright (C) 1995, 1996, 1997 Olaf Kirch <[email protected]>
> - */
> -
> -#ifndef _UAPI_LINUX_NFSD_FH_H
> -#define _UAPI_LINUX_NFSD_FH_H
> -
> -#include <linux/types.h>
> -#include <linux/nfs.h>
> -#include <linux/nfs2.h>
> -#include <linux/nfs3.h>
> -#include <linux/nfs4.h>
> -
> -/*
> - * This is the old "dentry style" Linux NFSv2 file handle.
> - *
> - * The xino and xdev fields are currently used to transport the
> - * ino/dev of the exported inode.
> - */
> -struct nfs_fhbase_old {
> - __u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
> - __u32 fb_ino; /* our inode number */
> - __u32 fb_dirino; /* dir inode number, 0 for directories */
> - __u32 fb_dev; /* our device */
> - __u32 fb_xdev;
> - __u32 fb_xino;
> - __u32 fb_generation;
> -};
> -
> -/*
> - * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
> - * by Neil Brown <[email protected]> - March 2000
> - *
> - * The file handle starts with a sequence of four-byte words.
> - * The first word contains a version number (1) and three descriptor bytes
> - * that tell how the remaining 3 variable length fields should be handled.
> - * These three bytes are auth_type, fsid_type and fileid_type.
> - *
> - * All four-byte values are in host-byte-order.
> - *
> - * The auth_type field is deprecated and must be set to 0.
> - *
> - * The fsid_type identifies how the filesystem (or export point) is
> - * encoded.
> - * Current values:
> - * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
> - * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
> - * says we mustn't. We must break it up and reassemble.
> - * 1 - 4 byte user specified identifier
> - * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
> - * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
> - * 4 - 4 byte inode number and 4 byte uuid
> - * 5 - 8 byte uuid
> - * 6 - 16 byte uuid
> - * 7 - 8 byte inode number and 16 byte uuid
> - *
> - * The fileid_type identified how the file within the filesystem is encoded.
> - * The values for this field are filesystem specific, exccept that
> - * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
> - * in include/linux/exportfs.h for currently registered values.
> - */
> -struct nfs_fhbase_new {
> - union {
> - struct {
> - __u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
> - __u8 fb_auth_type_aux;
> - __u8 fb_fsid_type_aux;
> - __u8 fb_fileid_type_aux;
> - __u32 fb_auth[1];
> - /* __u32 fb_fsid[0]; floating */
> - /* __u32 fb_fileid[0]; floating */
> - };
> - struct {
> - __u8 fb_version; /* == 1, even => nfs_fhbase_old */
> - __u8 fb_auth_type;
> - __u8 fb_fsid_type;
> - __u8 fb_fileid_type;
> - __u32 fb_auth_flex[]; /* flexible-array member */
> - };
> - };
> -};
> -
> -struct knfsd_fh {
> - unsigned int fh_size; /* significant for NFSv3.
> - * Points to the current size while building
> - * a new file handle
> - */
> - union {
> - struct nfs_fhbase_old fh_old;
> - __u32 fh_pad[NFS4_FHSIZE/4];
> - struct nfs_fhbase_new fh_new;
> - } fh_base;
> -};
> -
> -#define ofh_dcookie fh_base.fh_old.fb_dcookie
> -#define ofh_ino fh_base.fh_old.fb_ino
> -#define ofh_dirino fh_base.fh_old.fb_dirino
> -#define ofh_dev fh_base.fh_old.fb_dev
> -#define ofh_xdev fh_base.fh_old.fb_xdev
> -#define ofh_xino fh_base.fh_old.fb_xino
> -#define ofh_generation fh_base.fh_old.fb_generation
> -
> -#define fh_version fh_base.fh_new.fb_version
> -#define fh_fsid_type fh_base.fh_new.fb_fsid_type
> -#define fh_auth_type fh_base.fh_new.fb_auth_type
> -#define fh_fileid_type fh_base.fh_new.fb_fileid_type
> -#define fh_fsid fh_base.fh_new.fb_auth_flex
> -
> -/* Do not use, provided for userspace compatiblity. */
> -#define fh_auth fh_base.fh_new.fb_auth
> -
> -#endif /* _UAPI_LINUX_NFSD_FH_H */
> --
> 2.32.0
>
>

--
Chuck Lever



2021-08-26 14:54:02

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] NFSD: drop support for ancient file-handles

On Thu, Aug 26, 2021 at 02:28:15PM +1000, NeilBrown wrote:
>
> File-handles not in the "new" or "version 1" format have not been handed
> out for new mounts since Linux 2.4 which was released 20 years ago.
> I think it is safe to say that no such file handles are still in use,
> and that we can drop support for them.
>
> This patch also moves the nfsfh.h from the include/uapi directory into
> fs/nfsd. I can find no evidence of it being used anywhere outside the
> kernel. Certainly nfs-utils and wireshark do not use it.
>
> fh_base and fh_pad are occasionally used to refer to the whole
> filehandle. These are replaced with "fh_raw" which is hopefully more
> meaningful.

Oh boy, I will not miss those (fh_version == 1) cases in nfsfh.c.
Excellent.

Looks like it just needs the following folded in.

--b.

diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index db7ef07ae50c..2e2f1d5e9f62 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -61,7 +61,7 @@ nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
goto out_error;

fl->fh.size = fhp->fh_handle.fh_size;
- memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
+ memcpy(fl->fh.data, &fhp->fh_handle.fh_raw, fl->fh.size);

/* Give whole file layout segments */
seg->offset = 0;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4872b9519a72..3f7e59ec4e32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1383,7 +1383,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
s_fh = &cstate->save_fh;

copy->c_fh.size = s_fh->fh_handle.fh_size;
- memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_base, copy->c_fh.size);
+ memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_raw, copy->c_fh.size);
copy->stateid.seqid = cpu_to_be32(s_stid->si_generation);
memcpy(copy->stateid.other, (void *)&s_stid->si_opaque,
sizeof(stateid_opaque_t));

2021-08-26 20:19:46

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Thu, Aug 26, 2021 at 04:03:04PM +1000, NeilBrown wrote:
>
> [[ Hi Bruce and Chuck,
> I've rebased this patch on the earlier patch I sent which allows
> me to use the name "fh_flags". I've also added a missing #include.
> I've added the 'Acked-by' which Joesf provided earlier for the
> btrfs-part. I don't have an 'ack' for the stat.h part, but no-one
> has complained wither.
> I think it is as ready as it can be, and am keen to know what you
> think.
> I'm not *very* keen on testing s_magic in nfsd code (though we
> already have a couple of such tests in nfs3proc.c), but it does have
> the advantage of ensuring that no other filesystem can use this
> functionality without landing a patch in fs/nfsd/.
>
> Thanks for any review that you can provide,
> NeilBrown
> ]]

This seems hairy, but *somebody* has hated every single solution
proposed, so, argh, I don't know, maybe it's best.

There was a ton of "but why can't we just..." in previous threads, could
we include URLs for those and/or the lwn articles? E.g.:

https://lore.kernel.org/linux-nfs/[email protected]/#b
https://lwn.net/Articles/866709/

> BTRFS does not provide unique inode numbers across a filesystem.
> It only provides unique inode numbers within a subvolume and
> uses synthetic device numbers for different subvolumes to ensure
> uniqueness for device+inode.
>
> nfsd cannot use these varying synthetic device numbers. If nfsd were to
> synthesise different stable filesystem ids to give to the client, that
> would cause subvolumes to appear in the mount table on the client, even
> though they don't appear in the mount table on the server. Also, NFSv3
> doesn't support changing the filesystem id without a new explicit mount
> on the client (this is partially supported in practice, but violates the
> protocol specification and has problems in some edge cases).
>
> So currently, the roots of all subvolumes report the same inode number
> in the same filesystem to NFS clients and tools like 'find' notice that
> a directory has the same identity as an ancestor, and so refuse to
> enter that directory.
>
> This patch allows btrfs (or any filesystem) to provide a 64bit number
> that can be xored with the inode number to make the number more unique.
> Rather than the client being certain to see duplicates, with this patch
> it is possible but extremely rare.
>
> The number that btrfs provides is a swab64() version of the subvolume
> identifier. This has most entropy in the high bits (the low bits of the
> subvolume identifer), while the inode has most entropy in the low bits.
> The result will always be unique within a subvolume, and will almost
> always be unique across the filesystem.
>
> If an upgrade of the NFS server caused all inode numbers in an exportfs
> BTRFS filesystem to appear to the client to change, the client may not
> handle this well. The Linux client will cause any open files to become
> 'stale'. If the mount point changed inode number, the whole mount would
> become inaccessible.
>
> To avoid this, an unused byte in the filehandle (fh_auth) has been
> repurposed as "fh_flags". The new behaviour of uniquifying inode number
> is only activated when a new flag is set.
>
> NFSD will only set this flag in filehandles it reports if the filehandle
> of the parent (provided by the client) contains the bit, or if
> - the filehandle for the parent is not provided or is for a different
> export and
> - the filehandle refers to a BTRFS filesystem.
>
> Thus if you have a BTRFS filesystem originally mounted from a server
> without this patch, the flag will never be set and the current behaviour
> will continue. Only once you re-mount the filesystem (or the filesystem
> is re-auto-mounted) will the inode numbers change. When that happens,
> it is likely that the filesystem st_dev number seen on the client will
> change anyway.
>
> Acked-by: Josef Bacik <[email protected]> (for BTFS change)

s/BTFS/BTRFS/.

> Signed-off-by: NeilBrown <[email protected]>
> ---
> fs/btrfs/inode.c | 4 ++++
> fs/nfsd/nfs3xdr.c | 15 ++++++++++++++-
> fs/nfsd/nfs4xdr.c | 7 ++++---
> fs/nfsd/nfsfh.c | 14 ++++++++++++--
> fs/nfsd/nfsfh.h | 34 +++++++++++++++++++++++++++++++---
> fs/nfsd/xdr3.h | 2 ++
> include/linux/stat.h | 18 ++++++++++++++++++
> 7 files changed, 85 insertions(+), 9 deletions(-)
>
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 06f9f167222b..d35e2a30f25f 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9195,6 +9195,10 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
> generic_fillattr(&init_user_ns, inode, stat);
> stat->dev = BTRFS_I(inode)->root->anon_dev;
>
> + if (BTRFS_I(inode)->root->root_key.objectid != BTRFS_FS_TREE_OBJECTID)
> + stat->ino_uniquifier =
> + swab64(BTRFS_I(inode)->root->root_key.objectid);
> +
> spin_lock(&BTRFS_I(inode)->lock);
> delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
> inode_bytes = inode_get_bytes(inode);
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 3d37923afb06..5e2d5c352ecd 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -340,6 +340,7 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
> {
> struct user_namespace *userns = nfsd_user_namespace(rqstp);
> __be32 *p;
> + u64 ino;
> u64 fsid;
>
> p = xdr_reserve_space(xdr, XDR_UNIT * 21);
> @@ -377,7 +378,8 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
> p = xdr_encode_hyper(p, fsid);
>
> /* fileid */
> - p = xdr_encode_hyper(p, stat->ino);
> + ino = nfsd_uniquify_ino(fhp, stat);
> + p = xdr_encode_hyper(p, ino);
>
> p = encode_nfstime3(p, &stat->atime);
> p = encode_nfstime3(p, &stat->mtime);
> @@ -1151,6 +1153,17 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name,
> if (xdr_stream_encode_item_present(xdr) < 0)
> return false;
> /* fileid */
> + if (!resp->dir_have_uniquifier) {
> + struct kstat stat;
> + if (fh_getattr(&resp->fh, &stat) == nfs_ok)
> + resp->dir_ino_uniquifier =
> + nfsd_ino_uniquifier(&resp->fh, &stat);
> + else
> + resp->dir_ino_uniquifier = 0;
> + resp->dir_have_uniquifier = true;

This took me a minute. So we're assuming the uniquifier stays the same
across a directory and its children (because you can't hard link across
subvolumes), and this code is just caching the uniquifier for use across
the directory--is that right?

> + }
> + if (resp->dir_ino_uniquifier != ino)
> + ino ^= resp->dir_ino_uniquifier;

I guess this check (here and in nfsd_uniquify_ino) is just to prevent
returning inode number zero?

--b.

> if (xdr_stream_encode_u64(xdr, ino) < 0)
> return false;
> /* name */
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index a54b2845473b..6e31f6286e0b 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -3114,10 +3114,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> fhp->fh_handle.fh_size);
> }
> if (bmval0 & FATTR4_WORD0_FILEID) {
> + u64 ino = nfsd_uniquify_ino(fhp, &stat);
> p = xdr_reserve_space(xdr, 8);
> if (!p)
> goto out_resource;
> - p = xdr_encode_hyper(p, stat.ino);
> + p = xdr_encode_hyper(p, ino);
> }
> if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
> p = xdr_reserve_space(xdr, 8);
> @@ -3274,7 +3275,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>
> p = xdr_reserve_space(xdr, 8);
> if (!p)
> - goto out_resource;
> + goto out_resource;
> /*
> * Get parent's attributes if not ignoring crossmount
> * and this is the root of a cross-mounted filesystem.
> @@ -3284,7 +3285,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> err = get_parent_attributes(exp, &parent_stat);
> if (err)
> goto out_nfserr;
> - ino = parent_stat.ino;
> + ino = nfsd_uniquify_ino(fhp, &parent_stat);
> }
> p = xdr_encode_hyper(p, ino);
> }
> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> index 7695c0f1eefe..18bb139f8bfe 100644
> --- a/fs/nfsd/nfsfh.c
> +++ b/fs/nfsd/nfsfh.c
> @@ -9,6 +9,7 @@
> */
>
> #include <linux/exportfs.h>
> +#include <linux/magic.h>
>
> #include <linux/sunrpc/svcauth_gss.h>
> #include "nfsd.h"
> @@ -173,7 +174,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
>
> if (--data_left < 0)
> return error;
> - if (fh->fh_auth_type != 0)
> + if ((fh->fh_flags & ~NFSD_FH_FLAG_ALL) != 0)
> return error;
> len = key_len(fh->fh_fsid_type) / 4;
> if (len == 0)
> @@ -532,6 +533,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
>
> struct inode * inode = d_inode(dentry);
> dev_t ex_dev = exp_sb(exp)->s_dev;
> + u8 flags = 0;
>
> dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
> MAJOR(ex_dev), MINOR(ex_dev),
> @@ -548,6 +550,14 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
> /* If we have a ref_fh, then copy the fh_no_wcc setting from it. */
> fhp->fh_no_wcc = ref_fh ? ref_fh->fh_no_wcc : false;
>
> + if (ref_fh && ref_fh->fh_export == exp) {
> + flags = ref_fh->fh_handle.fh_flags;
> + } else {
> + /* Set flags as needed */
> + if (exp->ex_path.mnt->mnt_sb->s_magic == BTRFS_SUPER_MAGIC)
> + flags |= NFSD_FH_FLAG_INO_UNIQUIFY;
> + }
> +
> if (ref_fh == fhp)
> fh_put(ref_fh);
>
> @@ -565,7 +575,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
>
> fhp->fh_handle.fh_size =
> key_len(fhp->fh_handle.fh_fsid_type) + 4;
> - fhp->fh_handle.fh_auth_type = 0;
> + fhp->fh_handle.fh_flags = flags;
>
> mk_fsid(fhp->fh_handle.fh_fsid_type,
> fhp->fh_handle.fh_fsid,
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index f36234c474dc..bbc7ddd34143 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -18,11 +18,17 @@
> * The file handle starts with a sequence of four-byte words.
> * The first word contains a version number (1) and three descriptor bytes
> * that tell how the remaining 3 variable length fields should be handled.
> - * These three bytes are auth_type, fsid_type and fileid_type.
> + * These three bytes are flags, fsid_type and fileid_type.
> *
> * All four-byte values are in host-byte-order.
> *
> - * The auth_type field is deprecated and must be set to 0.
> + * The flags field (previously auth_type) can be used when nfsd behaviour
> + * needs to change in a non-compatible way, usually for some specific
> + * filesystem. Flags should only be set in filehandles for filesystems which
> + * need them.
> + * Current values:
> + * 1 - BTRFS only. Cause stat->ino_uniquifier to be used to improve inode
> + * number uniqueness.
> *
> * The fsid_type identifies how the filesystem (or export point) is
> * encoded.
> @@ -53,7 +59,7 @@ struct knfsd_fh {
> __u32 fh_raw[NFS4_FHSIZE/4];
> struct {
> __u8 fh_version; /* == 1 */
> - __u8 fh_auth_type; /* deprecated */
> + __u8 fh_flags;
> __u8 fh_fsid_type;
> __u8 fh_fileid_type;
> __u32 fh_fsid[]; /* flexible-array member */
> @@ -131,6 +137,28 @@ enum fsid_source {
> };
> extern enum fsid_source fsid_source(const struct svc_fh *fhp);
>
> +enum nfsd_fh_flags {
> + NFSD_FH_FLAG_INO_UNIQUIFY = 1, /* BTRFS only */
> +
> + NFSD_FH_FLAG_ALL = 1
> +};
> +
> +static inline u64 nfsd_ino_uniquifier(const struct svc_fh *fhp,
> + const struct kstat *stat)
> +{
> + if (fhp->fh_handle.fh_flags & NFSD_FH_FLAG_INO_UNIQUIFY)
> + return stat->ino_uniquifier;
> + return 0;
> +}
> +
> +static inline u64 nfsd_uniquify_ino(const struct svc_fh *fhp,
> + const struct kstat *stat)
> +{
> + u64 u = nfsd_ino_uniquifier(fhp, stat);
> + if (u != stat->ino)
> + return stat->ino ^ u;
> + return stat->ino;
> +}
>
> /*
> * This might look a little large to "inline" but in all calls except
> diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
> index 933008382bbe..d9b6c8314bbb 100644
> --- a/fs/nfsd/xdr3.h
> +++ b/fs/nfsd/xdr3.h
> @@ -179,6 +179,8 @@ struct nfsd3_readdirres {
> struct xdr_buf dirlist;
> struct svc_fh scratch;
> struct readdir_cd common;
> + u64 dir_ino_uniquifier;
> + bool dir_have_uniquifier;
> unsigned int cookie_offset;
> struct svc_rqst * rqstp;
>
> diff --git a/include/linux/stat.h b/include/linux/stat.h
> index fff27e603814..0f3f74d302f8 100644
> --- a/include/linux/stat.h
> +++ b/include/linux/stat.h
> @@ -46,6 +46,24 @@ struct kstat {
> struct timespec64 btime; /* File creation time */
> u64 blocks;
> u64 mnt_id;
> + /*
> + * BTRFS does not provide unique inode numbers within a filesystem,
> + * depending on a synthetic 'dev' to provide uniqueness.
> + * NFSd cannot make use of this 'dev' number so clients often see
> + * duplicate inode numbers.
> + * For BTRFS, 'ino' is unlikely to use the high bits until the filesystem
> + * has created a great many inodes.
> + * It puts another number in ino_uniquifier which:
> + * - has most entropy in the high bits
> + * - is different precisely when 'dev' is different
> + * - is stable across unmount/remount
> + * NFSd can xor this with 'ino' to get a substantially more unique
> + * number for reporting to the client.
> + * The ino_uniquifier for a directory can reasonably be applied
> + * to inode numbers reported by the readdir filldir callback.
> + * It is NOT currently exported to user-space.
> + */
> + u64 ino_uniquifier;
> };
>
> #endif
> --
> 2.32.0
>
>

2021-08-26 21:38:44

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH] NFSD: drop support for ancient file-handles

On Fri, 27 Aug 2021, Chuck Lever III wrote:
>
> > On Aug 26, 2021, at 12:28 AM, NeilBrown <[email protected]> wrote:
> >
> >
> > File-handles not in the "new" or "version 1" format have not been handed
> > out for new mounts since Linux 2.4 which was released 20 years ago.
> > I think it is safe to say that no such file handles are still in use,
> > and that we can drop support for them.
> >
> > This patch also moves the nfsfh.h from the include/uapi directory into
> > fs/nfsd. I can find no evidence of it being used anywhere outside the
> > kernel. Certainly nfs-utils and wireshark do not use it.
> >
> > fh_base and fh_pad are occasionally used to refer to the whole
> > filehandle. These are replaced with "fh_raw" which is hopefully more
> > meaningful.
> >
> > Signed-off-by: NeilBrown <[email protected]>
> > ---
> >
> > I found
> > https://www.spinics.net/lists/linux-nfs/msg43280.html
> > "Re: [PATCH] nfsd: clean up fh_auth usage"
> > from 2014 where moving nfsfh.h out of uapi was considered but not
> > actioned. Christoph said he would "do some research if the
> > uapi <linux/nfsd/*.h> headers are used anywhere at all". I can find no
> > report on the result of that research. My own research turned up
> > nothing.
> >
> > Thanks,
> > NeilBrown
>
> Hi Neil-
>
> I have no philosophical objection to this clean up, but I'm
> concerned a bit about timing. It's a large patch, and 5.15
> should be opening on Sunday. I would prefer this to go into
> 5.16, if that's OK with you?

No problem at all. I enjoy the luxury of send patches whenever I'm
ready, and assume you will process them only when you are ready. I do,
of course, appreciate knowing what you plan.

Thanks,
NeilBrown

2021-08-26 21:42:08

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH] NFSD: drop support for ancient file-handles

On Fri, 27 Aug 2021, J. Bruce Fields wrote:
> On Thu, Aug 26, 2021 at 02:28:15PM +1000, NeilBrown wrote:
> >
> > File-handles not in the "new" or "version 1" format have not been handed
> > out for new mounts since Linux 2.4 which was released 20 years ago.
> > I think it is safe to say that no such file handles are still in use,
> > and that we can drop support for them.
> >
> > This patch also moves the nfsfh.h from the include/uapi directory into
> > fs/nfsd. I can find no evidence of it being used anywhere outside the
> > kernel. Certainly nfs-utils and wireshark do not use it.
> >
> > fh_base and fh_pad are occasionally used to refer to the whole
> > filehandle. These are replaced with "fh_raw" which is hopefully more
> > meaningful.
>
> Oh boy, I will not miss those (fh_version == 1) cases in nfsfh.c.
> Excellent.

:-)

>
> Looks like it just needs the following folded in.

Don't know how I missed those - so happy to have reliable reviewers!

Thanks,
NeilBrown

2021-08-26 22:11:14

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Fri, 27 Aug 2021, J. Bruce Fields wrote:
> On Thu, Aug 26, 2021 at 04:03:04PM +1000, NeilBrown wrote:
> >
> > [[ Hi Bruce and Chuck,
> > I've rebased this patch on the earlier patch I sent which allows
> > me to use the name "fh_flags". I've also added a missing #include.
> > I've added the 'Acked-by' which Joesf provided earlier for the
> > btrfs-part. I don't have an 'ack' for the stat.h part, but no-one
> > has complained wither.
> > I think it is as ready as it can be, and am keen to know what you
> > think.
> > I'm not *very* keen on testing s_magic in nfsd code (though we
> > already have a couple of such tests in nfs3proc.c), but it does have
> > the advantage of ensuring that no other filesystem can use this
> > functionality without landing a patch in fs/nfsd/.
> >
> > Thanks for any review that you can provide,
> > NeilBrown
> > ]]
>
> This seems hairy, but *somebody* has hated every single solution
> proposed, so, argh, I don't know, maybe it's best.

People don't like change I guess :-)
I think we need the fh_flags stuff for almost any fix, else existing
mounts could break when the server is upgraded. This could be needed
for any filesystem that has flawed NFS export support and needs to
seemlessly repair it.
We *might* be able to avoid that is I xored the uniquifier with the fsid
instead of the fileid (I'd have to test), but that has other problems
like polluting the client's mount table and mounted-on-fileid being hard
to manage - especially for NFSv3.
The rest is the minimum that actually achieves something.

I could still agonise of whether the swap-bits instead of swap-bytes,
and whether to leave a few high bits free for e.g. overlay. But I won't
lose sleep over it.

>
> There was a ton of "but why can't we just..." in previous threads, could
> we include URLs for those and/or the lwn articles? E.g.:
>
> https://lore.kernel.org/linux-nfs/[email protected]/#b
> https://lwn.net/Articles/866709/

I've add Link: lines. The are a couple of other threads that maybe I
could like.

> > Acked-by: Josef Bacik <[email protected]> (for BTFS change)
>
> s/BTFS/BTRFS/.

Ack.
> > /* fileid */
> > + if (!resp->dir_have_uniquifier) {
> > + struct kstat stat;
> > + if (fh_getattr(&resp->fh, &stat) == nfs_ok)
> > + resp->dir_ino_uniquifier =
> > + nfsd_ino_uniquifier(&resp->fh, &stat);
> > + else
> > + resp->dir_ino_uniquifier = 0;
> > + resp->dir_have_uniquifier = true;
>
> This took me a minute. So we're assuming the uniquifier stays the same
> across a directory and its children (because you can't hard link across
> subvolumes), and this code is just caching the uniquifier for use across
> the directory--is that right?

Yep. I think I originally planned to set dir_ino_uniquifier closer to
"open", but there wasn't a convenient place to do that, so I did it here
and added the "dir_have_uniquifier" flag.

The comment in stat.h affirms that the uniquifier for a directory can be
used for inodes reported by readdir. Note that the inode numbers might
be different to those returned by a lookup of the name - just like with
mountpoints.


>
> > + }
> > + if (resp->dir_ino_uniquifier != ino)
> > + ino ^= resp->dir_ino_uniquifier;
>
> I guess this check (here and in nfsd_uniquify_ino) is just to prevent
> returning inode number zero?

Yep. The set of valid inode numbers is 1..MAX and that set isn't closed
under xor. It is closed (and bijective) under "xor if not equals".

I've added:
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 5e2d5c352ecd..fed56edf229f 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1162,6 +1162,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name,
resp->dir_ino_uniquifier = 0;
resp->dir_have_uniquifier = true;
}
+ /* See comment in nfsd_uniquify_ino() */
if (resp->dir_ino_uniquifier != ino)
ino ^= resp->dir_ino_uniquifier;
if (xdr_stream_encode_u64(xdr, ino) < 0)
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index bbc7ddd34143..6dd8c7325902 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -155,6 +155,9 @@ static inline u64 nfsd_uniquify_ino(const struct svc_fh *fhp,
const struct kstat *stat)
{
u64 u = nfsd_ino_uniquifier(fhp, stat);
+ /* Neither stat->ino or return value can be zero, so
+ * if ->ino is u, return u.
+ */
if (u != stat->ino)
return stat->ino ^ u;
return stat->ino;

Thanks,
NeilBrown

2021-08-27 14:58:41

by Frank Filz

[permalink] [raw]
Subject: RE: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

> On Fri, 27 Aug 2021, J. Bruce Fields wrote:
> > On Thu, Aug 26, 2021 at 04:03:04PM +1000, NeilBrown wrote:
> > >
> > > [[ Hi Bruce and Chuck,
> > > I've rebased this patch on the earlier patch I sent which allows
> > > me to use the name "fh_flags". I've also added a missing #include.
> > > I've added the 'Acked-by' which Joesf provided earlier for the
> > > btrfs-part. I don't have an 'ack' for the stat.h part, but no-one
> > > has complained wither.
> > > I think it is as ready as it can be, and am keen to know what you
> > > think.
> > > I'm not *very* keen on testing s_magic in nfsd code (though we
> > > already have a couple of such tests in nfs3proc.c), but it does have
> > > the advantage of ensuring that no other filesystem can use this
> > > functionality without landing a patch in fs/nfsd/.
> > >
> > > Thanks for any review that you can provide,
> > > NeilBrown
> > > ]]
> >
> > This seems hairy, but *somebody* has hated every single solution
> > proposed, so, argh, I don't know, maybe it's best.
>
> People don't like change I guess :-)
> I think we need the fh_flags stuff for almost any fix, else existing mounts could
> break when the server is upgraded. This could be needed for any filesystem that
> has flawed NFS export support and needs to seemlessly repair it.
> We *might* be able to avoid that is I xored the uniquifier with the fsid instead of
> the fileid (I'd have to test), but that has other problems like polluting the client's
> mount table and mounted-on-fileid being hard to manage - especially for NFSv3.
> The rest is the minimum that actually achieves something.

Changing the fsid for sub-volumes is Ganesha's solution (before adding that, we couldn't even export the sub-volumes at all).

Mangling the fileid is definitely the best solution if there will be lots of sub-volumes. For a few sub-volumes changing fsid does create additional mount points on the client with some issues, but does guarantee there will be no fileid collision.

My gut feel is your solution is the best one and Ganesha may need to switch to that solution.

Frank

2021-08-27 15:15:39

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] NFSD: drop support for ancient file-handles

On Thu, Aug 26, 2021 at 02:28:15PM +1000, NeilBrown wrote:
> This patch also moves the nfsfh.h from the include/uapi directory into
> fs/nfsd. I can find no evidence of it being used anywhere outside the
> kernel. Certainly nfs-utils and wireshark do not use it.

That sounds fine, but I'd split this into a separate patch.

> fh_base and fh_pad are occasionally used to refer to the whole
> filehandle. These are replaced with "fh_raw" which is hopefully more
> meaningful.

I think that kind of cleanup should also be a separate patch. That
being said as far as I can tell fh_raw is only ever used in context
where we can just pass a void pointer. So just giving the struct
for the "new" file handle after fh_size a name and passing that
would be much cleaner than a union with a char array.


> I found
> https://www.spinics.net/lists/linux-nfs/msg43280.html
> "Re: [PATCH] nfsd: clean up fh_auth usage"
> from 2014 where moving nfsfh.h out of uapi was considered but not
> actioned. Christoph said he would "do some research if the
> uapi <linux/nfsd/*.h> headers are used anywhere at all". I can find no
> report on the result of that research. My own research turned up
> nothing.

I can't remember doing much of research, and certainly not of finding
anything.

> - memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
> + memcpy((char*)&fh.fh_handle.fh_raw, f->data, f->size);

Indedpendnt on what we're going to pass here, I don't think we
should need cast like this one (there are a few more).

2021-08-27 16:22:02

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Thu, Aug 26, 2021 at 04:03:04PM +1000, NeilBrown wrote:
>
> [[ Hi Bruce and Chuck,
> I've rebased this patch on the earlier patch I sent which allows
> me to use the name "fh_flags". I've also added a missing #include.
> I've added the 'Acked-by' which Joesf provided earlier for the
> btrfs-part. I don't have an 'ack' for the stat.h part, but no-one
> has complained wither.
> I think it is as ready as it can be, and am keen to know what you
> think.
> I'm not *very* keen on testing s_magic in nfsd code (though we
> already have a couple of such tests in nfs3proc.c), but it does have
> the advantage of ensuring that no other filesystem can use this
> functionality without landing a patch in fs/nfsd/.
>
> Thanks for any review that you can provide,
> NeilBrown
> ]]
>
> BTRFS does not provide unique inode numbers across a filesystem.
> It only provides unique inode numbers within a subvolume and
> uses synthetic device numbers for different subvolumes to ensure
> uniqueness for device+inode.
>
> nfsd cannot use these varying synthetic device numbers. If nfsd were to
> synthesise different stable filesystem ids to give to the client, that
> would cause subvolumes to appear in the mount table on the client, even
> though they don't appear in the mount table on the server. Also, NFSv3
> doesn't support changing the filesystem id without a new explicit mount
> on the client (this is partially supported in practice, but violates the
> protocol specification and has problems in some edge cases).
>
> So currently, the roots of all subvolumes report the same inode number
> in the same filesystem to NFS clients and tools like 'find' notice that
> a directory has the same identity as an ancestor, and so refuse to
> enter that directory.
>
> This patch allows btrfs (or any filesystem) to provide a 64bit number
> that can be xored with the inode number to make the number more unique.
> Rather than the client being certain to see duplicates, with this patch
> it is possible but extremely rare.

Yikes. Please don't do something like this that will eventually
cause collisions.

2021-08-27 18:32:42

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Fri, Aug 27, 2021 at 08:10:38AM +1000, NeilBrown wrote:
> On Fri, 27 Aug 2021, J. Bruce Fields wrote:
> > On Thu, Aug 26, 2021 at 04:03:04PM +1000, NeilBrown wrote:
> > > + }
> > > + if (resp->dir_ino_uniquifier != ino)
> > > + ino ^= resp->dir_ino_uniquifier;
> >
> > I guess this check (here and in nfsd_uniquify_ino) is just to prevent
> > returning inode number zero?
>
> Yep. The set of valid inode numbers is 1..MAX and that set isn't closed
> under xor.

I was curious....

The NFS specs don't require FILEID to be nonzero as far as I can tell.

Our client doesn't treat fileid 0 specially. In the case it has to
return a 32-bit inode it xors the high and low parts and makes no effort
I can see to check for the 0 case.

I modified a server to return 0 for FILEID and MOUNTED_ON_FILEID on one
particular file, and an strace shows that's happily passed on to
userspace:

getdents64(3, [..., {d_ino=0, d_off=2048, d_reclen=32,
d_type=DT_REG, d_name="LOCKTESTFILE"}]

But ls silently skips that file in the output. Huh.

--b.

> It is closed (and bijective) under "xor if not equals".
>
> I've added:
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 5e2d5c352ecd..fed56edf229f 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -1162,6 +1162,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name,
> resp->dir_ino_uniquifier = 0;
> resp->dir_have_uniquifier = true;
> }
> + /* See comment in nfsd_uniquify_ino() */
> if (resp->dir_ino_uniquifier != ino)
> ino ^= resp->dir_ino_uniquifier;
> if (xdr_stream_encode_u64(xdr, ino) < 0)
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index bbc7ddd34143..6dd8c7325902 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -155,6 +155,9 @@ static inline u64 nfsd_uniquify_ino(const struct svc_fh *fhp,
> const struct kstat *stat)
> {
> u64 u = nfsd_ino_uniquifier(fhp, stat);
> + /* Neither stat->ino or return value can be zero, so
> + * if ->ino is u, return u.
> + */
> if (u != stat->ino)
> return stat->ino ^ u;
> return stat->ino;
>
> Thanks,
> NeilBrown

2021-08-27 22:57:40

by NeilBrown

[permalink] [raw]
Subject: RE: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Sat, 28 Aug 2021, Frank Filz wrote:
>
> Changing the fsid for sub-volumes is Ganesha's solution (before adding
> that, we couldn't even export the sub-volumes at all).

What does Ganesha use for the mounted-on-fileid? There doesn't seem to
be an "obvious" answer so I wonder what was chosen.

>
> Mangling the fileid is definitely the best solution if there will be lots of sub-volumes. For a few sub-volumes changing fsid does create additional mount points on the client with some issues, but does guarantee there will be no fileid collision.
>
> My gut feel is your solution is the best one and Ganesha may need to switch to that solution.

Thanks for the encouragement. Changing the fsid does seem easier is
many ways, but I don't really like the consequences or implications.

Thanks,
NeilBrown

2021-08-27 23:02:07

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Sat, 28 Aug 2021, J. Bruce Fields wrote:
> On Fri, Aug 27, 2021 at 08:10:38AM +1000, NeilBrown wrote:
> > On Fri, 27 Aug 2021, J. Bruce Fields wrote:
> > > On Thu, Aug 26, 2021 at 04:03:04PM +1000, NeilBrown wrote:
> > > > + }
> > > > + if (resp->dir_ino_uniquifier != ino)
> > > > + ino ^= resp->dir_ino_uniquifier;
> > >
> > > I guess this check (here and in nfsd_uniquify_ino) is just to prevent
> > > returning inode number zero?
> >
> > Yep. The set of valid inode numbers is 1..MAX and that set isn't closed
> > under xor.
>
> I was curious....
>
> The NFS specs don't require FILEID to be nonzero as far as I can tell.
>
> Our client doesn't treat fileid 0 specially. In the case it has to
> return a 32-bit inode it xors the high and low parts and makes no effort
> I can see to check for the 0 case.
>
> I modified a server to return 0 for FILEID and MOUNTED_ON_FILEID on one
> particular file, and an strace shows that's happily passed on to
> userspace:
>
> getdents64(3, [..., {d_ino=0, d_off=2048, d_reclen=32,
> d_type=DT_REG, d_name="LOCKTESTFILE"}]
>
> But ls silently skips that file in the output. Huh.
>

What DO they teach in history class?
The original Unix File System (edition 6, 7 at least) had 16 bytes per
entry in directories (that could be read with read(2)). Two bytes of
inode number and 14 bytes of name.
When a name was deleted, the inode number was over-written with '0'.
So code - and coders - from that era ignore directory entries with
d_ino==0.

I'm not certain what inode 0 was used for, but I think it had some
behind-the-scenes role. Free-space?

So some code might work find with ino==0, but I'd rather not risk it.

NeilBrown

2021-08-27 23:05:29

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Sat, 28 Aug 2021, Christoph Hellwig wrote:
> On Thu, Aug 26, 2021 at 04:03:04PM +1000, NeilBrown wrote:
> >
> > This patch allows btrfs (or any filesystem) to provide a 64bit number
> > that can be xored with the inode number to make the number more unique.
> > Rather than the client being certain to see duplicates, with this patch
> > it is possible but extremely rare.
>
> Yikes. Please don't do something like this that will eventually
> cause collisions.
>
>

There doesn't seem to be any other option - and this is still an
improvement over the current behaviour.

Collisions should still be quite a few years away, and there is hope
that the btrfs developers will take action before then to provide some
certainty. Not much hope, but some.

NeilBrown

2021-08-27 23:27:44

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH] NFSD: drop support for ancient file-handles

On Sat, 28 Aug 2021, Christoph Hellwig wrote:
> On Thu, Aug 26, 2021 at 02:28:15PM +1000, NeilBrown wrote:
> > This patch also moves the nfsfh.h from the include/uapi directory into
> > fs/nfsd. I can find no evidence of it being used anywhere outside the
> > kernel. Certainly nfs-utils and wireshark do not use it.
>
> That sounds fine, but I'd split this into a separate patch.

Thanks for your review. Your suggestions seem appropriate.
I'll send out revised patches in a couple of days.

NeilBrown


>
> > fh_base and fh_pad are occasionally used to refer to the whole
> > filehandle. These are replaced with "fh_raw" which is hopefully more
> > meaningful.
>
> I think that kind of cleanup should also be a separate patch. That
> being said as far as I can tell fh_raw is only ever used in context
> where we can just pass a void pointer. So just giving the struct
> for the "new" file handle after fh_size a name and passing that
> would be much cleaner than a union with a char array.
>
>
> > I found
> > https://www.spinics.net/lists/linux-nfs/msg43280.html
> > "Re: [PATCH] nfsd: clean up fh_auth usage"
> > from 2014 where moving nfsfh.h out of uapi was considered but not
> > actioned. Christoph said he would "do some research if the
> > uapi <linux/nfsd/*.h> headers are used anywhere at all". I can find no
> > report on the result of that research. My own research turned up
> > nothing.
>
> I can't remember doing much of research, and certainly not of finding
> anything.
>
> > - memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
> > + memcpy((char*)&fh.fh_handle.fh_raw, f->data, f->size);
>
> Indedpendnt on what we're going to pass here, I don't think we
> should need cast like this one (there are a few more).
>
>

2021-08-27 23:46:55

by Frank Filz

[permalink] [raw]
Subject: RE: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

> On Sat, 28 Aug 2021, Frank Filz wrote:
> >
> > Changing the fsid for sub-volumes is Ganesha's solution (before adding
> > that, we couldn't even export the sub-volumes at all).
>
> What does Ganesha use for the mounted-on-fileid? There doesn't seem to be an
> "obvious" answer so I wonder what was chosen.

We only make mounted_on_fileid different from fileid on our export boundaries, and even then, it's not a terribly correct thing for FSAL_VFS (our module for interfacing with kernel filesystems) since user space to my knowledge has no way to get any information on an inode that serves as a mount point.

What clients actually do anything with mounted_on_fileid, and what sorts of things do they do with it? I know the AIX client was interested in it (from having worked on security negotiation back in 2006), but I have never been able to test Ganesha with an AIX client. For normal Linux client operations, what Ganesha does seems to work OK.

> >
> > Mangling the fileid is definitely the best solution if there will be lots of sub-
> volumes. For a few sub-volumes changing fsid does create additional mount
> points on the client with some issues, but does guarantee there will be no fileid
> collision.
> >
> > My gut feel is your solution is the best one and Ganesha may need to switch to
> that solution.
>
> Thanks for the encouragement. Changing the fsid does seem easier is many
> ways, but I don't really like the consequences or implications.

Yea, there really isn't a good solution here.

Probably really client applications need to do something different to detect infinite loops and not care so much about unique fileids.

Frank

2021-08-27 23:55:21

by NeilBrown

[permalink] [raw]
Subject: RE: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Sat, 28 Aug 2021, Frank Filz wrote:

> > On Sat, 28 Aug 2021, Frank Filz wrote:
> > >
> > > Changing the fsid for sub-volumes is Ganesha's solution (before adding
> > > that, we couldn't even export the sub-volumes at all).
> >
> > What does Ganesha use for the mounted-on-fileid? There doesn't seem to be an
> > "obvious" answer so I wonder what was chosen.
>
> We only make mounted_on_fileid different from fileid on our export
> boundaries, and even then, it's not a terribly correct thing for
> FSAL_VFS (our module for interfacing with kernel filesystems) since
> user space to my knowledge has no way to get any information on an
> inode that serves as a mount point.

It is possible to see the mounted-on inode number by doing a readdir()
of the parent directory and looking at d_ino. It is a bit round-about.

>
> What clients actually do anything with mounted_on_fileid, and what
> sorts of things do they do with it? I know the AIX client was
> interested in it (from having worked on security negotiation back in
> 2006), but I have never been able to test Ganesha with an AIX client.
> For normal Linux client operations, what Ganesha does seems to work
> OK.

On the Linux client, if you stat() a directory that is a mountpoint on
the server, you will see a directory with the inode number being the
mounted-on-fileid. That directory is an automount-point, and when you
access anything in it, the 'real' directory gets mounted and the
mounted-on-fileid disappears.
So if you reported a mounted-on-fileid the same as the fileid, which
would be 256 on btrfs, du and find can get confused. To be safe, the
mounted-of-fileid needs to be different to all ancestors.

NeilBrown

2021-08-28 02:22:24

by Frank Filz

[permalink] [raw]
Subject: RE: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

> On Sat, 28 Aug 2021, Frank Filz wrote:
>
> > > On Sat, 28 Aug 2021, Frank Filz wrote:
> > > >
> > > > Changing the fsid for sub-volumes is Ganesha's solution (before
> > > > adding that, we couldn't even export the sub-volumes at all).
> > >
> > > What does Ganesha use for the mounted-on-fileid? There doesn't seem
> > > to be an "obvious" answer so I wonder what was chosen.
> >
> > We only make mounted_on_fileid different from fileid on our export
> > boundaries, and even then, it's not a terribly correct thing for
> > FSAL_VFS (our module for interfacing with kernel filesystems) since
> > user space to my knowledge has no way to get any information on an
> > inode that serves as a mount point.
>
> It is possible to see the mounted-on inode number by doing a readdir() of the
> parent directory and looking at d_ino. It is a bit round-about.

Ahh, I'll have to keep that in mind, I'm not totally sure, but I think AIX mostly used/got the mounted_on_fileid during a READDIR which if that translates into a readdir to the underlying filesystem (via getdents in our case) then we have the d_ino to fill in mounted_on_fileid. I think it's less likely a situation with a LOOKUP. I think AIX used it when it got NFS4ERR_WRONGSEC when doing a READDIR, it would then go back and do a READDIR just asking for mounted_on_fileid (which as a property of the owning directory, we decided was OK to give for an inode that was in a new export with different security flavor requirements). I think AIX used the mounted_on_fileid to instantiate some kind of junction inode in the directory.

> >
> > What clients actually do anything with mounted_on_fileid, and what
> > sorts of things do they do with it? I know the AIX client was
> > interested in it (from having worked on security negotiation back in
> > 2006), but I have never been able to test Ganesha with an AIX client.
> > For normal Linux client operations, what Ganesha does seems to work
> > OK.
>
> On the Linux client, if you stat() a directory that is a mountpoint on the server,
> you will see a directory with the inode number being the mounted-on-fileid.
> That directory is an automount-point, and when you access anything in it, the
> 'real' directory gets mounted and the mounted-on-fileid disappears.
> So if you reported a mounted-on-fileid the same as the fileid, which would be
> 256 on btrfs, du and find can get confused. To be safe, the mounted-of-fileid
> needs to be different to all ancestors.
>
> NeilBrown

Frank

2021-08-28 07:11:47

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Sat, Aug 28, 2021 at 09:05:08AM +1000, NeilBrown wrote:
> There doesn't seem to be any other option - and this is still an
> improvement over the current behaviour.
>
> Collisions should still be quite a few years away, and there is hope
> that the btrfs developers will take action before then to provide some
> certainty. Not much hope, but some.

I think that is a very dangerous assumption. Given how every inode
allocation tends to be somewhat predictable I'm also really worried
that this actually opens up an attach vector. Last but not least
I also very much disagree with any of the impact to common code.
Most importantly the kstat structure, which exist to support the stat
family of system calls and not as a side channel for NFS file handles
(nevermind that it is hidden in a nfs patch and didn't even Cc the
fsdevel list), but also all the impact to the generic nfsd code for
this very broken concept. If you want to support such a scheme in
btrfs as the lesser of evils (which I disagree with), please make
sure it stays self-contained in the btrfs specific file handle
encoding and decoding.

2021-08-31 04:43:26

by NeilBrown

[permalink] [raw]
Subject: [PATCH 1/2 v2] NFSD: drop support for ancient filehandles


Filehandles not in the "new" or "version 1" format have not been handed
out for new mounts since Linux 2.4 which was released 20 years ago.
I think it is safe to say that no such file handles are still in use,
and that we can drop support for them.

This patch also moves the nfs_fh.h (with old-format filehandle
information removed) from the include/uapi directory into fs/nfsd. I
can find no evidence of it being used anywhere outside the kernel.
Certainly nfs-utils and wireshark do not use it.

As these declarations are no longer in 'uapi' we can use the 'u8' style
of integer type rather than '__u8'.

Signed-off-by: NeilBrown <[email protected]>
---
fs/nfsd/nfsfh.c | 157 +++++++++++---------------------
fs/nfsd/nfsfh.h | 69 +++++++++++++-
include/uapi/linux/nfsd/nfsfh.h | 116 -----------------------
3 files changed, 119 insertions(+), 223 deletions(-)
delete mode 100644 include/uapi/linux/nfsd/nfsfh.h

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c475d2271f9c..9194a940b23d 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -154,11 +154,12 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
- struct fid *fid = NULL, sfid;
+ struct fid *fid = NULL;
struct svc_export *exp;
struct dentry *dentry;
int fileid_type;
int data_left = fh->fh_size/4;
+ int len;
__be32 error;

error = nfserr_stale;
@@ -167,48 +168,35 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rqstp->rq_vers == 4 && fh->fh_size == 0)
return nfserr_nofilehandle;

- if (fh->fh_version == 1) {
- int len;
-
- if (--data_left < 0)
- return error;
- if (fh->fh_auth_type != 0)
- return error;
- len = key_len(fh->fh_fsid_type) / 4;
- if (len == 0)
- return error;
- if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
- /* deprecated, convert to type 3 */
- len = key_len(FSID_ENCODE_DEV)/4;
- fh->fh_fsid_type = FSID_ENCODE_DEV;
- /*
- * struct knfsd_fh uses host-endian fields, which are
- * sometimes used to hold net-endian values. This
- * confuses sparse, so we must use __force here to
- * keep it from complaining.
- */
- fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
- ntohl((__force __be32)fh->fh_fsid[1])));
- fh->fh_fsid[1] = fh->fh_fsid[2];
- }
- data_left -= len;
- if (data_left < 0)
- return error;
- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
- fid = (struct fid *)(fh->fh_fsid + len);
- } else {
- __u32 tfh[2];
- dev_t xdev;
- ino_t xino;
-
- if (fh->fh_size != NFS_FHSIZE)
- return error;
- /* assume old filehandle format */
- xdev = old_decode_dev(fh->ofh_xdev);
- xino = u32_to_ino_t(fh->ofh_xino);
- mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
- exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
+ if (fh->fh_version != 1)
+ return error;
+
+ if (--data_left < 0)
+ return error;
+ if (fh->fh_auth_type != 0)
+ return error;
+ len = key_len(fh->fh_fsid_type) / 4;
+ if (len == 0)
+ return error;
+ if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+ /* deprecated, convert to type 3 */
+ len = key_len(FSID_ENCODE_DEV)/4;
+ fh->fh_fsid_type = FSID_ENCODE_DEV;
+ /*
+ * struct knfsd_fh uses host-endian fields, which are
+ * sometimes used to hold net-endian values. This
+ * confuses sparse, so we must use __force here to
+ * keep it from complaining.
+ */
+ fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+ ntohl((__force __be32)fh->fh_fsid[1])));
+ fh->fh_fsid[1] = fh->fh_fsid[2];
}
+ data_left -= len;
+ if (data_left < 0)
+ return error;
+ exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+ fid = (struct fid *)(fh->fh_fsid + len);

error = nfserr_stale;
if (IS_ERR(exp)) {
@@ -253,18 +241,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rqstp->rq_vers > 2)
error = nfserr_badhandle;

- if (fh->fh_version != 1) {
- sfid.i32.ino = fh->ofh_ino;
- sfid.i32.gen = fh->ofh_generation;
- sfid.i32.parent_ino = fh->ofh_dirino;
- fid = &sfid;
- data_left = 3;
- if (fh->ofh_dirino == 0)
- fileid_type = FILEID_INO32_GEN;
- else
- fileid_type = FILEID_INO32_GEN_PARENT;
- } else
- fileid_type = fh->fh_fileid_type;
+ fileid_type = fh->fh_fileid_type;

if (fileid_type == FILEID_ROOT)
dentry = dget(exp->ex_path.dentry);
@@ -452,20 +429,6 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
}
}

-/*
- * for composing old style file handles
- */
-static inline void _fh_update_old(struct dentry *dentry,
- struct svc_export *exp,
- struct knfsd_fh *fh)
-{
- fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino);
- fh->ofh_generation = d_inode(dentry)->i_generation;
- if (d_is_dir(dentry) ||
- (exp->ex_flags & NFSEXP_NOSUBTREECHECK))
- fh->ofh_dirino = 0;
-}
-
static bool is_root_export(struct svc_export *exp)
{
return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
@@ -600,35 +563,21 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_dentry = dget(dentry); /* our internal copy */
fhp->fh_export = exp_get(exp);

- if (fhp->fh_handle.fh_version == 0xca) {
- /* old style filehandle please */
- memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
- fhp->fh_handle.fh_size = NFS_FHSIZE;
- fhp->fh_handle.ofh_dcookie = 0xfeebbaca;
- fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev);
- fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
- fhp->fh_handle.ofh_xino =
- ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
- fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
- if (inode)
- _fh_update_old(dentry, exp, &fhp->fh_handle);
- } else {
- fhp->fh_handle.fh_size =
- key_len(fhp->fh_handle.fh_fsid_type) + 4;
- fhp->fh_handle.fh_auth_type = 0;
-
- mk_fsid(fhp->fh_handle.fh_fsid_type,
- fhp->fh_handle.fh_fsid,
- ex_dev,
- d_inode(exp->ex_path.dentry)->i_ino,
- exp->ex_fsid, exp->ex_uuid);
-
- if (inode)
- _fh_update(fhp, exp, dentry);
- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
- fh_put(fhp);
- return nfserr_opnotsupp;
- }
+ fhp->fh_handle.fh_size =
+ key_len(fhp->fh_handle.fh_fsid_type) + 4;
+ fhp->fh_handle.fh_auth_type = 0;
+
+ mk_fsid(fhp->fh_handle.fh_fsid_type,
+ fhp->fh_handle.fh_fsid,
+ ex_dev,
+ d_inode(exp->ex_path.dentry)->i_ino,
+ exp->ex_fsid, exp->ex_uuid);
+
+ if (inode)
+ _fh_update(fhp, exp, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
+ fh_put(fhp);
+ return nfserr_opnotsupp;
}

return 0;
@@ -649,16 +598,12 @@ fh_update(struct svc_fh *fhp)
dentry = fhp->fh_dentry;
if (d_really_is_negative(dentry))
goto out_negative;
- if (fhp->fh_handle.fh_version != 1) {
- _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
- } else {
- if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
- return 0;
+ if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
+ return 0;

- _fh_update(fhp, fhp->fh_export, dentry);
- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
- return nfserr_opnotsupp;
- }
+ _fh_update(fhp, fhp->fh_export, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
+ return nfserr_opnotsupp;
return 0;
out_bad:
printk(KERN_ERR "fh_update: fh not verified!\n");
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 6106697adc04..17fc6f57d1bb 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -10,9 +10,76 @@

#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
-#include <uapi/linux/nfsd/nfsfh.h>
#include <linux/iversion.h>
#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+/*
+ * The file handle starts with a sequence of four-byte words.
+ * The first word contains a version number (1) and three descriptor bytes
+ * that tell how the remaining 3 variable length fields should be handled.
+ * These three bytes are auth_type, fsid_type and fileid_type.
+ *
+ * All four-byte values are in host-byte-order.
+ *
+ * The auth_type field is deprecated and must be set to 0.
+ *
+ * The fsid_type identifies how the filesystem (or export point) is
+ * encoded.
+ * Current values:
+ * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
+ * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
+ * says we mustn't. We must break it up and reassemble.
+ * 1 - 4 byte user specified identifier
+ * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
+ * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
+ * 4 - 4 byte inode number and 4 byte uuid
+ * 5 - 8 byte uuid
+ * 6 - 16 byte uuid
+ * 7 - 8 byte inode number and 16 byte uuid
+ *
+ * The fileid_type identifies how the file within the filesystem is encoded.
+ * The values for this field are filesystem specific, exccept that
+ * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
+ * in include/linux/exportfs.h for currently registered values.
+ */
+struct nfs_fhbase_new {
+ union {
+ struct {
+ u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
+ u8 fb_auth_type_aux;
+ u8 fb_fsid_type_aux;
+ u8 fb_fileid_type_aux;
+ u32 fb_auth[1];
+ /* u32 fb_fsid[0]; floating */
+ /* u32 fb_fileid[0]; floating */
+ };
+ struct {
+ u8 fb_version; /* == 1, even => nfs_fhbase_old */
+ u8 fb_auth_type;
+ u8 fb_fsid_type;
+ u8 fb_fileid_type;
+ u32 fb_auth_flex[]; /* flexible-array member */
+ };
+ };
+};
+
+struct knfsd_fh {
+ unsigned int fh_size; /* significant for NFSv3.
+ * Points to the current size while building
+ * a new file handle
+ */
+ union {
+ u32 fh_pad[NFS4_FHSIZE/4];
+ struct nfs_fhbase_new fh_new;
+ } fh_base;
+};
+
+#define fh_version fh_base.fh_new.fb_version
+#define fh_fsid_type fh_base.fh_new.fb_fsid_type
+#define fh_auth_type fh_base.fh_new.fb_auth_type
+#define fh_fileid_type fh_base.fh_new.fb_fileid_type
+#define fh_fsid fh_base.fh_new.fb_auth_flex

static inline __u32 ino_t_to_u32(ino_t ino)
{
diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h
deleted file mode 100644
index 427294dd56a1..000000000000
--- a/include/uapi/linux/nfsd/nfsfh.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * This file describes the layout of the file handles as passed
- * over the wire.
- *
- * Copyright (C) 1995, 1996, 1997 Olaf Kirch <[email protected]>
- */
-
-#ifndef _UAPI_LINUX_NFSD_FH_H
-#define _UAPI_LINUX_NFSD_FH_H
-
-#include <linux/types.h>
-#include <linux/nfs.h>
-#include <linux/nfs2.h>
-#include <linux/nfs3.h>
-#include <linux/nfs4.h>
-
-/*
- * This is the old "dentry style" Linux NFSv2 file handle.
- *
- * The xino and xdev fields are currently used to transport the
- * ino/dev of the exported inode.
- */
-struct nfs_fhbase_old {
- __u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
- __u32 fb_ino; /* our inode number */
- __u32 fb_dirino; /* dir inode number, 0 for directories */
- __u32 fb_dev; /* our device */
- __u32 fb_xdev;
- __u32 fb_xino;
- __u32 fb_generation;
-};
-
-/*
- * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
- * by Neil Brown <[email protected]> - March 2000
- *
- * The file handle starts with a sequence of four-byte words.
- * The first word contains a version number (1) and three descriptor bytes
- * that tell how the remaining 3 variable length fields should be handled.
- * These three bytes are auth_type, fsid_type and fileid_type.
- *
- * All four-byte values are in host-byte-order.
- *
- * The auth_type field is deprecated and must be set to 0.
- *
- * The fsid_type identifies how the filesystem (or export point) is
- * encoded.
- * Current values:
- * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
- * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
- * says we mustn't. We must break it up and reassemble.
- * 1 - 4 byte user specified identifier
- * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
- * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
- * 4 - 4 byte inode number and 4 byte uuid
- * 5 - 8 byte uuid
- * 6 - 16 byte uuid
- * 7 - 8 byte inode number and 16 byte uuid
- *
- * The fileid_type identified how the file within the filesystem is encoded.
- * The values for this field are filesystem specific, exccept that
- * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
- * in include/linux/exportfs.h for currently registered values.
- */
-struct nfs_fhbase_new {
- union {
- struct {
- __u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
- __u8 fb_auth_type_aux;
- __u8 fb_fsid_type_aux;
- __u8 fb_fileid_type_aux;
- __u32 fb_auth[1];
- /* __u32 fb_fsid[0]; floating */
- /* __u32 fb_fileid[0]; floating */
- };
- struct {
- __u8 fb_version; /* == 1, even => nfs_fhbase_old */
- __u8 fb_auth_type;
- __u8 fb_fsid_type;
- __u8 fb_fileid_type;
- __u32 fb_auth_flex[]; /* flexible-array member */
- };
- };
-};
-
-struct knfsd_fh {
- unsigned int fh_size; /* significant for NFSv3.
- * Points to the current size while building
- * a new file handle
- */
- union {
- struct nfs_fhbase_old fh_old;
- __u32 fh_pad[NFS4_FHSIZE/4];
- struct nfs_fhbase_new fh_new;
- } fh_base;
-};
-
-#define ofh_dcookie fh_base.fh_old.fb_dcookie
-#define ofh_ino fh_base.fh_old.fb_ino
-#define ofh_dirino fh_base.fh_old.fb_dirino
-#define ofh_dev fh_base.fh_old.fb_dev
-#define ofh_xdev fh_base.fh_old.fb_xdev
-#define ofh_xino fh_base.fh_old.fb_xino
-#define ofh_generation fh_base.fh_old.fb_generation
-
-#define fh_version fh_base.fh_new.fb_version
-#define fh_fsid_type fh_base.fh_new.fb_fsid_type
-#define fh_auth_type fh_base.fh_new.fb_auth_type
-#define fh_fileid_type fh_base.fh_new.fb_fileid_type
-#define fh_fsid fh_base.fh_new.fb_auth_flex
-
-/* Do not use, provided for userspace compatiblity. */
-#define fh_auth fh_base.fh_new.fb_auth
-
-#endif /* _UAPI_LINUX_NFSD_FH_H */
--
2.32.0

2021-08-31 04:44:10

by NeilBrown

[permalink] [raw]
Subject: [PATCH 2/2] NFSD: simplify struct nfsfh


Most of the fields in 'struct knfsd_fh' are 2 levels deep (a union and a
struct) and are accessed using macros:

#define fh_FOO fh_base.fh_new.fb_FOO

This patch makes the union and struct anonymous, so that "fh_FOO" can be
a name directly within 'struct knfsd_fh' and the #defines aren't needed.

The file handle as a whole is sometimes accessed as "fh_base" or
"fh_base.fh_pad", neither of which are particularly helpful names.
As the struct holding the filehandle is now anonymous, we
cannot use the name of that, so we union it with 'fh_raw' and use that
where the raw filehandle is needed.

fh_raw is a 'char' array, removing any need to cast it for memcpy etc.

SVCFH_fmt() is simplified using the "%ph" printk format. This
changes the appearance of filehandles in dprintk() debugging, making
them a little more precise.

Signed-off-by: NeilBrown <[email protected]>
---
fs/nfsd/flexfilelayout.c | 2 +-
fs/nfsd/lockd.c | 2 +-
fs/nfsd/nfs3xdr.c | 4 ++--
fs/nfsd/nfs4callback.c | 2 +-
fs/nfsd/nfs4proc.c | 4 ++--
fs/nfsd/nfs4state.c | 4 ++--
fs/nfsd/nfs4xdr.c | 4 ++--
fs/nfsd/nfsctl.c | 6 ++---
fs/nfsd/nfsfh.c | 13 ++++-------
fs/nfsd/nfsfh.h | 50 ++++++++++++----------------------------
fs/nfsd/nfsxdr.c | 4 ++--
11 files changed, 35 insertions(+), 60 deletions(-)

diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index db7ef07ae50c..2e2f1d5e9f62 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -61,7 +61,7 @@ nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
goto out_error;

fl->fh.size = fhp->fh_handle.fh_size;
- memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
+ memcpy(fl->fh.data, &fhp->fh_handle.fh_raw, fl->fh.size);

/* Give whole file layout segments */
seg->offset = 0;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 3f5b3d7b62b7..9f6eb091db08 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -33,7 +33,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
/* must initialize before using! but maxsize doesn't matter */
fh_init(&fh,0);
fh.fh_handle.fh_size = f->size;
- memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
+ memcpy(&fh.fh_handle.fh_raw, f->data, f->size);
fh.fh_export = NULL;

nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 0a5ebc52e6a9..3d37923afb06 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -92,7 +92,7 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp)
return false;
fh_init(fhp, NFS3_FHSIZE);
fhp->fh_handle.fh_size = size;
- memcpy(&fhp->fh_handle.fh_base, p, size);
+ memcpy(&fhp->fh_handle.fh_raw, p, size);

return true;
}
@@ -131,7 +131,7 @@ svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
*p++ = cpu_to_be32(size);
if (size)
p[XDR_QUADLEN(size) - 1] = 0;
- memcpy(p, &fhp->fh_handle.fh_base, size);
+ memcpy(p, &fhp->fh_handle.fh_raw, size);

return true;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 0f8b10f363e7..11f8715d92d6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -121,7 +121,7 @@ static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)

BUG_ON(length > NFS4_FHSIZE);
p = xdr_reserve_space(xdr, 4 + length);
- xdr_encode_opaque(p, &fh->fh_base, length);
+ xdr_encode_opaque(p, &fh->fh_raw, length);
}

/*
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 486c5dba4b65..3f7e59ec4e32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -519,7 +519,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,

fh_put(&cstate->current_fh);
cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
- memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
+ memcpy(&cstate->current_fh.fh_handle.fh_raw, putfh->pf_fhval,
putfh->pf_fhlen);
ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
@@ -1383,7 +1383,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
s_fh = &cstate->save_fh;

copy->c_fh.size = s_fh->fh_handle.fh_size;
- memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_base, copy->c_fh.size);
+ memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_raw, copy->c_fh.size);
copy->stateid.seqid = cpu_to_be32(s_stid->si_generation);
memcpy(copy->stateid.other, (void *)&s_stid->si_opaque,
sizeof(stateid_opaque_t));
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fa67ecd5fe63..d66b4be99063 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1010,7 +1010,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
}
spin_unlock(&blocked_delegations_lock);
}
- hash = jhash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_raw, fh->fh_size, 0);
if (test_bit(hash&255, bd->set[0]) &&
test_bit((hash>>8)&255, bd->set[0]) &&
test_bit((hash>>16)&255, bd->set[0]))
@@ -1029,7 +1029,7 @@ static void block_delegations(struct knfsd_fh *fh)
u32 hash;
struct bloom_pair *bd = &blocked_delegations;

- hash = jhash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_raw, fh->fh_size, 0);

spin_lock(&blocked_delegations_lock);
__set_bit(hash&255, bd->set[bd->new]);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7abeccb975b2..a54b2845473b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3110,7 +3110,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
if (!p)
goto out_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
fhp->fh_handle.fh_size);
}
if (bmval0 & FATTR4_WORD0_FILEID) {
@@ -3667,7 +3667,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
p = xdr_reserve_space(xdr, len + 4);
if (!p)
return nfserr_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
return 0;
}

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c2c3d9077dc5..5e48bc48942e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -395,12 +395,12 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
auth_domain_put(dom);
if (len)
return len;
-
+
mesg = buf;
len = SIMPLE_TRANSACTION_LIMIT;
- qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
+ qword_addhex(&mesg, &len, fh.fh_raw, fh.fh_size);
mesg[-1] = '\n';
- return mesg - buf;
+ return mesg - buf;
}

/*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 9194a940b23d..e74ee4e63866 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -643,16 +643,11 @@ fh_put(struct svc_fh *fhp)
char * SVCFH_fmt(struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
+ static char buf[2+1+1+64*3+1];

- static char buf[80];
- sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x",
- fh->fh_size,
- fh->fh_base.fh_pad[0],
- fh->fh_base.fh_pad[1],
- fh->fh_base.fh_pad[2],
- fh->fh_base.fh_pad[3],
- fh->fh_base.fh_pad[4],
- fh->fh_base.fh_pad[5]);
+ if (fh->fh_size < 0 || fh->fh_size> 64)
+ return "bad-fh";
+ sprintf(buf, "%d: %*ph", fh->fh_size, fh->fh_size, fh->fh_raw);
return buf;
}

diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 17fc6f57d1bb..d11e4b6870d6 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -43,44 +43,24 @@
* filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
* in include/linux/exportfs.h for currently registered values.
*/
-struct nfs_fhbase_new {
- union {
- struct {
- u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
- u8 fb_auth_type_aux;
- u8 fb_fsid_type_aux;
- u8 fb_fileid_type_aux;
- u32 fb_auth[1];
- /* u32 fb_fsid[0]; floating */
- /* u32 fb_fileid[0]; floating */
- };
- struct {
- u8 fb_version; /* == 1, even => nfs_fhbase_old */
- u8 fb_auth_type;
- u8 fb_fsid_type;
- u8 fb_fileid_type;
- u32 fb_auth_flex[]; /* flexible-array member */
- };
- };
-};

struct knfsd_fh {
- unsigned int fh_size; /* significant for NFSv3.
- * Points to the current size while building
- * a new file handle
+ unsigned int fh_size; /*
+ * Points to the current size while
+ * building a new file handle.
*/
union {
- u32 fh_pad[NFS4_FHSIZE/4];
- struct nfs_fhbase_new fh_new;
- } fh_base;
+ char fh_raw[NFS4_FHSIZE];
+ struct {
+ u8 fh_version; /* == 1 */
+ u8 fh_auth_type; /* deprecated */
+ u8 fh_fsid_type;
+ u8 fh_fileid_type;
+ u32 fh_fsid[]; /* flexible-array member */
+ };
+ };
};

-#define fh_version fh_base.fh_new.fb_version
-#define fh_fsid_type fh_base.fh_new.fb_fsid_type
-#define fh_auth_type fh_base.fh_new.fb_auth_type
-#define fh_fileid_type fh_base.fh_new.fb_fileid_type
-#define fh_fsid fh_base.fh_new.fb_auth_flex
-
static inline __u32 ino_t_to_u32(ino_t ino)
{
return (__u32) ino;
@@ -255,7 +235,7 @@ static inline void
fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
{
dst->fh_size = src->fh_size;
- memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
+ memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
}

static __inline__ struct svc_fh *
@@ -270,7 +250,7 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
{
if (fh1->fh_size != fh2->fh_size)
return false;
- if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
+ if (memcmp(fh1->fh_raw, fh2->fh_raw, fh1->fh_size) != 0)
return false;
return true;
}
@@ -294,7 +274,7 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
*/
static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
- return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+ return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size);
}
#else
static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index a06c05fe3b42..082449c7d0db 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -64,7 +64,7 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp)
if (!p)
return false;
fh_init(fhp, NFS_FHSIZE);
- memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
+ memcpy(&fhp->fh_handle.fh_raw, p, NFS_FHSIZE);
fhp->fh_handle.fh_size = NFS_FHSIZE;

return true;
@@ -78,7 +78,7 @@ svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp)
p = xdr_reserve_space(xdr, NFS_FHSIZE);
if (!p)
return false;
- memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
+ memcpy(p, &fhp->fh_handle.fh_raw, NFS_FHSIZE);

return true;
}
--
2.32.0

2021-08-31 05:00:53

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Sat, 28 Aug 2021, Christoph Hellwig wrote:
> On Sat, Aug 28, 2021 at 09:05:08AM +1000, NeilBrown wrote:
> > There doesn't seem to be any other option - and this is still an
> > improvement over the current behaviour.
> >
> > Collisions should still be quite a few years away, and there is hope
> > that the btrfs developers will take action before then to provide some
> > certainty. Not much hope, but some.
>
> I think that is a very dangerous assumption. Given how every inode
> allocation tends to be somewhat predictable I'm also really worried
> that this actually opens up an attach vector. Last but not least

It doesn't 'open up' anything because it is already possible to cause inode
number collisions for NFS mounts of BTRFS. So this patch is a net
improvement.

I agree that it isn't perfect, but it is the best I have managed to find
and it does solve real problems. Can you suggest any way to make it
better?

> I also very much disagree with any of the impact to common code.
> Most importantly the kstat structure, which exist to support the stat
> family of system calls and not as a side channel for NFS file handles
> (nevermind that it is hidden in a nfs patch and didn't even Cc the
> fsdevel list), but also all the impact to the generic nfsd code for
> this very broken concept. If you want to support such a scheme in
> btrfs as the lesser of evils (which I disagree with), please make
> sure it stays self-contained in the btrfs specific file handle
> encoding and decoding.

Making the change purely in btrfs is simply not possible. There is no
way for btrfs to provide nfsd with a different inode number. To move
the bulk of the change into btrfs code we would need - at the very least
- some way for nfsd to provide the filehandle when requesting stat
information. We would also need to provide a reference filehandle when
requesting a dentry->filehandle conversion. Cluttering the
export_operations like that just for btrfs doesn't seem like the right
balance. I agree that cluttering kstat is not ideal, but it was a case
of choosing the minimum change for the maximum effect.

fsdevel *was* cc:ed on the early discussion of this patch

https://lwn.net/ml/linux-fsdevel/[email protected]/

I felt we were at a point where I really needed to focus in on the nfsd
side of the discussion, with the nfsd developers.

As you are probably aware I have already been through several approaches
to solve this problem. I'm not against exploring other avenues, but
only if they are genuinely likely to provide measurably better results.
I'd be very happy to hear your suggestions.

NeilBrown

2021-09-01 07:22:15

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Tue, Aug 31, 2021 at 02:59:05PM +1000, NeilBrown wrote:
> Making the change purely in btrfs is simply not possible. There is no
> way for btrfs to provide nfsd with a different inode number. To move
> the bulk of the change into btrfs code we would need - at the very least
> - some way for nfsd to provide the filehandle when requesting stat
> information. We would also need to provide a reference filehandle when
> requesting a dentry->filehandle conversion. Cluttering the
> export_operations like that just for btrfs doesn't seem like the right
> balance. I agree that cluttering kstat is not ideal, but it was a case
> of choosing the minimum change for the maximum effect.

So you're papering over a btrfs bug by piling up cludges in the nsdd
code that has not business even knowing about this btrfs bug, while
leaving other users of inodes numbers and file handles broken?

If you only care about file handles: this is what the export operations
are for. If you care about inode numbers: well, it is up to btrfs
to generate uniqueue inode numbers. It currently doesn't do that, and
no amount of papering over that in nfsd is going to fix the issue.

If XORing a little more entropy into the inode number is a good enough
band aid (and I strongly disagree with that), do it inside btrfs for
every place they report the inode number. There is nothing NFS-specific
about that.

2021-09-01 07:45:57

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 2/2] NFSD: simplify struct nfsfh

Looks good,

Reviewed-by: Christoph Hellwig <[email protected]>

2021-09-01 07:45:58

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] NFSD: drop support for ancient filehandles

The content looks fine, but I still think moving the definition out of
the uapi and removing the ancient file handles are different things
that should be separate patches with separate commit logs explaining
the story.

2021-09-01 14:23:19

by Chuck Lever III

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] NFSD: drop support for ancient filehandles



> On Sep 1, 2021, at 3:44 AM, Christoph Hellwig <[email protected]> wrote:
>
> The content looks fine, but I still think moving the definition out of
> the uapi and removing the ancient file handles are different things
> that should be separate patches with separate commit logs explaining
> the story.

Was thinking the same thing when looking at v1 of this patch set.
Neil, it's a nit, of course, but would you split this patch up,
please and thanks?

--
Chuck Lever



2021-09-01 15:29:00

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Wed, Sep 01, 2021 at 08:20:06AM +0100, Christoph Hellwig wrote:
> On Tue, Aug 31, 2021 at 02:59:05PM +1000, NeilBrown wrote:
> > Making the change purely in btrfs is simply not possible. There is no
> > way for btrfs to provide nfsd with a different inode number. To move
> > the bulk of the change into btrfs code we would need - at the very least
> > - some way for nfsd to provide the filehandle when requesting stat
> > information. We would also need to provide a reference filehandle when
> > requesting a dentry->filehandle conversion. Cluttering the
> > export_operations like that just for btrfs doesn't seem like the right
> > balance. I agree that cluttering kstat is not ideal, but it was a case
> > of choosing the minimum change for the maximum effect.
>
> So you're papering over a btrfs bug by piling up cludges in the nsdd
> code that has not business even knowing about this btrfs bug, while
> leaving other users of inodes numbers and file handles broken?
>
> If you only care about file handles: this is what the export operations
> are for. If you care about inode numbers: well, it is up to btrfs
> to generate uniqueue inode numbers. It currently doesn't do that, and
> no amount of papering over that in nfsd is going to fix the issue.
>
> If XORing a little more entropy

It's stronger than "a little more entropy". We know enough about how
the numbers being XOR'd grow to know that collisions are only going to
happen in some extreme use cases. (If I understand correctly.)

> into the inode number is a good enough band aid (and I strongly
> disagree with that), do it inside btrfs for every place they report
> the inode number. There is nothing NFS-specific about that.

Neil tried something like that:

https://lore.kernel.org/linux-nfs/[email protected]/

"The patch below, which is just a proof-of-concept, changes
btrfs to report a uniform st_dev, and different (64bit) st_ino
in different subvols."

(Though actually you're proposing keeping separate st_dev?)

I looked back through a couple threads to try to understand why we
couldn't do that (on new filesystems, with a mkfs option to choose new
or old behavior) and still don't understand. But the threads are long.

There are objections to a new mount option (which seem obviously wrong;
this should be a persistent feature of the on-disk filesystem).

--b.

2021-09-02 01:20:08

by NeilBrown

[permalink] [raw]
Subject: [PATCH 1/3 v3] NFSD: move filehandle format declarations out of "uapi".


A small part of the declaration concerning filehandle format are
currently in the "uapi" include directory:
include/uapi/linux/nfsd/nfsfh.h

There is a lot more to the filehandle format, including "enum fid_type"
and "enum nfsd_fsid" which are not exported via "uapi".

This small part of the filehandle definition is of minimal use outside
of the kernel, and I can find no evidence that an other code is using
it. Certainly nfs-utils and wireshark (The most likely candidates) do not
use these declarations.

So move it out of "uapi" by copying the content from
include/uapi/linux/nfsd/nfsfh.h
into
fs/nfsd/nfsfh.h

A few unnecessary "#include" directives are not copied, and neither is
the #define of fh_auth, which is annotated as being for userspace only.

The copyright claims in the uapi file are identical to those in the nfsd
file, so there is no need to copy those.

The "__u32" style integer types are only needed in "uapi". In
kernel-only code we can use the more familiar "u32" style.

Signed-off-by: NeilBrown <[email protected]>
---
fs/nfsd/nfsfh.h | 98 ++++++++++++++++++++++++++-
include/uapi/linux/nfsd/nfsfh.h | 116 --------------------------------
2 files changed, 97 insertions(+), 117 deletions(-)
delete mode 100644 include/uapi/linux/nfsd/nfsfh.h

diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 6106697adc04..988e4dfdfbd9 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -10,9 +10,105 @@

#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
-#include <uapi/linux/nfsd/nfsfh.h>
#include <linux/iversion.h>
#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+
+/*
+ * This is the old "dentry style" Linux NFSv2 file handle.
+ *
+ * The xino and xdev fields are currently used to transport the
+ * ino/dev of the exported inode.
+ */
+struct nfs_fhbase_old {
+ u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
+ u32 fb_ino; /* our inode number */
+ u32 fb_dirino; /* dir inode number, 0 for directories */
+ u32 fb_dev; /* our device */
+ u32 fb_xdev;
+ u32 fb_xino;
+ u32 fb_generation;
+};
+
+/*
+ * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
+ * by Neil Brown <[email protected]> - March 2000
+ *
+ * The file handle starts with a sequence of four-byte words.
+ * The first word contains a version number (1) and three descriptor bytes
+ * that tell how the remaining 3 variable length fields should be handled.
+ * These three bytes are auth_type, fsid_type and fileid_type.
+ *
+ * All four-byte values are in host-byte-order.
+ *
+ * The auth_type field is deprecated and must be set to 0.
+ *
+ * The fsid_type identifies how the filesystem (or export point) is
+ * encoded.
+ * Current values:
+ * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
+ * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
+ * says we mustn't. We must break it up and reassemble.
+ * 1 - 4 byte user specified identifier
+ * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
+ * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
+ * 4 - 4 byte inode number and 4 byte uuid
+ * 5 - 8 byte uuid
+ * 6 - 16 byte uuid
+ * 7 - 8 byte inode number and 16 byte uuid
+ *
+ * The fileid_type identified how the file within the filesystem is encoded.
+ * The values for this field are filesystem specific, exccept that
+ * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
+ * in include/linux/exportfs.h for currently registered values.
+ */
+struct nfs_fhbase_new {
+ union {
+ struct {
+ u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
+ u8 fb_auth_type_aux;
+ u8 fb_fsid_type_aux;
+ u8 fb_fileid_type_aux;
+ u32 fb_auth[1];
+ /* u32 fb_fsid[0]; floating */
+ /* u32 fb_fileid[0]; floating */
+ };
+ struct {
+ u8 fb_version; /* == 1, even => nfs_fhbase_old */
+ u8 fb_auth_type;
+ u8 fb_fsid_type;
+ u8 fb_fileid_type;
+ u32 fb_auth_flex[]; /* flexible-array member */
+ };
+ };
+};
+
+struct knfsd_fh {
+ unsigned int fh_size; /* significant for NFSv3.
+ * Points to the current size while building
+ * a new file handle
+ */
+ union {
+ struct nfs_fhbase_old fh_old;
+ u32 fh_pad[NFS4_FHSIZE/4];
+ struct nfs_fhbase_new fh_new;
+ } fh_base;
+};
+
+#define ofh_dcookie fh_base.fh_old.fb_dcookie
+#define ofh_ino fh_base.fh_old.fb_ino
+#define ofh_dirino fh_base.fh_old.fb_dirino
+#define ofh_dev fh_base.fh_old.fb_dev
+#define ofh_xdev fh_base.fh_old.fb_xdev
+#define ofh_xino fh_base.fh_old.fb_xino
+#define ofh_generation fh_base.fh_old.fb_generation
+
+#define fh_version fh_base.fh_new.fb_version
+#define fh_fsid_type fh_base.fh_new.fb_fsid_type
+#define fh_auth_type fh_base.fh_new.fb_auth_type
+#define fh_fileid_type fh_base.fh_new.fb_fileid_type
+#define fh_fsid fh_base.fh_new.fb_auth_flex

static inline __u32 ino_t_to_u32(ino_t ino)
{
diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h
deleted file mode 100644
index 427294dd56a1..000000000000
--- a/include/uapi/linux/nfsd/nfsfh.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * This file describes the layout of the file handles as passed
- * over the wire.
- *
- * Copyright (C) 1995, 1996, 1997 Olaf Kirch <[email protected]>
- */
-
-#ifndef _UAPI_LINUX_NFSD_FH_H
-#define _UAPI_LINUX_NFSD_FH_H
-
-#include <linux/types.h>
-#include <linux/nfs.h>
-#include <linux/nfs2.h>
-#include <linux/nfs3.h>
-#include <linux/nfs4.h>
-
-/*
- * This is the old "dentry style" Linux NFSv2 file handle.
- *
- * The xino and xdev fields are currently used to transport the
- * ino/dev of the exported inode.
- */
-struct nfs_fhbase_old {
- __u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
- __u32 fb_ino; /* our inode number */
- __u32 fb_dirino; /* dir inode number, 0 for directories */
- __u32 fb_dev; /* our device */
- __u32 fb_xdev;
- __u32 fb_xino;
- __u32 fb_generation;
-};
-
-/*
- * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
- * by Neil Brown <[email protected]> - March 2000
- *
- * The file handle starts with a sequence of four-byte words.
- * The first word contains a version number (1) and three descriptor bytes
- * that tell how the remaining 3 variable length fields should be handled.
- * These three bytes are auth_type, fsid_type and fileid_type.
- *
- * All four-byte values are in host-byte-order.
- *
- * The auth_type field is deprecated and must be set to 0.
- *
- * The fsid_type identifies how the filesystem (or export point) is
- * encoded.
- * Current values:
- * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
- * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
- * says we mustn't. We must break it up and reassemble.
- * 1 - 4 byte user specified identifier
- * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
- * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
- * 4 - 4 byte inode number and 4 byte uuid
- * 5 - 8 byte uuid
- * 6 - 16 byte uuid
- * 7 - 8 byte inode number and 16 byte uuid
- *
- * The fileid_type identified how the file within the filesystem is encoded.
- * The values for this field are filesystem specific, exccept that
- * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
- * in include/linux/exportfs.h for currently registered values.
- */
-struct nfs_fhbase_new {
- union {
- struct {
- __u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
- __u8 fb_auth_type_aux;
- __u8 fb_fsid_type_aux;
- __u8 fb_fileid_type_aux;
- __u32 fb_auth[1];
- /* __u32 fb_fsid[0]; floating */
- /* __u32 fb_fileid[0]; floating */
- };
- struct {
- __u8 fb_version; /* == 1, even => nfs_fhbase_old */
- __u8 fb_auth_type;
- __u8 fb_fsid_type;
- __u8 fb_fileid_type;
- __u32 fb_auth_flex[]; /* flexible-array member */
- };
- };
-};
-
-struct knfsd_fh {
- unsigned int fh_size; /* significant for NFSv3.
- * Points to the current size while building
- * a new file handle
- */
- union {
- struct nfs_fhbase_old fh_old;
- __u32 fh_pad[NFS4_FHSIZE/4];
- struct nfs_fhbase_new fh_new;
- } fh_base;
-};
-
-#define ofh_dcookie fh_base.fh_old.fb_dcookie
-#define ofh_ino fh_base.fh_old.fb_ino
-#define ofh_dirino fh_base.fh_old.fb_dirino
-#define ofh_dev fh_base.fh_old.fb_dev
-#define ofh_xdev fh_base.fh_old.fb_xdev
-#define ofh_xino fh_base.fh_old.fb_xino
-#define ofh_generation fh_base.fh_old.fb_generation
-
-#define fh_version fh_base.fh_new.fb_version
-#define fh_fsid_type fh_base.fh_new.fb_fsid_type
-#define fh_auth_type fh_base.fh_new.fb_auth_type
-#define fh_fileid_type fh_base.fh_new.fb_fileid_type
-#define fh_fsid fh_base.fh_new.fb_auth_flex
-
-/* Do not use, provided for userspace compatiblity. */
-#define fh_auth fh_base.fh_new.fb_auth
-
-#endif /* _UAPI_LINUX_NFSD_FH_H */
--
2.32.0

2021-09-02 04:10:01

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Wed, 01 Sep 2021, Christoph Hellwig wrote:
> On Tue, Aug 31, 2021 at 02:59:05PM +1000, NeilBrown wrote:
> > Making the change purely in btrfs is simply not possible. There is no
> > way for btrfs to provide nfsd with a different inode number. To move
> > the bulk of the change into btrfs code we would need - at the very least
> > - some way for nfsd to provide the filehandle when requesting stat
> > information. We would also need to provide a reference filehandle when
> > requesting a dentry->filehandle conversion. Cluttering the
> > export_operations like that just for btrfs doesn't seem like the right
> > balance. I agree that cluttering kstat is not ideal, but it was a case
> > of choosing the minimum change for the maximum effect.
>
> So you're papering over a btrfs bug by piling up cludges in the nsdd
> code that has not business even knowing about this btrfs bug, while
> leaving other users of inodes numbers and file handles broken?
>
> If you only care about file handles: this is what the export operations
> are for. If you care about inode numbers: well, it is up to btrfs
> to generate uniqueue inode numbers. It currently doesn't do that, and
> no amount of papering over that in nfsd is going to fix the issue.
>
> If XORing a little more entropy into the inode number is a good enough
> band aid (and I strongly disagree with that), do it inside btrfs for
> every place they report the inode number. There is nothing NFS-specific
> about that.
>

Hi Christoph,
I have to say that I struggle with some of these conversations with
you.
I don't know if it is deliberate on your part, or inadvertent, or
purely in my imagination, but your attitude often seems combative. I
find that to be a disincentive to continuing to engage, which I need to
work hard to overcome. If I'm misunderstanding you, I apologise and
simply ask that you do what you can to compensate for my apparent
sensitivity.

Your attitude seems to be that this is a btrfs problem and must be
fixed in btrfs. I agree about the source of the problem - specifically:
Commit 3394e1607eaf ("Btrfs: Give each subvol and snapshot their own anonymous devid")
took a wrong turn. But I don't think we can completely isolate any
part of the kernel, and we need to work together to solve problems that
affect us, no matter the cause. Similarly our code needs to work
together.

Highlighting the various problems with the proposed solution doesn't
help a lot - they are fairly obvious. Proposing solutions would be
much more helpful, and I have no doubt that your different experience
and perspective could help me see things that I have missed. Any help
that you can provide would certainly be appreciated.

Thanks,
NeilBrown

2021-09-02 04:15:17

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Thu, 02 Sep 2021, J. Bruce Fields wrote:
> I looked back through a couple threads to try to understand why we
> couldn't do that (on new filesystems, with a mkfs option to choose new
> or old behavior) and still don't understand. But the threads are long.
>
> There are objections to a new mount option (which seem obviously wrong;
> this should be a persistent feature of the on-disk filesystem).

I hadn't thought much (if at all) about a persistent filesystem feature
flag. I'll try that now.

There are two features of interest. One is completely unique inode
numbers, the other is reporting different st_dev for different
subvolumes. I think these need to be kept separate, though the second
would depend on the first. They would be similar to my "inumbits" and
"numdevs" mount options, though with less flexibility. I think that
they would need strong semantics to be acceptable - "mostly unique"
isn't really acceptable once we are changing the on-disk data.

The "unique inode numbers" bit (UIN) would require that file object-ids
fit in some number of bits (maybe 40) and that subvolume numbers fit in
the remaining bits (24) and would then combine them together for the
inode number. This could obviously be set at mkfs time. Could it be
set on an unmounted filesystem?

The "single-dev" flag (SD) could be toggled any time that UIN was set,
and mkfs would default it on if UIN was selected.

If UIN was in effect, then creating a subvol beyond the permitted max
would have to fail. 24 bits is small enough that we would probably want
a warning of impending doom - maybe at 23 bits? The current 48bits
doesn't need that.
Similarly creating an inode beyond 40bits would have to fail. This is
probably more problematic and so might need more warnings. Do we want a
warning each time any subvol crosses some limit? If not we would need a
flag for each warning.

What should a sysadmin do when they see the warning? If 40 bit an
unacceptable limit of the total number of inodes in a subvol, or is it
only a problem because of btrfs' practice of never reusing object-ids?

Backup-and-restore would compact object-ids, but would be a big cost.
Off-line reindexing would be cheaper (does anyone else remember using
"renum" programs with BASIC??). Online lazy re-indexing might be
possible if the inode number was maintained separately from the
object-id and an atomic "switch which inode number to use" could be done
at mount time.

Setting UIN on an existing filesystem would require checking that only
24bit are used for subvolumes (easy) and that only 40 were usgd for
objects in any individual subvolume (presumably that would require
checking all subvolumes, which might take a little while, but shouldn't
take more than a few minutes.

Doing this would break any indexes that might be created over files, and
would probably upset any active NFS mounts, and would likely have other
problems. Se it would need to be a well-documented step with clear
rewards.

An alternative to renumbering would be to maintain file-ids and
subvolume-ids which are separate from the object-id. Apparently reusing
subvolume object-ids is not possible and reusing file object-ids is
quite costly. If the file-id were separate from the object-id, these
problems would vanish.

This would require extra space in the inode (there are several reserved
u64s, so that isn't a problem) and space in each directory entry (might
be more of a problem). It would also require some way to keep track of
used (or unused) id numbers. This avoids the cost of renumbering, by
spreading it out over every creation. I suspect the average
inode-creation overhead could be kept quite low, but not quite zero.

I believe that some code *knows* that the root of any btrfs subvolumes
has inode number 256. systemd seems to use this. I have no idea what
else might depend on inode numbers in some way.

I suspect that if we tried to roll out a change like this, either almost
no-one would use it (if it wasn't the default), or things would start
breaking (if it was). I'm not against breaking things, but we need to
be sure there is a solution for fixing them, and I'm certainly not up to
doing that myself.

So yes - I think that using a mkfs option would open up other avenues
for a solution. There would still be a lot of work to find something
that continues to meet everyone's needs.

The advantage of an nfsd-focusses solution is that we can have working
code today with minimal down-sides. I'm certainly not prepared to go
digging through btrfs code to determine how to implement a btrfs-only
solution without strong buy-in from btrfs maintainers.

NeilBrown

2021-09-02 07:14:53

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Wed, Sep 01, 2021 at 11:22:51AM -0400, J. Bruce Fields wrote:
> It's stronger than "a little more entropy". We know enough about how
> the numbers being XOR'd grow to know that collisions are only going to
> happen in some extreme use cases. (If I understand correctly.)

Do we know that a malicious attacker can't reproduce the collisions?
Because that is the case to worry about.

> > into the inode number is a good enough band aid (and I strongly
> > disagree with that), do it inside btrfs for every place they report
> > the inode number. There is nothing NFS-specific about that.
>
> Neil tried something like that:
>
> https://lore.kernel.org/linux-nfs/[email protected]/
>
> "The patch below, which is just a proof-of-concept, changes
> btrfs to report a uniform st_dev, and different (64bit) st_ino
> in different subvols."
>
> (Though actually you're proposing keeping separate st_dev?)

No, I'm not suggestion to keep a separate st_dev in that case. So the
above scheme looks like the most reasonable (or least unreasonable) of
the approaches I've seen so far. I have to admit I've only noticed it
now given how deep it was hidden in a thread that I only followed bit
while on vacation.

> I looked back through a couple threads to try to understand why we
> couldn't do that (on new filesystems, with a mkfs option to choose new
> or old behavior) and still don't understand. But the threads are long.
>
> There are objections to a new mount option (which seem obviously wrong;
> this should be a persistent feature of the on-disk filesystem).

Yes. Anything like this needs to be persisted. But a mount option
might still be a reasonable way to set that persistent flag.

2021-09-02 07:20:16

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Thu, Sep 02, 2021 at 02:06:54PM +1000, NeilBrown wrote:
> Hi Christoph,
> I have to say that I struggle with some of these conversations with
> you.
> I don't know if it is deliberate on your part, or inadvertent, or
> purely in my imagination, but your attitude often seems combative. I
> find that to be a disincentive to continuing to engage, which I need to
> work hard to overcome. If I'm misunderstanding you, I apologise and
> simply ask that you do what you can to compensate for my apparent
> sensitivity.

I would not call it combative. It is sticking to the points and the red
lines.

> Your attitude seems to be that this is a btrfs problem and must be
> fixed in btrfs.

Yes.

2021-09-02 07:58:15

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Thu, 2 Sept 2021 at 09:18, Christoph Hellwig <[email protected]> wrote:

> > Your attitude seems to be that this is a btrfs problem and must be
> > fixed in btrfs.
>
> Yes.

st_ino space issues affect overlayfs as well. The two problems are
not the same, but do share some characteristics. And this limitation
will likely come up again in the future.

I suspect the long term solution would involve introducing new
userspace API (variable length inode numbers) and deprecating st_ino.
E.g. make stat return an error if the inode number doesn't fit into
st_ino and add a statx extension to return the full number. But this
would be a long process...

Thanks,
Miklos

2021-09-02 14:21:20

by Frank Filz

[permalink] [raw]
Subject: RE: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

> On Thu, 2 Sept 2021 at 09:18, Christoph Hellwig <[email protected]> wrote:
>
> > > Your attitude seems to be that this is a btrfs problem and must be
> > > fixed in btrfs.
> >
> > Yes.
>
> st_ino space issues affect overlayfs as well. The two problems are
> not the same, but do share some characteristics. And this limitation will likely
> come up again in the future.
>
> I suspect the long term solution would involve introducing new userspace API
> (variable length inode numbers) and deprecating st_ino.
> E.g. make stat return an error if the inode number doesn't fit into st_ino and add
> a statx extension to return the full number. But this would be a long process...

But then what do we do where fileid in NFS is only 64 bits?

The solution of giving each subvol a separate fsid is the only real solution to enlarging the NFS fileid space, however that has downsides on the client side.

Frank

2021-09-02 23:03:58

by NeilBrown

[permalink] [raw]
Subject: RE: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Fri, 03 Sep 2021, Frank Filz wrote:
> > On Thu, 2 Sept 2021 at 09:18, Christoph Hellwig <[email protected]> wrote:
> >
> > > > Your attitude seems to be that this is a btrfs problem and must be
> > > > fixed in btrfs.
> > >
> > > Yes.
> >
> > st_ino space issues affect overlayfs as well. The two problems are
> > not the same, but do share some characteristics. And this limitation will likely
> > come up again in the future.
> >
> > I suspect the long term solution would involve introducing new userspace API
> > (variable length inode numbers) and deprecating st_ino.
> > E.g. make stat return an error if the inode number doesn't fit into st_ino and add
> > a statx extension to return the full number. But this would be a long process...
>
> But then what do we do where fileid in NFS is only 64 bits?

Hence my suggestion that user-space should move to using the filehandle.

Two file handles being different doesn't guarantee that the two objects
are different, but the problems caused by incorrectly assuming two
things are different are much less than the problems caused by
incorrectly assuming two things are the same.

>
> The solution of giving each subvol a separate fsid is the only real
> solution to enlarging the NFS fileid space, however that has downsides
> on the client side.

And this doesn't help overlayfs...

NeilBrown

2021-09-05 16:09:28

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Thu, Sep 02, 2021 at 02:14:17PM +1000, NeilBrown wrote:
> On Thu, 02 Sep 2021, J. Bruce Fields wrote:
> > I looked back through a couple threads to try to understand why we
> > couldn't do that (on new filesystems, with a mkfs option to choose new
> > or old behavior) and still don't understand. But the threads are long.
> >
> > There are objections to a new mount option (which seem obviously wrong;
> > this should be a persistent feature of the on-disk filesystem).
>
> I hadn't thought much (if at all) about a persistent filesystem feature
> flag. I'll try that now.
>
> There are two features of interest. One is completely unique inode
> numbers, the other is reporting different st_dev for different
> subvolumes. I think these need to be kept separate, though the second
> would depend on the first. They would be similar to my "inumbits" and
> "numdevs" mount options, though with less flexibility. I think that
> they would need strong semantics to be acceptable - "mostly unique"
> isn't really acceptable once we are changing the on-disk data.

I don't quite follow that.

Also the "on-disk data" here is literally just one more flag bit in some
superblock field, right?

> I believe that some code *knows* that the root of any btrfs subvolumes
> has inode number 256. systemd seems to use this. I have no idea what
> else might depend on inode numbers in some way.

Looking. Ugh, yes, there's abtrfs_might_be_subvol that takes a struct
stat and returns:

return S_ISDIR(st->st_mode) && st->st_ino == 256;

I wonder why it does that? Are there situations where all it has is a
file descriptor (so it can't easily compare st_dev with the parent?)
And if you NFS-export and wanted to answer the same question on the
client side, I wonder what you'd do.

--b.

2021-09-06 01:33:30

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Mon, 06 Sep 2021, J. Bruce Fields wrote:
> On Thu, Sep 02, 2021 at 02:14:17PM +1000, NeilBrown wrote:
> > On Thu, 02 Sep 2021, J. Bruce Fields wrote:
> > > I looked back through a couple threads to try to understand why we
> > > couldn't do that (on new filesystems, with a mkfs option to choose new
> > > or old behavior) and still don't understand. But the threads are long.
> > >
> > > There are objections to a new mount option (which seem obviously wrong;
> > > this should be a persistent feature of the on-disk filesystem).
> >
> > I hadn't thought much (if at all) about a persistent filesystem feature
> > flag. I'll try that now.
> >
> > There are two features of interest. One is completely unique inode
> > numbers, the other is reporting different st_dev for different
> > subvolumes. I think these need to be kept separate, though the second
> > would depend on the first. They would be similar to my "inumbits" and
> > "numdevs" mount options, though with less flexibility. I think that
> > they would need strong semantics to be acceptable - "mostly unique"
> > isn't really acceptable once we are changing the on-disk data.
>
> I don't quite follow that.

I agree it is a bit of a leap.

>
> Also the "on-disk data" here is literally just one more flag bit in some
> superblock field, right?

Maybe. I *could* be just one bit.
But even "just one bit" is, I think, more of a support commitement than
adding a mount option.
Mount options are fairly obvious to the user. super-blocks not as much.
So "just one bit" might still be "one more question" than the supoort
people need to ask when handling a problem report.

When I wrote that I was thinking about how I would be comfortable with
if I were a btrfs maintainer. And I don't think I'd like to spend and
on-disk change and only gain a "mostly harmless" solution.

Christoph's comment about possible vulnerabilities are probably part of
this. I think that over NFS, concern about a user being able to
synthesise an inode number conflict is probably "mountain out of mole
hill" territory. However for local access, I cannot convince myself
that it won't be a problem. I can imagine (incautiously written)
auditing scans getting confused, and while auditing over NFS doesn't
make much sense, auditing locally does.

>
> > I believe that some code *knows* that the root of any btrfs subvolumes
> > has inode number 256. systemd seems to use this. I have no idea what
> > else might depend on inode numbers in some way.
>
> Looking. Ugh, yes, there's abtrfs_might_be_subvol that takes a struct
> stat and returns:
>
> return S_ISDIR(st->st_mode) && st->st_ino == 256;
>
> I wonder why it does that? Are there situations where all it has is a
> file descriptor (so it can't easily compare st_dev with the parent?)
> And if you NFS-export and wanted to answer the same question on the
> client side, I wonder what you'd do.

There are also a few references to BTRFS_FIRST_FREE_OBJECTID which is
256.

Uses seem to include:
- managing quotas, which fits with my idea that subvols are like
project-quota trees.
- optionally preventing "rm -r" from removing subvols
- some switching to/from "readonly" which I cannot follow
- some special handling of user home-directories when they are
subvols

These are probably reasonable and do point to subvols being a little bit
like separate filesytems. These would break if we changed local inode
numbers.

The project-quota management and the read-only setting are not available
via NFS so changing the inode number seen that way is not likely to
matter as much. In any case, detecting "256" is only useful if you can
also detect "is btrfs", and you cannot do that of NFS.

Once upon a time ext[234] had a set of inode flags and xfs separately
had a bunch of inode flags. These are now unified (to a degree) in
'struct fsattr' accessed by FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR.

btrfs supports that interface, but doesn't appear to have extended it
for subvol-specific things - preferring to create btrfs-specific ioctls
instead. Maybe they weren't designed to be extensible enough.

Maybe what we really need is for a bunch of diverse filesystem
developers to get together and agree on some new common interface for
subvolume management, including coming up with some sort of definition
of what a subvolume "is".

Until that happens (and the new interfaces are implemented and widely
used) I can only see two possible solutions to the current
NFS-export-of-btrfs problem:

1/ change nfsd to export a different inode number to the one btrfs uses
(or maybe a different fsid, but that is problematic in other ways)
2/ change userspace to check filehandles and not assume two things are
the same if their filehandles are different.

Maybe I should write a patch for fts_read() and see how much glibc folk
will hate it.

NeilBrown

2021-09-11 14:14:45

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

> Maybe what we really need is for a bunch of diverse filesystem
> developers to get together and agree on some new common interface for
> subvolume management, including coming up with some sort of definition
> of what a subvolume "is".

Neil,

Seeing that LSF/MM is not expected to gather in the foreseen future, would
you like to submit this as a topic for discussion in LPC Filesystem MC [1]?
I know this is last minute, but we've just extended the CFP deadline
until Sep 15 (MC is on Sep 21), so if you post a proposal, I think we will
be able to fit this session in the final schedule.

Granted, I don't know how many of the stakeholders plan to attend
the LPC Filesystem MC, but at least Josef should be there ;)

I do have one general question about the expected behavior -
In his comment to the LWN article [2], Josef writes:

"The st_dev thing is unfortunate, but again is the result of a lack of
interfaces.
Very early on we had problems with rsync wandering into snapshots and
copying loads of stuff. Find as well would get tripped up.
The way these tools figure out if they've wandered into another file system
is if the st_dev is different..."

If your plan goes through to export the main btrfs filesystem and
subvolumes as a uniform st_dev namespace to the NFS client,
what's to stop those old issues from remerging on NFS exported btrfs?

IOW, the user experience you are trying to solve is inability of 'find'
to traverse the unified btrfs namespace, but Josef's comment indicates
that some users were explicitly unhappy from 'find' trying to traverse
into subvolumes to begin with.

So is there really a globally expected user experience?
If not, then I really don't see how an nfs export option can be avoided.

Thanks,
Amir.

[1] https://www.linuxplumbersconf.org/event/11/page/104-accepted-microconferences#cont-filesys
[2] https://lwn.net/Articles/867509/

2021-09-13 00:45:09

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Sun, 12 Sep 2021, Amir Goldstein wrote:
> > Maybe what we really need is for a bunch of diverse filesystem
> > developers to get together and agree on some new common interface for
> > subvolume management, including coming up with some sort of definition
> > of what a subvolume "is".
>
> Neil,
>
> Seeing that LSF/MM is not expected to gather in the foreseen future, would
> you like to submit this as a topic for discussion in LPC Filesystem MC [1]?
> I know this is last minute, but we've just extended the CFP deadline
> until Sep 15 (MC is on Sep 21), so if you post a proposal, I think we will
> be able to fit this session in the final schedule.

Thanks for the suggestion. Maybe that is a good idea... But I don't
personally find face-to-face interactions particularly useful - though
other people obviously do. I need thinking time after receiving new
ideas, so I can be sure that I understand them properly. Face-to-face
doesn't allow me that thinking time.

So: no, I won't be proposing anything for LPC.

>
> Granted, I don't know how many of the stakeholders plan to attend
> the LPC Filesystem MC, but at least Josef should be there ;)
>
> I do have one general question about the expected behavior -
> In his comment to the LWN article [2], Josef writes:
>
> "The st_dev thing is unfortunate, but again is the result of a lack of
> interfaces.
> Very early on we had problems with rsync wandering into snapshots and
> copying loads of stuff. Find as well would get tripped up.
> The way these tools figure out if they've wandered into another file system
> is if the st_dev is different..."
>
> If your plan goes through to export the main btrfs filesystem and
> subvolumes as a uniform st_dev namespace to the NFS client,
> what's to stop those old issues from remerging on NFS exported btrfs?

That comment from Josef was interesting.... It doesn't align with
Commit 3394e1607eaf ("Btrfs: Give each subvol and snapshot their own anonymous devid")
when Chris Mason introduced the per-subvol device number with the
justification that:
Each subvolume has its own private inode number space, and so we need
to fill in different device numbers for each subvolume to avoid confusing
applications.

But I understand that history can be messy and maybe there were several
justifications of which Josef remembers one and Chris reported
another.

If rsync did, in fact, wander into subvols and didn't get put off by the
duplicate inode numbers (like 'find' does), then it would still do that
when accessing btrfs over NFS. This has always been the case. Chris'
"fix" only affected local access, it didn't change NFS access at all.

>
> IOW, the user experience you are trying to solve is inability of 'find'
> to traverse the unified btrfs namespace, but Josef's comment indicates
> that some users were explicitly unhappy from 'find' trying to traverse
> into subvolumes to begin with.

I believe that even 12 years ago, find would have complained if it saw a
directory with the same inode as an ancestor. Chris's fix wouldn't
prevent find from entering in that case, because it wouldn't enter
anyway.

>
> So is there really a globally expected user experience?

No. Everybody wants what they want. There is some overlap, not no
guarantees. That is the unavoidable consequence of ignoring standards
when implementing functionality.

> If not, then I really don't see how an nfs export option can be avoided.

And I really don't see how an nfs export option would help... Different
people within and organisation and using the same export might have
different expectations.

Thanks,
NeilBrown


>
> Thanks,
> Amir.
>
> [1] https://www.linuxplumbersconf.org/event/11/page/104-accepted-microconferences#cont-filesys
> [2] https://lwn.net/Articles/867509/
>
>

2021-09-13 10:05:05

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

> > I do have one general question about the expected behavior -
> > In his comment to the LWN article [2], Josef writes:
> >
> > "The st_dev thing is unfortunate, but again is the result of a lack of
> > interfaces.
> > Very early on we had problems with rsync wandering into snapshots and
> > copying loads of stuff. Find as well would get tripped up.
> > The way these tools figure out if they've wandered into another file system
> > is if the st_dev is different..."
> >
> > If your plan goes through to export the main btrfs filesystem and
> > subvolumes as a uniform st_dev namespace to the NFS client,
> > what's to stop those old issues from remerging on NFS exported btrfs?
>
> That comment from Josef was interesting.... It doesn't align with
> Commit 3394e1607eaf ("Btrfs: Give each subvol and snapshot their own anonymous devid")
> when Chris Mason introduced the per-subvol device number with the
> justification that:
> Each subvolume has its own private inode number space, and so we need
> to fill in different device numbers for each subvolume to avoid confusing
> applications.
>
> But I understand that history can be messy and maybe there were several
> justifications of which Josef remembers one and Chris reported
> another.
>

I don't see a contradiction between the two reasons.
Reporting two different objects with the same st_ino;st_dev is a problem
and so is rsync wandering into subvolumes is another problem.

Separate st_dev solves the first problem and leaves the behavior
or rsync in the hands of the user (i.e. rsync --one-file-system).

> If rsync did, in fact, wander into subvols and didn't get put off by the
> duplicate inode numbers (like 'find' does), then it would still do that
> when accessing btrfs over NFS. This has always been the case. Chris'
> "fix" only affected local access, it didn't change NFS access at all.
>

Right, so the right fix IMO would be to provide similar semantics
to the NFS client, like your first patch set tried to do.

> >
> > IOW, the user experience you are trying to solve is inability of 'find'
> > to traverse the unified btrfs namespace, but Josef's comment indicates
> > that some users were explicitly unhappy from 'find' trying to traverse
> > into subvolumes to begin with.
>
> I believe that even 12 years ago, find would have complained if it saw a
> directory with the same inode as an ancestor. Chris's fix wouldn't
> prevent find from entering in that case, because it wouldn't enter
> anyway.
>
> >
> > So is there really a globally expected user experience?
>
> No. Everybody wants what they want. There is some overlap, not no
> guarantees. That is the unavoidable consequence of ignoring standards
> when implementing functionality.
>
> > If not, then I really don't see how an nfs export option can be avoided.
>
> And I really don't see how an nfs export option would help... Different
> people within and organisation and using the same export might have
> different expectations.

That's true.
But if admin decides to export a specific btrfs mount as a non-unified
filesystem, then NFS clients can decide whether ot not to auto-mount the
exported subvolumes and different users on the client machine can decide
if they want to rsync or rsync --one-file-system, just as they would with
local btrfs.

And maybe I am wrong, but I don't see how the decision on whether to
export a non-unified btrfs can be made a btrfs option or a nfsd global
option, that's why I ended up with export option.

Thanks,
Amir.

2021-09-14 01:25:18

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Mon, 13 Sep 2021, Amir Goldstein wrote:
>
> Right, so the right fix IMO would be to provide similar semantics
> to the NFS client, like your first patch set tried to do.
>

Like every other approach, this sounds good and sensible ... until
you examine the details.

For NFSv3 (RFC1813) this would be a protocol violation.
Section 3.3.3 (LOOKUP) says:
A server will not allow a LOOKUP operation to cross a mountpoint to
the root of a different filesystem, even if the filesystem is
exported.

The filesystem is represented by the fsid, so this implies that the fsid
of an object reported by LOOKUP must be the same as the fsid of the
directory used in the LOOKUP.

Linux NFS does allow this restriction to be bypassed with the "crossmnt"
export option. Maybe if crossmnt were given it would be defensible to
change the fsid - if crossmnt is not given, we leave the current
behaviour. Note that this is a hack and while it is extremely useful,
it does not produce a seemly experience. You can get exactly the same
problems with "find" - just not as uniformly (mounting with "-o noac"
makes them uniform).

For NFSv4, we need to provide a "mounted-on" fileid for any mountpoint.
btrfs doesn't have a mounted-on fileid that can be used. We can fake
something that might work reasonably well - but it would be fake. (but
then ... btrfs already provided bogus information in getdents when there
is a subvol root in the directory).

But these are relatively minor. The bigger problem is /proc/mounts. If
btrfs maintainers were willing to have every active subvolume appear in
/proc/mounts, then I would be happy to fiddle the NFS fsid and allow
every active NFS/btrfs subvolume to appear in /proc/mounts on the NFS
client. But they aren't. So I am not.

> > And I really don't see how an nfs export option would help... Different
> > people within and organisation and using the same export might have
> > different expectations.
>
> That's true.
> But if admin decides to export a specific btrfs mount as a non-unified
> filesystem, then NFS clients can decide whether ot not to auto-mount the
> exported subvolumes and different users on the client machine can decide
> if they want to rsync or rsync --one-file-system, just as they would with
> local btrfs.
>
> And maybe I am wrong, but I don't see how the decision on whether to
> export a non-unified btrfs can be made a btrfs option or a nfsd global
> option, that's why I ended up with export option.

Just because a btrfs option and global nfsd option are bad, that doesn't
mean an export option must be good. It needs to be presented and
defended on its own merits.

My current opinion (and I must admit I am feeling rather jaded about the
whole thing), is that while btrfs is a very interesting and valuable
experiment in fs design, it contains core mistakes that cannot be
incrementally fixed. It should be marked as legacy with all current
behaviour declared as intentional and not subject to change. This would
make way for a new "betrfs" which was designed based on all that we have
learned. It would use the same code base, but present a more coherent
interface. Exactly what that interface would be has yet to be decided,
but we would not be bound to maintain anything just because btrfs
supports it.

NeilBrown

2021-09-14 05:46:45

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Tue, Sep 14, 2021 at 1:59 AM NeilBrown <[email protected]> wrote:
>
> On Mon, 13 Sep 2021, Amir Goldstein wrote:
> >
> > Right, so the right fix IMO would be to provide similar semantics
> > to the NFS client, like your first patch set tried to do.
> >
>
> Like every other approach, this sounds good and sensible ... until
> you examine the details.
>
> For NFSv3 (RFC1813) this would be a protocol violation.
> Section 3.3.3 (LOOKUP) says:
> A server will not allow a LOOKUP operation to cross a mountpoint to
> the root of a different filesystem, even if the filesystem is
> exported.
>
> The filesystem is represented by the fsid, so this implies that the fsid
> of an object reported by LOOKUP must be the same as the fsid of the
> directory used in the LOOKUP.
>
> Linux NFS does allow this restriction to be bypassed with the "crossmnt"
> export option. Maybe if crossmnt were given it would be defensible to
> change the fsid - if crossmnt is not given, we leave the current
> behaviour. Note that this is a hack and while it is extremely useful,
> it does not produce a seemly experience. You can get exactly the same
> problems with "find" - just not as uniformly (mounting with "-o noac"
> makes them uniform).
>

I don't understand why we would need to talk about NFSv3.
This btrfs export issue has been with us for a while.
I see no reason to address it for old protocols if we can address
it with a new protocol with better support for the concept of fsid hierarchy.

> For NFSv4, we need to provide a "mounted-on" fileid for any mountpoint.
> btrfs doesn't have a mounted-on fileid that can be used. We can fake
> something that might work reasonably well - but it would be fake. (but
> then ... btrfs already provided bogus information in getdents when there
> is a subvol root in the directory).
>

That seems easy to solve by passing some flag to ->encode_fh()
or if the behavior is persistent in btrfs by some mkfs/module/mount option
then btrfs_encode_fh() will always encode the subvol root inode as
resident of the parent tree-id, because nfsd anyway does not ->encode_fh()
for export roots, right?

> But these are relatively minor. The bigger problem is /proc/mounts. If
> btrfs maintainers were willing to have every active subvolume appear in
> /proc/mounts, then I would be happy to fiddle the NFS fsid and allow
> every active NFS/btrfs subvolume to appear in /proc/mounts on the NFS
> client. But they aren't. So I am not.
>

I don't understand why you need to tie the two together.
I would suggest:
1. Export different fsid's per subvols to NFSv4 based on statx()
exported tree-id
2. NFS client side uses user configuration to determine which subvols
to auto-mount
3. [optional] Provide a way to configure btrfs using mkfs/module/mount option
to behave locally the same as the NFS client, which will allow
user configuration
to determine with subvols to auto-mount locally

I admit that my understanding of the full picture is limited, but I don't
understand why #3 is a strict dependency for #1 and #2.

> > > And I really don't see how an nfs export option would help... Different
> > > people within and organisation and using the same export might have
> > > different expectations.
> >
> > That's true.
> > But if admin decides to export a specific btrfs mount as a non-unified
> > filesystem, then NFS clients can decide whether ot not to auto-mount the
> > exported subvolumes and different users on the client machine can decide
> > if they want to rsync or rsync --one-file-system, just as they would with
> > local btrfs.
> >
> > And maybe I am wrong, but I don't see how the decision on whether to
> > export a non-unified btrfs can be made a btrfs option or a nfsd global
> > option, that's why I ended up with export option.
>
> Just because a btrfs option and global nfsd option are bad, that doesn't
> mean an export option must be good. It needs to be presented and
> defended on its own merits.
>
> My current opinion (and I must admit I am feeling rather jaded about the
> whole thing), is that while btrfs is a very interesting and valuable
> experiment in fs design, it contains core mistakes that cannot be
> incrementally fixed. It should be marked as legacy with all current
> behaviour declared as intentional and not subject to change. This would
> make way for a new "betrfs" which was designed based on all that we have
> learned. It would use the same code base, but present a more coherent
> interface. Exactly what that interface would be has yet to be decided,
> but we would not be bound to maintain anything just because btrfs
> supports it.
>

There is no need for a new driver name (like ext3=>ext4)
Both ext4 and xfs have features that can be determined in mkfs time.
This user experience change does not involve on-disk format changes,
so it is a much easier case, because at least technically, there is nothing
preventing an administrator from turning the user experience feature
on and off with proper care of the consequences.

Which brings me to another point.
This discussion presents several technical challenges and you have
been very creative in presenting technical solutions, but I think that
the nature of the problem is more on the administrative side.

I see this as an unfortunate flaw in our design process, when
filesystem developers have long discussions about issues where
some of the material stakeholders (i.e. administrators) are not in the loop.
But I do not have very good ideas on how to address this flaw.

Thanks,
Amir.

2021-09-21 03:41:27

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] BTRFS/NFSD: provide more unique inode number for btrfs export

On Tue, 14 Sep 2021, Amir Goldstein wrote:
> On Tue, Sep 14, 2021 at 1:59 AM NeilBrown <[email protected]> wrote:
> >
> > On Mon, 13 Sep 2021, Amir Goldstein wrote:
> > >
> > > Right, so the right fix IMO would be to provide similar semantics
> > > to the NFS client, like your first patch set tried to do.
> > >
> >
> > Like every other approach, this sounds good and sensible ... until
> > you examine the details.
> >
> > For NFSv3 (RFC1813) this would be a protocol violation.
> > Section 3.3.3 (LOOKUP) says:
> > A server will not allow a LOOKUP operation to cross a mountpoint to
> > the root of a different filesystem, even if the filesystem is
> > exported.
> >
> > The filesystem is represented by the fsid, so this implies that the fsid
> > of an object reported by LOOKUP must be the same as the fsid of the
> > directory used in the LOOKUP.
> >
> > Linux NFS does allow this restriction to be bypassed with the "crossmnt"
> > export option. Maybe if crossmnt were given it would be defensible to
> > change the fsid - if crossmnt is not given, we leave the current
> > behaviour. Note that this is a hack and while it is extremely useful,
> > it does not produce a seemly experience. You can get exactly the same
> > problems with "find" - just not as uniformly (mounting with "-o noac"
> > makes them uniform).
> >
>
> I don't understand why we would need to talk about NFSv3.
> This btrfs export issue has been with us for a while.
> I see no reason to address it for old protocols if we can address
> it with a new protocol with better support for the concept of fsid hierarchy.
>
> > For NFSv4, we need to provide a "mounted-on" fileid for any mountpoint.
> > btrfs doesn't have a mounted-on fileid that can be used. We can fake
> > something that might work reasonably well - but it would be fake. (but
> > then ... btrfs already provided bogus information in getdents when there
> > is a subvol root in the directory).
> >
>
> That seems easy to solve by passing some flag to ->encode_fh()
> or if the behavior is persistent in btrfs by some mkfs/module/mount option
> then btrfs_encode_fh() will always encode the subvol root inode as
> resident of the parent tree-id, because nfsd anyway does not ->encode_fh()
> for export roots, right?

->encode_fh has nothing to do with getting the mounted-on fileid.
With a normal mount point, there are two inodes, one in each vfsmount.
We can call ->getattr to get kstat info including the inode number.
nfsd does that for the underlying vfsmnt/inode to get the mounted-on
fileid. What should it do for btrfs "subvols"?

>
> > But these are relatively minor. The bigger problem is /proc/mounts. If
> > btrfs maintainers were willing to have every active subvolume appear in
> > /proc/mounts, then I would be happy to fiddle the NFS fsid and allow
> > every active NFS/btrfs subvolume to appear in /proc/mounts on the NFS
> > client. But they aren't. So I am not.
> >
>
> I don't understand why you need to tie the two together.

Because they are the same thing.
The most concrete reason is that any name that appears in /proc/mounts is
public. People understand that when they mount filesystems. People
don't need to understand that when creating private subvols.
There is anecdotal evidence that people might expect subvol paths to be
private. If they then access those subvols via NFS, the names suddenly
become public.

> I would suggest:
> 1. Export different fsid's per subvols to NFSv4 based on statx()
> exported tree-id
> 2. NFS client side uses user configuration to determine which subvols
> to auto-mount

That is a non-started. Subvols currently don't need mounting, they
transparently appear. Requiring client-side configuration would be a
major cost for some users.

> 3. [optional] Provide a way to configure btrfs using mkfs/module/mount option
> to behave locally the same as the NFS client, which will allow
> user configuration
> to determine with subvols to auto-mount locally
>
> I admit that my understanding of the full picture is limited, but I don't
> understand why #3 is a strict dependency for #1 and #2.
>
> > > > And I really don't see how an nfs export option would help... Different
> > > > people within and organisation and using the same export might have
> > > > different expectations.
> > >
> > > That's true.
> > > But if admin decides to export a specific btrfs mount as a non-unified
> > > filesystem, then NFS clients can decide whether ot not to auto-mount the
> > > exported subvolumes and different users on the client machine can decide
> > > if they want to rsync or rsync --one-file-system, just as they would with
> > > local btrfs.
> > >
> > > And maybe I am wrong, but I don't see how the decision on whether to
> > > export a non-unified btrfs can be made a btrfs option or a nfsd global
> > > option, that's why I ended up with export option.
> >
> > Just because a btrfs option and global nfsd option are bad, that doesn't
> > mean an export option must be good. It needs to be presented and
> > defended on its own merits.
> >
> > My current opinion (and I must admit I am feeling rather jaded about the
> > whole thing), is that while btrfs is a very interesting and valuable
> > experiment in fs design, it contains core mistakes that cannot be
> > incrementally fixed. It should be marked as legacy with all current
> > behaviour declared as intentional and not subject to change. This would
> > make way for a new "betrfs" which was designed based on all that we have
> > learned. It would use the same code base, but present a more coherent
> > interface. Exactly what that interface would be has yet to be decided,
> > but we would not be bound to maintain anything just because btrfs
> > supports it.
> >
>
> There is no need for a new driver name (like ext3=>ext4)
> Both ext4 and xfs have features that can be determined in mkfs time.
> This user experience change does not involve on-disk format changes,
> so it is a much easier case, because at least technically, there is nothing
> preventing an administrator from turning the user experience feature
> on and off with proper care of the consequences.
>
> Which brings me to another point.
> This discussion presents several technical challenges and you have
> been very creative in presenting technical solutions, but I think that
> the nature of the problem is more on the administrative side.
>
> I see this as an unfortunate flaw in our design process, when
> filesystem developers have long discussions about issues where
> some of the material stakeholders (i.e. administrators) are not in the loop.
> But I do not have very good ideas on how to address this flaw.

I agree this is more than a technical question. I don't see it as
particularly an admin issue, because non-admin users can create subvols.

I see it as a conceptual problem. What is a "subvol"? What do we want
it to be. Does it make sense for the subvol namespace to align with the
filesystem namespace?
Subvols are more than directories, but less than filesystems. How can
be best characterise them and thing about them? Are they directories
with extra features, or filesystems with some limitation (and some extra
features)? Or are they something completely new?
What sort of identity information do applications *need* about files an
filesystems and how can we best provide that within the context of
existing APIs?

NeilBrown

2021-09-23 21:21:40

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 1/3 v3] NFSD: move filehandle format declarations out of "uapi".

These 3 still apply after fixing up a couple minor conflicts; queued up
for 5.16.--b.

On Thu, Sep 02, 2021 at 11:14:47AM +1000, NeilBrown wrote:
>
> A small part of the declaration concerning filehandle format are
> currently in the "uapi" include directory:
> include/uapi/linux/nfsd/nfsfh.h
>
> There is a lot more to the filehandle format, including "enum fid_type"
> and "enum nfsd_fsid" which are not exported via "uapi".
>
> This small part of the filehandle definition is of minimal use outside
> of the kernel, and I can find no evidence that an other code is using
> it. Certainly nfs-utils and wireshark (The most likely candidates) do not
> use these declarations.
>
> So move it out of "uapi" by copying the content from
> include/uapi/linux/nfsd/nfsfh.h
> into
> fs/nfsd/nfsfh.h
>
> A few unnecessary "#include" directives are not copied, and neither is
> the #define of fh_auth, which is annotated as being for userspace only.
>
> The copyright claims in the uapi file are identical to those in the nfsd
> file, so there is no need to copy those.
>
> The "__u32" style integer types are only needed in "uapi". In
> kernel-only code we can use the more familiar "u32" style.
>
> Signed-off-by: NeilBrown <[email protected]>
> ---
> fs/nfsd/nfsfh.h | 98 ++++++++++++++++++++++++++-
> include/uapi/linux/nfsd/nfsfh.h | 116 --------------------------------
> 2 files changed, 97 insertions(+), 117 deletions(-)
> delete mode 100644 include/uapi/linux/nfsd/nfsfh.h
>
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index 6106697adc04..988e4dfdfbd9 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -10,9 +10,105 @@
>
> #include <linux/crc32.h>
> #include <linux/sunrpc/svc.h>
> -#include <uapi/linux/nfsd/nfsfh.h>
> #include <linux/iversion.h>
> #include <linux/exportfs.h>
> +#include <linux/nfs4.h>
> +
> +
> +/*
> + * This is the old "dentry style" Linux NFSv2 file handle.
> + *
> + * The xino and xdev fields are currently used to transport the
> + * ino/dev of the exported inode.
> + */
> +struct nfs_fhbase_old {
> + u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
> + u32 fb_ino; /* our inode number */
> + u32 fb_dirino; /* dir inode number, 0 for directories */
> + u32 fb_dev; /* our device */
> + u32 fb_xdev;
> + u32 fb_xino;
> + u32 fb_generation;
> +};
> +
> +/*
> + * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
> + * by Neil Brown <[email protected]> - March 2000
> + *
> + * The file handle starts with a sequence of four-byte words.
> + * The first word contains a version number (1) and three descriptor bytes
> + * that tell how the remaining 3 variable length fields should be handled.
> + * These three bytes are auth_type, fsid_type and fileid_type.
> + *
> + * All four-byte values are in host-byte-order.
> + *
> + * The auth_type field is deprecated and must be set to 0.
> + *
> + * The fsid_type identifies how the filesystem (or export point) is
> + * encoded.
> + * Current values:
> + * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
> + * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
> + * says we mustn't. We must break it up and reassemble.
> + * 1 - 4 byte user specified identifier
> + * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
> + * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
> + * 4 - 4 byte inode number and 4 byte uuid
> + * 5 - 8 byte uuid
> + * 6 - 16 byte uuid
> + * 7 - 8 byte inode number and 16 byte uuid
> + *
> + * The fileid_type identified how the file within the filesystem is encoded.
> + * The values for this field are filesystem specific, exccept that
> + * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
> + * in include/linux/exportfs.h for currently registered values.
> + */
> +struct nfs_fhbase_new {
> + union {
> + struct {
> + u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
> + u8 fb_auth_type_aux;
> + u8 fb_fsid_type_aux;
> + u8 fb_fileid_type_aux;
> + u32 fb_auth[1];
> + /* u32 fb_fsid[0]; floating */
> + /* u32 fb_fileid[0]; floating */
> + };
> + struct {
> + u8 fb_version; /* == 1, even => nfs_fhbase_old */
> + u8 fb_auth_type;
> + u8 fb_fsid_type;
> + u8 fb_fileid_type;
> + u32 fb_auth_flex[]; /* flexible-array member */
> + };
> + };
> +};
> +
> +struct knfsd_fh {
> + unsigned int fh_size; /* significant for NFSv3.
> + * Points to the current size while building
> + * a new file handle
> + */
> + union {
> + struct nfs_fhbase_old fh_old;
> + u32 fh_pad[NFS4_FHSIZE/4];
> + struct nfs_fhbase_new fh_new;
> + } fh_base;
> +};
> +
> +#define ofh_dcookie fh_base.fh_old.fb_dcookie
> +#define ofh_ino fh_base.fh_old.fb_ino
> +#define ofh_dirino fh_base.fh_old.fb_dirino
> +#define ofh_dev fh_base.fh_old.fb_dev
> +#define ofh_xdev fh_base.fh_old.fb_xdev
> +#define ofh_xino fh_base.fh_old.fb_xino
> +#define ofh_generation fh_base.fh_old.fb_generation
> +
> +#define fh_version fh_base.fh_new.fb_version
> +#define fh_fsid_type fh_base.fh_new.fb_fsid_type
> +#define fh_auth_type fh_base.fh_new.fb_auth_type
> +#define fh_fileid_type fh_base.fh_new.fb_fileid_type
> +#define fh_fsid fh_base.fh_new.fb_auth_flex
>
> static inline __u32 ino_t_to_u32(ino_t ino)
> {
> diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h
> deleted file mode 100644
> index 427294dd56a1..000000000000
> --- a/include/uapi/linux/nfsd/nfsfh.h
> +++ /dev/null
> @@ -1,116 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> -/*
> - * This file describes the layout of the file handles as passed
> - * over the wire.
> - *
> - * Copyright (C) 1995, 1996, 1997 Olaf Kirch <[email protected]>
> - */
> -
> -#ifndef _UAPI_LINUX_NFSD_FH_H
> -#define _UAPI_LINUX_NFSD_FH_H
> -
> -#include <linux/types.h>
> -#include <linux/nfs.h>
> -#include <linux/nfs2.h>
> -#include <linux/nfs3.h>
> -#include <linux/nfs4.h>
> -
> -/*
> - * This is the old "dentry style" Linux NFSv2 file handle.
> - *
> - * The xino and xdev fields are currently used to transport the
> - * ino/dev of the exported inode.
> - */
> -struct nfs_fhbase_old {
> - __u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
> - __u32 fb_ino; /* our inode number */
> - __u32 fb_dirino; /* dir inode number, 0 for directories */
> - __u32 fb_dev; /* our device */
> - __u32 fb_xdev;
> - __u32 fb_xino;
> - __u32 fb_generation;
> -};
> -
> -/*
> - * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
> - * by Neil Brown <[email protected]> - March 2000
> - *
> - * The file handle starts with a sequence of four-byte words.
> - * The first word contains a version number (1) and three descriptor bytes
> - * that tell how the remaining 3 variable length fields should be handled.
> - * These three bytes are auth_type, fsid_type and fileid_type.
> - *
> - * All four-byte values are in host-byte-order.
> - *
> - * The auth_type field is deprecated and must be set to 0.
> - *
> - * The fsid_type identifies how the filesystem (or export point) is
> - * encoded.
> - * Current values:
> - * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
> - * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
> - * says we mustn't. We must break it up and reassemble.
> - * 1 - 4 byte user specified identifier
> - * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
> - * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
> - * 4 - 4 byte inode number and 4 byte uuid
> - * 5 - 8 byte uuid
> - * 6 - 16 byte uuid
> - * 7 - 8 byte inode number and 16 byte uuid
> - *
> - * The fileid_type identified how the file within the filesystem is encoded.
> - * The values for this field are filesystem specific, exccept that
> - * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
> - * in include/linux/exportfs.h for currently registered values.
> - */
> -struct nfs_fhbase_new {
> - union {
> - struct {
> - __u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
> - __u8 fb_auth_type_aux;
> - __u8 fb_fsid_type_aux;
> - __u8 fb_fileid_type_aux;
> - __u32 fb_auth[1];
> - /* __u32 fb_fsid[0]; floating */
> - /* __u32 fb_fileid[0]; floating */
> - };
> - struct {
> - __u8 fb_version; /* == 1, even => nfs_fhbase_old */
> - __u8 fb_auth_type;
> - __u8 fb_fsid_type;
> - __u8 fb_fileid_type;
> - __u32 fb_auth_flex[]; /* flexible-array member */
> - };
> - };
> -};
> -
> -struct knfsd_fh {
> - unsigned int fh_size; /* significant for NFSv3.
> - * Points to the current size while building
> - * a new file handle
> - */
> - union {
> - struct nfs_fhbase_old fh_old;
> - __u32 fh_pad[NFS4_FHSIZE/4];
> - struct nfs_fhbase_new fh_new;
> - } fh_base;
> -};
> -
> -#define ofh_dcookie fh_base.fh_old.fb_dcookie
> -#define ofh_ino fh_base.fh_old.fb_ino
> -#define ofh_dirino fh_base.fh_old.fb_dirino
> -#define ofh_dev fh_base.fh_old.fb_dev
> -#define ofh_xdev fh_base.fh_old.fb_xdev
> -#define ofh_xino fh_base.fh_old.fb_xino
> -#define ofh_generation fh_base.fh_old.fb_generation
> -
> -#define fh_version fh_base.fh_new.fb_version
> -#define fh_fsid_type fh_base.fh_new.fb_fsid_type
> -#define fh_auth_type fh_base.fh_new.fb_auth_type
> -#define fh_fileid_type fh_base.fh_new.fb_fileid_type
> -#define fh_fsid fh_base.fh_new.fb_auth_flex
> -
> -/* Do not use, provided for userspace compatiblity. */
> -#define fh_auth fh_base.fh_new.fb_auth
> -
> -#endif /* _UAPI_LINUX_NFSD_FH_H */
> --
> 2.32.0

2021-09-25 06:48:06

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH 1/3 v3] NFSD: move filehandle format declarations out of "uapi".

On Fri, 24 Sep 2021, Bruce Fields wrote:
> These 3 still apply after fixing up a couple minor conflicts; queued up
> for 5.16.--b.


Cool, thanks.

NeilBrown

>
> On Thu, Sep 02, 2021 at 11:14:47AM +1000, NeilBrown wrote:
> >
> > A small part of the declaration concerning filehandle format are
> > currently in the "uapi" include directory:
> > include/uapi/linux/nfsd/nfsfh.h
> >
> > There is a lot more to the filehandle format, including "enum fid_type"
> > and "enum nfsd_fsid" which are not exported via "uapi".
> >
> > This small part of the filehandle definition is of minimal use outside
> > of the kernel, and I can find no evidence that an other code is using
> > it. Certainly nfs-utils and wireshark (The most likely candidates) do not
> > use these declarations.
> >
> > So move it out of "uapi" by copying the content from
> > include/uapi/linux/nfsd/nfsfh.h
> > into
> > fs/nfsd/nfsfh.h
> >
> > A few unnecessary "#include" directives are not copied, and neither is
> > the #define of fh_auth, which is annotated as being for userspace only.
> >
> > The copyright claims in the uapi file are identical to those in the nfsd
> > file, so there is no need to copy those.
> >
> > The "__u32" style integer types are only needed in "uapi". In
> > kernel-only code we can use the more familiar "u32" style.
> >
> > Signed-off-by: NeilBrown <[email protected]>
> > ---
> > fs/nfsd/nfsfh.h | 98 ++++++++++++++++++++++++++-
> > include/uapi/linux/nfsd/nfsfh.h | 116 --------------------------------
> > 2 files changed, 97 insertions(+), 117 deletions(-)
> > delete mode 100644 include/uapi/linux/nfsd/nfsfh.h
> >
> > diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> > index 6106697adc04..988e4dfdfbd9 100644
> > --- a/fs/nfsd/nfsfh.h
> > +++ b/fs/nfsd/nfsfh.h
> > @@ -10,9 +10,105 @@
> >
> > #include <linux/crc32.h>
> > #include <linux/sunrpc/svc.h>
> > -#include <uapi/linux/nfsd/nfsfh.h>
> > #include <linux/iversion.h>
> > #include <linux/exportfs.h>
> > +#include <linux/nfs4.h>
> > +
> > +
> > +/*
> > + * This is the old "dentry style" Linux NFSv2 file handle.
> > + *
> > + * The xino and xdev fields are currently used to transport the
> > + * ino/dev of the exported inode.
> > + */
> > +struct nfs_fhbase_old {
> > + u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
> > + u32 fb_ino; /* our inode number */
> > + u32 fb_dirino; /* dir inode number, 0 for directories */
> > + u32 fb_dev; /* our device */
> > + u32 fb_xdev;
> > + u32 fb_xino;
> > + u32 fb_generation;
> > +};
> > +
> > +/*
> > + * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
> > + * by Neil Brown <[email protected]> - March 2000
> > + *
> > + * The file handle starts with a sequence of four-byte words.
> > + * The first word contains a version number (1) and three descriptor bytes
> > + * that tell how the remaining 3 variable length fields should be handled.
> > + * These three bytes are auth_type, fsid_type and fileid_type.
> > + *
> > + * All four-byte values are in host-byte-order.
> > + *
> > + * The auth_type field is deprecated and must be set to 0.
> > + *
> > + * The fsid_type identifies how the filesystem (or export point) is
> > + * encoded.
> > + * Current values:
> > + * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
> > + * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
> > + * says we mustn't. We must break it up and reassemble.
> > + * 1 - 4 byte user specified identifier
> > + * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
> > + * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
> > + * 4 - 4 byte inode number and 4 byte uuid
> > + * 5 - 8 byte uuid
> > + * 6 - 16 byte uuid
> > + * 7 - 8 byte inode number and 16 byte uuid
> > + *
> > + * The fileid_type identified how the file within the filesystem is encoded.
> > + * The values for this field are filesystem specific, exccept that
> > + * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
> > + * in include/linux/exportfs.h for currently registered values.
> > + */
> > +struct nfs_fhbase_new {
> > + union {
> > + struct {
> > + u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
> > + u8 fb_auth_type_aux;
> > + u8 fb_fsid_type_aux;
> > + u8 fb_fileid_type_aux;
> > + u32 fb_auth[1];
> > + /* u32 fb_fsid[0]; floating */
> > + /* u32 fb_fileid[0]; floating */
> > + };
> > + struct {
> > + u8 fb_version; /* == 1, even => nfs_fhbase_old */
> > + u8 fb_auth_type;
> > + u8 fb_fsid_type;
> > + u8 fb_fileid_type;
> > + u32 fb_auth_flex[]; /* flexible-array member */
> > + };
> > + };
> > +};
> > +
> > +struct knfsd_fh {
> > + unsigned int fh_size; /* significant for NFSv3.
> > + * Points to the current size while building
> > + * a new file handle
> > + */
> > + union {
> > + struct nfs_fhbase_old fh_old;
> > + u32 fh_pad[NFS4_FHSIZE/4];
> > + struct nfs_fhbase_new fh_new;
> > + } fh_base;
> > +};
> > +
> > +#define ofh_dcookie fh_base.fh_old.fb_dcookie
> > +#define ofh_ino fh_base.fh_old.fb_ino
> > +#define ofh_dirino fh_base.fh_old.fb_dirino
> > +#define ofh_dev fh_base.fh_old.fb_dev
> > +#define ofh_xdev fh_base.fh_old.fb_xdev
> > +#define ofh_xino fh_base.fh_old.fb_xino
> > +#define ofh_generation fh_base.fh_old.fb_generation
> > +
> > +#define fh_version fh_base.fh_new.fb_version
> > +#define fh_fsid_type fh_base.fh_new.fb_fsid_type
> > +#define fh_auth_type fh_base.fh_new.fb_auth_type
> > +#define fh_fileid_type fh_base.fh_new.fb_fileid_type
> > +#define fh_fsid fh_base.fh_new.fb_auth_flex
> >
> > static inline __u32 ino_t_to_u32(ino_t ino)
> > {
> > diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h
> > deleted file mode 100644
> > index 427294dd56a1..000000000000
> > --- a/include/uapi/linux/nfsd/nfsfh.h
> > +++ /dev/null
> > @@ -1,116 +0,0 @@
> > -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > -/*
> > - * This file describes the layout of the file handles as passed
> > - * over the wire.
> > - *
> > - * Copyright (C) 1995, 1996, 1997 Olaf Kirch <[email protected]>
> > - */
> > -
> > -#ifndef _UAPI_LINUX_NFSD_FH_H
> > -#define _UAPI_LINUX_NFSD_FH_H
> > -
> > -#include <linux/types.h>
> > -#include <linux/nfs.h>
> > -#include <linux/nfs2.h>
> > -#include <linux/nfs3.h>
> > -#include <linux/nfs4.h>
> > -
> > -/*
> > - * This is the old "dentry style" Linux NFSv2 file handle.
> > - *
> > - * The xino and xdev fields are currently used to transport the
> > - * ino/dev of the exported inode.
> > - */
> > -struct nfs_fhbase_old {
> > - __u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
> > - __u32 fb_ino; /* our inode number */
> > - __u32 fb_dirino; /* dir inode number, 0 for directories */
> > - __u32 fb_dev; /* our device */
> > - __u32 fb_xdev;
> > - __u32 fb_xino;
> > - __u32 fb_generation;
> > -};
> > -
> > -/*
> > - * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
> > - * by Neil Brown <[email protected]> - March 2000
> > - *
> > - * The file handle starts with a sequence of four-byte words.
> > - * The first word contains a version number (1) and three descriptor bytes
> > - * that tell how the remaining 3 variable length fields should be handled.
> > - * These three bytes are auth_type, fsid_type and fileid_type.
> > - *
> > - * All four-byte values are in host-byte-order.
> > - *
> > - * The auth_type field is deprecated and must be set to 0.
> > - *
> > - * The fsid_type identifies how the filesystem (or export point) is
> > - * encoded.
> > - * Current values:
> > - * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
> > - * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
> > - * says we mustn't. We must break it up and reassemble.
> > - * 1 - 4 byte user specified identifier
> > - * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
> > - * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
> > - * 4 - 4 byte inode number and 4 byte uuid
> > - * 5 - 8 byte uuid
> > - * 6 - 16 byte uuid
> > - * 7 - 8 byte inode number and 16 byte uuid
> > - *
> > - * The fileid_type identified how the file within the filesystem is encoded.
> > - * The values for this field are filesystem specific, exccept that
> > - * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
> > - * in include/linux/exportfs.h for currently registered values.
> > - */
> > -struct nfs_fhbase_new {
> > - union {
> > - struct {
> > - __u8 fb_version_aux; /* == 1, even => nfs_fhbase_old */
> > - __u8 fb_auth_type_aux;
> > - __u8 fb_fsid_type_aux;
> > - __u8 fb_fileid_type_aux;
> > - __u32 fb_auth[1];
> > - /* __u32 fb_fsid[0]; floating */
> > - /* __u32 fb_fileid[0]; floating */
> > - };
> > - struct {
> > - __u8 fb_version; /* == 1, even => nfs_fhbase_old */
> > - __u8 fb_auth_type;
> > - __u8 fb_fsid_type;
> > - __u8 fb_fileid_type;
> > - __u32 fb_auth_flex[]; /* flexible-array member */
> > - };
> > - };
> > -};
> > -
> > -struct knfsd_fh {
> > - unsigned int fh_size; /* significant for NFSv3.
> > - * Points to the current size while building
> > - * a new file handle
> > - */
> > - union {
> > - struct nfs_fhbase_old fh_old;
> > - __u32 fh_pad[NFS4_FHSIZE/4];
> > - struct nfs_fhbase_new fh_new;
> > - } fh_base;
> > -};
> > -
> > -#define ofh_dcookie fh_base.fh_old.fb_dcookie
> > -#define ofh_ino fh_base.fh_old.fb_ino
> > -#define ofh_dirino fh_base.fh_old.fb_dirino
> > -#define ofh_dev fh_base.fh_old.fb_dev
> > -#define ofh_xdev fh_base.fh_old.fb_xdev
> > -#define ofh_xino fh_base.fh_old.fb_xino
> > -#define ofh_generation fh_base.fh_old.fb_generation
> > -
> > -#define fh_version fh_base.fh_new.fb_version
> > -#define fh_fsid_type fh_base.fh_new.fb_fsid_type
> > -#define fh_auth_type fh_base.fh_new.fb_auth_type
> > -#define fh_fileid_type fh_base.fh_new.fb_fileid_type
> > -#define fh_fsid fh_base.fh_new.fb_auth_flex
> > -
> > -/* Do not use, provided for userspace compatiblity. */
> > -#define fh_auth fh_base.fh_new.fb_auth
> > -
> > -#endif /* _UAPI_LINUX_NFSD_FH_H */
> > --
> > 2.32.0
>
>