2022-08-16 13:31:24

by Jeff Layton

[permalink] [raw]
Subject: [PATCH 0/4] vfs: expose the inode change attribute via statx

The i_version counter is currently only really visible via knfsd with
NFSv4, so testing its behavior has always been quite difficult. The main
goal of this patchset is to remedy that.

The idea is to expose i_version to userland via statx for all
filesystems that support it. The initial usecase for this is to allow
for better testing of i_version counter behavior, but it may be useful
for userland nfs servers like nfs-ganesha and possibly other situations
in the future.

I'll be posting patches for xfsprogs and xfstests that use and test this
functionality soon.

Jeff Layton (4):
vfs: report change attribute in statx for IS_I_VERSION inodes
nfs: report the change attribute if requested
afs: fill out change attribute in statx replies
ceph: fill in the change attribute in statx requests

fs/afs/inode.c | 2 ++
fs/ceph/inode.c | 14 +++++++++-----
fs/nfs/inode.c | 7 +++++--
fs/stat.c | 7 +++++++
include/linux/stat.h | 1 +
include/uapi/linux/stat.h | 3 ++-
samples/vfs/test-statx.c | 8 ++++++--
7 files changed, 32 insertions(+), 10 deletions(-)

--
2.37.2


2022-08-16 13:31:24

by Jeff Layton

[permalink] [raw]
Subject: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

From: Jeff Layton <[email protected]>

Claim one of the spare fields in struct statx to hold a 64-bit change
attribute. When statx requests this attribute, do an
inode_query_iversion and fill the result in the field.

Also update the test-statx.c program to display the change attribute and
the mountid as well.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/stat.c | 7 +++++++
include/linux/stat.h | 1 +
include/uapi/linux/stat.h | 3 ++-
samples/vfs/test-statx.c | 8 ++++++--
4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/stat.c b/fs/stat.c
index 9ced8860e0f3..7c3d063c31ba 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -17,6 +17,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/compat.h>
+#include <linux/iversion.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -118,6 +119,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
STATX_ATTR_DAX);

+ if ((request_mask & STATX_CHANGE_ATTR) && IS_I_VERSION(inode)) {
+ stat->result_mask |= STATX_CHANGE_ATTR;
+ stat->change_attr = inode_query_iversion(inode);
+ }
+
mnt_userns = mnt_user_ns(path->mnt);
if (inode->i_op->getattr)
return inode->i_op->getattr(mnt_userns, path, stat,
@@ -611,6 +617,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dev_major = MAJOR(stat->dev);
tmp.stx_dev_minor = MINOR(stat->dev);
tmp.stx_mnt_id = stat->mnt_id;
+ tmp.stx_change_attr = stat->change_attr;

return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 7df06931f25d..7b444c2ad0ad 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -50,6 +50,7 @@ struct kstat {
struct timespec64 btime; /* File creation time */
u64 blocks;
u64 mnt_id;
+ u64 change_attr;
};

#endif
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 1500a0f58041..fd839ec76aa4 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -124,7 +124,7 @@ struct statx {
__u32 stx_dev_minor;
/* 0x90 */
__u64 stx_mnt_id;
- __u64 __spare2;
+ __u64 stx_change_attr; /* Inode change attribute */
/* 0xa0 */
__u64 __spare3[12]; /* Spare space for future expansion */
/* 0x100 */
@@ -152,6 +152,7 @@ struct statx {
#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
#define STATX_BTIME 0x00000800U /* Want/got stx_btime */
#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */
+#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */

#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */

diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c
index 49c7a46cee07..b104909721c4 100644
--- a/samples/vfs/test-statx.c
+++ b/samples/vfs/test-statx.c
@@ -107,6 +107,8 @@ static void dump_statx(struct statx *stx)
printf("Device: %-15s", buffer);
if (stx->stx_mask & STATX_INO)
printf(" Inode: %-11llu", (unsigned long long) stx->stx_ino);
+ if (stx->stx_mask & STATX_MNT_ID)
+ printf(" MountId: %llx"), stx->stx_mnt_id;
if (stx->stx_mask & STATX_NLINK)
printf(" Links: %-5u", stx->stx_nlink);
if (stx->stx_mask & STATX_TYPE) {
@@ -145,7 +147,9 @@ static void dump_statx(struct statx *stx)
if (stx->stx_mask & STATX_CTIME)
print_time("Change: ", &stx->stx_ctime);
if (stx->stx_mask & STATX_BTIME)
- print_time(" Birth: ", &stx->stx_btime);
+ print_time("Birth: ", &stx->stx_btime);
+ if (stx->stx_mask & STATX_CHANGE_ATTR)
+ printf(" Change Attr: 0x%llx\n", stx->stx_change_attr);

if (stx->stx_attributes_mask) {
unsigned char bits, mbits;
@@ -218,7 +222,7 @@ int main(int argc, char **argv)
struct statx stx;
int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW;

- unsigned int mask = STATX_BASIC_STATS | STATX_BTIME;
+ unsigned int mask = STATX_BASIC_STATS | STATX_BTIME | STATX_MNT_ID | STATX_CHANGE_ATTR;

for (argv++; *argv; argv++) {
if (strcmp(*argv, "-F") == 0) {
--
2.37.2

2022-08-16 13:31:24

by Jeff Layton

[permalink] [raw]
Subject: [PATCH 2/4] nfs: report the change attribute if requested

Allow NFS to report the i_version in statx. Since the cost to fetch it
is relatively cheap, do it unconditionally and just set the flag if it
looks like it's valid.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/inode.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b4e46b0ffa2d..43e23ec2a64d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -829,6 +829,8 @@ static u32 nfs_get_valid_attrmask(struct inode *inode)
reply_mask |= STATX_UID | STATX_GID;
if (!(cache_validity & NFS_INO_INVALID_BLOCKS))
reply_mask |= STATX_BLOCKS;
+ if (!(cache_validity & NFS_INO_INVALID_CHANGE))
+ reply_mask |= STATX_CHANGE_ATTR;
return reply_mask;
}

@@ -847,7 +849,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,

request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID |
STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME |
- STATX_INO | STATX_SIZE | STATX_BLOCKS;
+ STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_CHANGE_ATTR;

if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
if (readdirplus_enabled)
@@ -876,7 +878,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
/* Is the user requesting attributes that might need revalidation? */
if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME|
STATX_MTIME|STATX_UID|STATX_GID|
- STATX_SIZE|STATX_BLOCKS)))
+ STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_ATTR)))
goto out_no_revalidate;

/* Check whether the cached attributes are stale */
@@ -914,6 +916,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,

generic_fillattr(&init_user_ns, inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+ stat->change_attr = inode_peek_iversion_raw(inode);
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
out:
--
2.37.2

2022-08-16 13:31:25

by Jeff Layton

[permalink] [raw]
Subject: [PATCH 3/4] afs: fill out change attribute in statx replies

Always copy the change attribute in a statx reply, and set the
STATX_CHGATTR bit unconditionally.

Signed-off-by: Jeff Layton <[email protected]>
---
fs/afs/inode.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 6d3a3dbe4928..bc65cc2dd2d5 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -762,6 +762,8 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(&init_user_ns, inode, stat);
+ stat->change_attr = inode_peek_iversion_raw(inode);
+ stat->result_mask |= STATX_CHANGE_ATTR;
if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
stat->nlink > 0)
stat->nlink -= 1;
--
2.37.2

2022-08-16 13:58:09

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

On Tue, Aug 16, 2022 at 09:27:56AM -0400, Jeff Layton wrote:
> From: Jeff Layton <[email protected]>
>
> Claim one of the spare fields in struct statx to hold a 64-bit change
> attribute. When statx requests this attribute, do an
> inode_query_iversion and fill the result in the field.
>
> Also update the test-statx.c program to display the change attribute and
> the mountid as well.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/stat.c | 7 +++++++
> include/linux/stat.h | 1 +
> include/uapi/linux/stat.h | 3 ++-
> samples/vfs/test-statx.c | 8 ++++++--
> 4 files changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/fs/stat.c b/fs/stat.c
> index 9ced8860e0f3..7c3d063c31ba 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -17,6 +17,7 @@
> #include <linux/syscalls.h>
> #include <linux/pagemap.h>
> #include <linux/compat.h>
> +#include <linux/iversion.h>
>
> #include <linux/uaccess.h>
> #include <asm/unistd.h>
> @@ -118,6 +119,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
> stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
> STATX_ATTR_DAX);
>
> + if ((request_mask & STATX_CHANGE_ATTR) && IS_I_VERSION(inode)) {
> + stat->result_mask |= STATX_CHANGE_ATTR;
> + stat->change_attr = inode_query_iversion(inode);
> + }
> +
> mnt_userns = mnt_user_ns(path->mnt);
> if (inode->i_op->getattr)
> return inode->i_op->getattr(mnt_userns, path, stat,
> @@ -611,6 +617,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
> tmp.stx_dev_major = MAJOR(stat->dev);
> tmp.stx_dev_minor = MINOR(stat->dev);
> tmp.stx_mnt_id = stat->mnt_id;
> + tmp.stx_change_attr = stat->change_attr;
>
> return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
> }
> diff --git a/include/linux/stat.h b/include/linux/stat.h
> index 7df06931f25d..7b444c2ad0ad 100644
> --- a/include/linux/stat.h
> +++ b/include/linux/stat.h
> @@ -50,6 +50,7 @@ struct kstat {
> struct timespec64 btime; /* File creation time */
> u64 blocks;
> u64 mnt_id;
> + u64 change_attr;
> };
>
> #endif
> diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
> index 1500a0f58041..fd839ec76aa4 100644
> --- a/include/uapi/linux/stat.h
> +++ b/include/uapi/linux/stat.h
> @@ -124,7 +124,7 @@ struct statx {
> __u32 stx_dev_minor;
> /* 0x90 */
> __u64 stx_mnt_id;
> - __u64 __spare2;
> + __u64 stx_change_attr; /* Inode change attribute */
> /* 0xa0 */
> __u64 __spare3[12]; /* Spare space for future expansion */
> /* 0x100 */
> @@ -152,6 +152,7 @@ struct statx {
> #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
> #define STATX_BTIME 0x00000800U /* Want/got stx_btime */
> #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */
> +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */

I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag
and field. Or I fail to understand what exact information this will
expose and how userspace will consume it.
To me the naming gives the impression that some set of generic
attributes have changed but given that statx is about querying file
attributes this becomes confusing.

Wouldn't it make more sense this time to expose it as what it is and
call this STATX_INO_VERSION and __u64 stx_ino_version?

2022-08-16 14:01:14

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

Christian Brauner <[email protected]> wrote:

> > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */
>
> I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag
> and field. Or I fail to understand what exact information this will
> expose and how userspace will consume it.
> To me the naming gives the impression that some set of generic
> attributes have changed but given that statx is about querying file
> attributes this becomes confusing.
>
> Wouldn't it make more sense this time to expose it as what it is and
> call this STATX_INO_VERSION and __u64 stx_ino_version?

I'm not sure that STATX_INO_VERSION is better that might get confused with the
version number that's used to uniquify inode slots (ie. deal with inode number
reuse).

The problem is that we need fsinfo() or similar to qualify what this means.
On some filesystems, it's only changed when the data content changes, but on
others it may get changed when, say, xattrs get changed; further, on some
filesystems it might be monotonically incremented, but on others it's just
supposed to be different between two consecutive changes (nfs, IIRC).

David

2022-08-16 14:01:51

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

On Tue, 2022-08-16 at 15:44 +0200, Christian Brauner wrote:
> On Tue, Aug 16, 2022 at 09:27:56AM -0400, Jeff Layton wrote:
> > From: Jeff Layton <[email protected]>
> >
> > Claim one of the spare fields in struct statx to hold a 64-bit change
> > attribute. When statx requests this attribute, do an
> > inode_query_iversion and fill the result in the field.
> >
> > Also update the test-statx.c program to display the change attribute and
> > the mountid as well.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/stat.c | 7 +++++++
> > include/linux/stat.h | 1 +
> > include/uapi/linux/stat.h | 3 ++-
> > samples/vfs/test-statx.c | 8 ++++++--
> > 4 files changed, 16 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/stat.c b/fs/stat.c
> > index 9ced8860e0f3..7c3d063c31ba 100644
> > --- a/fs/stat.c
> > +++ b/fs/stat.c
> > @@ -17,6 +17,7 @@
> > #include <linux/syscalls.h>
> > #include <linux/pagemap.h>
> > #include <linux/compat.h>
> > +#include <linux/iversion.h>
> >
> > #include <linux/uaccess.h>
> > #include <asm/unistd.h>
> > @@ -118,6 +119,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
> > stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
> > STATX_ATTR_DAX);
> >
> > + if ((request_mask & STATX_CHANGE_ATTR) && IS_I_VERSION(inode)) {
> > + stat->result_mask |= STATX_CHANGE_ATTR;
> > + stat->change_attr = inode_query_iversion(inode);
> > + }
> > +
> > mnt_userns = mnt_user_ns(path->mnt);
> > if (inode->i_op->getattr)
> > return inode->i_op->getattr(mnt_userns, path, stat,
> > @@ -611,6 +617,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
> > tmp.stx_dev_major = MAJOR(stat->dev);
> > tmp.stx_dev_minor = MINOR(stat->dev);
> > tmp.stx_mnt_id = stat->mnt_id;
> > + tmp.stx_change_attr = stat->change_attr;
> >
> > return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
> > }
> > diff --git a/include/linux/stat.h b/include/linux/stat.h
> > index 7df06931f25d..7b444c2ad0ad 100644
> > --- a/include/linux/stat.h
> > +++ b/include/linux/stat.h
> > @@ -50,6 +50,7 @@ struct kstat {
> > struct timespec64 btime; /* File creation time */
> > u64 blocks;
> > u64 mnt_id;
> > + u64 change_attr;
> > };
> >
> > #endif
> > diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
> > index 1500a0f58041..fd839ec76aa4 100644
> > --- a/include/uapi/linux/stat.h
> > +++ b/include/uapi/linux/stat.h
> > @@ -124,7 +124,7 @@ struct statx {
> > __u32 stx_dev_minor;
> > /* 0x90 */
> > __u64 stx_mnt_id;
> > - __u64 __spare2;
> > + __u64 stx_change_attr; /* Inode change attribute */
> > /* 0xa0 */
> > __u64 __spare3[12]; /* Spare space for future expansion */
> > /* 0x100 */
> > @@ -152,6 +152,7 @@ struct statx {
> > #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
> > #define STATX_BTIME 0x00000800U /* Want/got stx_btime */
> > #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */
> > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */
>
> I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag
> and field. Or I fail to understand what exact information this will
> expose and how userspace will consume it.
> To me the naming gives the impression that some set of generic
> attributes have changed but given that statx is about querying file
> attributes this becomes confusing.
>
> Wouldn't it make more sense this time to expose it as what it is and
> call this STATX_INO_VERSION and __u64 stx_ino_version?

"Let the great bikesheddening begin!"

In all seriousness though, you do have a good point. The NFS RFCs call
this the "change attribute", so I went forward with that parlance here.
I'm not opposed to changing the naming -- STATX_INO_VERSION sounds fine
to me.

Let's see if anyone else has a better name before I make any changes
though.
--
Jeff Layton <[email protected]>

2022-08-16 14:04:00

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

On Tue, 2022-08-16 at 14:55 +0100, David Howells wrote:
> Christian Brauner <[email protected]> wrote:
>
> > > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */
> >
> > I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag
> > and field. Or I fail to understand what exact information this will
> > expose and how userspace will consume it.
> > To me the naming gives the impression that some set of generic
> > attributes have changed but given that statx is about querying file
> > attributes this becomes confusing.
> >
> > Wouldn't it make more sense this time to expose it as what it is and
> > call this STATX_INO_VERSION and __u64 stx_ino_version?
>
> I'm not sure that STATX_INO_VERSION is better that might get confused with the
> version number that's used to uniquify inode slots (ie. deal with inode number
> reuse).
>
> The problem is that we need fsinfo() or similar to qualify what this means.
> On some filesystems, it's only changed when the data content changes, but on
> others it may get changed when, say, xattrs get changed; further, on some
> filesystems it might be monotonically incremented, but on others it's just
> supposed to be different between two consecutive changes (nfs, IIRC).
>

I think we'll just have to ensure that before we expose this for any
filesystem that it conforms to some minimum standards. i.e.: it must
change if there are data or metadata changes to the inode, modulo atime
changes due to reads on regular files or readdir on dirs.

The local filesystems, ceph and NFS should all be fine. I guess that
just leaves AFS. If it can't guarantee that, then we might want to avoid
exposing the counter for it.

--
Jeff Layton <[email protected]>

2022-08-16 15:19:55

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

Jeff Layton <[email protected]> wrote:

> I think we'll just have to ensure that before we expose this for any
> filesystem that it conforms to some minimum standards. i.e.: it must
> change if there are data or metadata changes to the inode, modulo atime
> changes due to reads on regular files or readdir on dirs.
>
> The local filesystems, ceph and NFS should all be fine. I guess that
> just leaves AFS. If it can't guarantee that, then we might want to avoid
> exposing the counter for it.

AFS monotonically increments the counter on data changes; doesn't make any
change for metadata changes (other than the file size).

But you can't assume NFS works as per your suggestion as you don't know what's
backing it (it could be AFS, for example - there's a converter for that).

Further, for ordinary disk filesystems, two data changes may get elided and
only increment the counter once. And then there's mmap...

It might be better to reduce the scope of your definition and just say that it
must change if there's a data change and may also be changed if there's a
metadata change.

David

2022-08-16 15:35:41

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

On Tue, 2022-08-16 at 16:15 +0100, David Howells wrote:
> Jeff Layton <[email protected]> wrote:
>
> > I think we'll just have to ensure that before we expose this for any
> > filesystem that it conforms to some minimum standards. i.e.: it must
> > change if there are data or metadata changes to the inode, modulo atime
> > changes due to reads on regular files or readdir on dirs.
> >
> > The local filesystems, ceph and NFS should all be fine. I guess that
> > just leaves AFS. If it can't guarantee that, then we might want to avoid
> > exposing the counter for it.
>
> AFS monotonically increments the counter on data changes; doesn't make any
> change for metadata changes (other than the file size).
>
> But you can't assume NFS works as per your suggestion as you don't know what's
> backing it (it could be AFS, for example - there's a converter for that).
>

In that case, the NFS server must synthesize a proper change attr. The
NFS spec mandates that it change on most metadata changes.

> Further, for ordinary disk filesystems, two data changes may get elided and
> only increment the counter once.
>

Not a problem as long as nothing queried the counter in between the
changes.

> And then there's mmap...
>

Not sure how that matters here.

> It might be better to reduce the scope of your definition and just say that it
> must change if there's a data change and may also be changed if there's a
> metadata change.
>

I'd prefer that we mandate that it change on metadata changes as well.
That's what most of the in-kernel users want, and what most of the
existing filesystems provide. If AFS can't give that guarantee then we
can just omit exposing i_version on it.
--
Jeff Layton <[email protected]>

2022-08-16 15:58:18

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

On Tue, Aug 16, 2022 at 11:32:24AM -0400, Jeff Layton wrote:
> On Tue, 2022-08-16 at 16:15 +0100, David Howells wrote:
> > Jeff Layton <[email protected]> wrote:
> >
> > > I think we'll just have to ensure that before we expose this for any
> > > filesystem that it conforms to some minimum standards. i.e.: it must
> > > change if there are data or metadata changes to the inode, modulo atime
> > > changes due to reads on regular files or readdir on dirs.
> > >
> > > The local filesystems, ceph and NFS should all be fine. I guess that
> > > just leaves AFS. If it can't guarantee that, then we might want to avoid
> > > exposing the counter for it.
> >
> > AFS monotonically increments the counter on data changes; doesn't make any
> > change for metadata changes (other than the file size).
> >
> > But you can't assume NFS works as per your suggestion as you don't know what's
> > backing it (it could be AFS, for example - there's a converter for that).
> >
>
> In that case, the NFS server must synthesize a proper change attr. The
> NFS spec mandates that it change on most metadata changes.
>
> > Further, for ordinary disk filesystems, two data changes may get elided and
> > only increment the counter once.
> >
>
> Not a problem as long as nothing queried the counter in between the
> changes.
>
> > And then there's mmap...
> >
>
> Not sure how that matters here.
>
> > It might be better to reduce the scope of your definition and just say that it
> > must change if there's a data change and may also be changed if there's a
> > metadata change.
> >
>
> I'd prefer that we mandate that it change on metadata changes as well.

...in that case, why not leave the i_version bump in
xfs_trans_log_inode? That will capture all changes to file data,
attribues, and metadata. ;)

--D

> That's what most of the in-kernel users want, and what most of the
> existing filesystems provide. If AFS can't give that guarantee then we
> can just omit exposing i_version on it.
> --
> Jeff Layton <[email protected]>

2022-08-16 16:19:50

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

On Tue, 2022-08-16 at 08:51 -0700, Darrick J. Wong wrote:
> On Tue, Aug 16, 2022 at 11:32:24AM -0400, Jeff Layton wrote:
> > On Tue, 2022-08-16 at 16:15 +0100, David Howells wrote:
> > > Jeff Layton <[email protected]> wrote:
> > >
> > > > I think we'll just have to ensure that before we expose this for any
> > > > filesystem that it conforms to some minimum standards. i.e.: it must
> > > > change if there are data or metadata changes to the inode, modulo atime
> > > > changes due to reads on regular files or readdir on dirs.
> > > >
> > > > The local filesystems, ceph and NFS should all be fine. I guess that
> > > > just leaves AFS. If it can't guarantee that, then we might want to avoid
> > > > exposing the counter for it.
> > >
> > > AFS monotonically increments the counter on data changes; doesn't make any
> > > change for metadata changes (other than the file size).
> > >
> > > But you can't assume NFS works as per your suggestion as you don't know what's
> > > backing it (it could be AFS, for example - there's a converter for that).
> > >
> >
> > In that case, the NFS server must synthesize a proper change attr. The
> > NFS spec mandates that it change on most metadata changes.
> >
> > > Further, for ordinary disk filesystems, two data changes may get elided and
> > > only increment the counter once.
> > >
> >
> > Not a problem as long as nothing queried the counter in between the
> > changes.
> >
> > > And then there's mmap...
> > >
> >
> > Not sure how that matters here.
> >
> > > It might be better to reduce the scope of your definition and just say that it
> > > must change if there's a data change and may also be changed if there's a
> > > metadata change.
> > >
> >
> > I'd prefer that we mandate that it change on metadata changes as well.
>
> ...in that case, why not leave the i_version bump in
> xfs_trans_log_inode? That will capture all changes to file data,
> attribues, and metadata. ;)
>
>

Because that includes changes to the atime due to reads which should be
specifically omitted. We could still keep that callsite instead, if you
can see some way to exclude those.

In practice, we are using a change to i_version to mean that "something
changed" in the inode, which usually implies a change to the ctime and
mtime.

Trond pointed out that the NFSv4 spec implies that time_access updates
should be omitted from what we consider to be "metadata" here:

https://mailarchive.ietf.org/arch/msg/nfsv4/yrRBMrVwWWDCrgHPAzq_yAEc7BU/

IMA (which is the only other in-kernel consumer of i_version) also wants
the same behavior.

> > That's what most of the in-kernel users want, and what most of the
> > existing filesystems provide. If AFS can't give that guarantee then we
> > can just omit exposing i_version on it.


--
Jeff Layton <[email protected]>

2022-08-18 20:27:17

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/4] vfs: report change attribute in statx for IS_I_VERSION inodes

On Tue, 2022-08-16 at 15:44 +0200, Christian Brauner wrote:
> On Tue, Aug 16, 2022 at 09:27:56AM -0400, Jeff Layton wrote:
> > From: Jeff Layton <[email protected]>
> >
> > Claim one of the spare fields in struct statx to hold a 64-bit change
> > attribute. When statx requests this attribute, do an
> > inode_query_iversion and fill the result in the field.
> >
> > Also update the test-statx.c program to display the change attribute and
> > the mountid as well.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/stat.c | 7 +++++++
> > include/linux/stat.h | 1 +
> > include/uapi/linux/stat.h | 3 ++-
> > samples/vfs/test-statx.c | 8 ++++++--
> > 4 files changed, 16 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/stat.c b/fs/stat.c
> > index 9ced8860e0f3..7c3d063c31ba 100644
> > --- a/fs/stat.c
> > +++ b/fs/stat.c
> > @@ -17,6 +17,7 @@
> > #include <linux/syscalls.h>
> > #include <linux/pagemap.h>
> > #include <linux/compat.h>
> > +#include <linux/iversion.h>
> >
> > #include <linux/uaccess.h>
> > #include <asm/unistd.h>
> > @@ -118,6 +119,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
> > stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
> > STATX_ATTR_DAX);
> >
> > + if ((request_mask & STATX_CHANGE_ATTR) && IS_I_VERSION(inode)) {
> > + stat->result_mask |= STATX_CHANGE_ATTR;
> > + stat->change_attr = inode_query_iversion(inode);
> > + }
> > +
> > mnt_userns = mnt_user_ns(path->mnt);
> > if (inode->i_op->getattr)
> > return inode->i_op->getattr(mnt_userns, path, stat,
> > @@ -611,6 +617,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
> > tmp.stx_dev_major = MAJOR(stat->dev);
> > tmp.stx_dev_minor = MINOR(stat->dev);
> > tmp.stx_mnt_id = stat->mnt_id;
> > + tmp.stx_change_attr = stat->change_attr;
> >
> > return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
> > }
> > diff --git a/include/linux/stat.h b/include/linux/stat.h
> > index 7df06931f25d..7b444c2ad0ad 100644
> > --- a/include/linux/stat.h
> > +++ b/include/linux/stat.h
> > @@ -50,6 +50,7 @@ struct kstat {
> > struct timespec64 btime; /* File creation time */
> > u64 blocks;
> > u64 mnt_id;
> > + u64 change_attr;
> > };
> >
> > #endif
> > diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
> > index 1500a0f58041..fd839ec76aa4 100644
> > --- a/include/uapi/linux/stat.h
> > +++ b/include/uapi/linux/stat.h
> > @@ -124,7 +124,7 @@ struct statx {
> > __u32 stx_dev_minor;
> > /* 0x90 */
> > __u64 stx_mnt_id;
> > - __u64 __spare2;
> > + __u64 stx_change_attr; /* Inode change attribute */
> > /* 0xa0 */
> > __u64 __spare3[12]; /* Spare space for future expansion */
> > /* 0x100 */
> > @@ -152,6 +152,7 @@ struct statx {
> > #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
> > #define STATX_BTIME 0x00000800U /* Want/got stx_btime */
> > #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */
> > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */
>
> I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag
> and field. Or I fail to understand what exact information this will
> expose and how userspace will consume it.
> To me the naming gives the impression that some set of generic
> attributes have changed but given that statx is about querying file
> attributes this becomes confusing.
>
> Wouldn't it make more sense this time to expose it as what it is and
> call this STATX_INO_VERSION and __u64 stx_ino_version?

Ok, having thought about this some more, I think this is a reasonable
name. It _does_ sort of imply that this value will increase over time.
That's true of all of the existing implementations, but I think we ought
to define such that there could be alternative implementations.

I'll respin this patch and resend it with a wider audience.

Thanks for the input so far!
--
Jeff Layton <[email protected]>