When an i_op->getattr() call is made on an NFS file
(typically from a 'stat' family system call), NFS
will first flush any dirty data to the server.
This ensures that the mtime reported is correct and stable,
but has a performance penalty. 'stat' is normally thought
to be a quick operation, and imposing this cost can be
surprising.
I have seen problems when one process is writing a large
file and another process performs "ls -l" on the containing
directory and is blocked for as long as it takes to flush
all the dirty data to the server, which can be minutes.
I have also seen a legacy application which frequently calls
"fstat" on a file that it is writing to. On a local
filesystem (and in the Solaris implementation of NFS) this
fstat call is cheap. On Linux/NFS, this causes a noticeable
decrease in throughput.
The only circumstances where an application calling 'stat()'
might get an mtime which is not stable are times when some
other process is writing to the file and the two processes
are not using locking to ensure consistency, or when the one
process is both writing and stating. In neither of these
cases is it reasonable to expect the mtime to be stable.
In the most common cases where mtime is important
(e.g. make), no other process has the file open, so there
will be no dirty data and the mtime will be stable.
Rather than unilaterally changing this behavior of 'stat',
this patch adds a "nostatflush" mount option so that
sysadmins whose applications are hurt by the current
behavior can disable it.
Note that this option should probably *not* be used together
with "nocto". In that case, mtime could be unstable even
when no process has the file open.
Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/inode.c | 3 ++-
fs/nfs/super.c | 10 ++++++++++
include/uapi/linux/nfs_mount.h | 6 ++++--
3 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b992d2382ffa..16629a34dd62 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -740,7 +740,8 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
- if (S_ISREG(inode->i_mode)) {
+ if (S_ISREG(inode->i_mode) &&
+ !(NFS_SERVER(inode)->flags & NFS_MOUNT_NOSTATFLUSH)) {
err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto out;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 29bacdc56f6a..2351c0be98f5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -90,6 +90,7 @@ enum {
Opt_resvport, Opt_noresvport,
Opt_fscache, Opt_nofscache,
Opt_migration, Opt_nomigration,
+ Opt_statflush, Opt_nostatflush,
/* Mount options that take integer arguments */
Opt_port,
@@ -151,6 +152,8 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_nofscache, "nofsc" },
{ Opt_migration, "migration" },
{ Opt_nomigration, "nomigration" },
+ { Opt_statflush, "statflush" },
+ { Opt_nostatflush, "nostatflush" },
{ Opt_port, "port=%s" },
{ Opt_rsize, "rsize=%s" },
@@ -637,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
{ NFS_MOUNT_UNSHARED, ",nosharecache", "" },
{ NFS_MOUNT_NORESVPORT, ",noresvport", "" },
+ { NFS_MOUNT_NOSTATFLUSH, ",nostatflush", "" },
{ 0, NULL, NULL }
};
const struct proc_nfs_info *nfs_infop;
@@ -1334,6 +1338,12 @@ static int nfs_parse_mount_options(char *raw,
case Opt_nomigration:
mnt->options &= ~NFS_OPTION_MIGRATION;
break;
+ case Opt_statflush:
+ mnt->flags &= ~NFS_MOUNT_NOSTATFLUSH;
+ break;
+ case Opt_nostatflush:
+ mnt->flags |= NFS_MOUNT_NOSTATFLUSH;
+ break;
/*
* options that take numeric values
diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h
index e44e00616ab5..d7c6f809d25d 100644
--- a/include/uapi/linux/nfs_mount.h
+++ b/include/uapi/linux/nfs_mount.h
@@ -72,7 +72,9 @@ struct nfs_mount_data {
#define NFS_MOUNT_NORESVPORT 0x40000
#define NFS_MOUNT_LEGACY_INTERFACE 0x80000
-#define NFS_MOUNT_LOCAL_FLOCK 0x100000
-#define NFS_MOUNT_LOCAL_FCNTL 0x200000
+#define NFS_MOUNT_LOCAL_FLOCK 0x100000
+#define NFS_MOUNT_LOCAL_FCNTL 0x200000
+
+#define NFS_MOUNT_NOSTATFLUSH 0x400000
#endif
--
2.14.0.rc0.dirty
Hi Neil-
> On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]> wrote:
>
>
> When an i_op->getattr() call is made on an NFS file
> (typically from a 'stat' family system call), NFS
> will first flush any dirty data to the server.
>
> This ensures that the mtime reported is correct and stable,
> but has a performance penalty. 'stat' is normally thought
> to be a quick operation, and imposing this cost can be
> surprising.
To be clear, this behavior is a POSIX requirement.
> I have seen problems when one process is writing a large
> file and another process performs "ls -l" on the containing
> directory and is blocked for as long as it take to flush
> all the dirty data to the server, which can be minutes.
Yes, a well-known annoyance that cannot be addressed
even with a write delegation.
> I have also seen a legacy application which frequently calls
> "fstat" on a file that it is writing to. On a local
> filesystem (and in the Solaris implementation of NFS) this
> fstat call is cheap. On Linux/NFS, the causes a noticeable
> decrease in throughput.
If the preceding write is small, Linux could be using
a FILE_SYNC write, but Solaris could be using UNSTABLE.
> The only circumstances where an application calling 'stat()'
> might get an mtime which is not stable are times when some
> other process is writing to the file and the two processes
> are not using locking to ensure consistency, or when the one
> process is both writing and stating. In neither of these
> cases is it reasonable to expect the mtime to be stable.
I'm not convinced this is a strong enough rationale
for claiming it is safe to disable the existing
behavior.
You've explained cases where the new behavior is
reasonable, but do you have any examples where the
new behavior would be a problem? There must be a
reason why POSIX explicitly requires an up-to-date
mtime.
What guidance would nfs(5) give on when it is safe
to specify the new mount option?
> In the most common cases where mtime is important
> (e.g. make), no other process has the file open, so there
> will be no dirty data and the mtime will be stable.
Isn't it also the case that make is a multi-process
workload where one process modifies a file, then
closes it (which triggers a flush), and then another
process stats the file? The new mount option does
not change the behavior of close(2), does it?
> Rather than unilaterally changing this behavior of 'stat',
> this patch adds a "nosyncflush" mount option to allow
> sysadmins to have applications which are hurt by the current
> behavior to disable it.
IMO a mount option is at the wrong granularity. A
mount point will be shared between applications that
can tolerate the non-POSIX behavior and those that
cannot, for instance.
I would rather see us address the underlying cause
of the delay, which is that the GETATTR gets starved
by an ongoing stream of WRITE requests. You could:
- Make active writers wait for the GETATTR so the
GETATTR is not starved
- Start flushing dirty pages earlier so there is
less to flush when a stat(2) is done
- Ensure that GETATTR is not also waiting for a
COMMIT
Or maybe there's some other problem?
I recall nfs_getattr used to grab i_mutex to hold
up active writers. But i_mutex is gone now. Is
there some other mechanism that can block writers
while the client flushes the file and handles the
GETATTR request?
> Note that this option should probably *not* be used together
> with "nocto". In that case, mtime could be unstable even
> when no process has the file open.
>
> Signed-off-by: NeilBrown <[email protected]>
> ---
> fs/nfs/inode.c | 3 ++-
> fs/nfs/super.c | 10 ++++++++++
> include/uapi/linux/nfs_mount.h | 6 ++++--
> 3 files changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index b992d2382ffa..16629a34dd62 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -740,7 +740,8 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
>
> trace_nfs_getattr_enter(inode);
> /* Flush out writes to the server in order to update c/mtime. */
> - if (S_ISREG(inode->i_mode)) {
> + if (S_ISREG(inode->i_mode) &&
> + !(NFS_SERVER(inode)->flags & NFS_MOUNT_NOSTATFLUSH)) {
> err = filemap_write_and_wait(inode->i_mapping);
> if (err)
> goto out;
> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
> index 29bacdc56f6a..2351c0be98f5 100644
> --- a/fs/nfs/super.c
> +++ b/fs/nfs/super.c
> @@ -90,6 +90,7 @@ enum {
> Opt_resvport, Opt_noresvport,
> Opt_fscache, Opt_nofscache,
> Opt_migration, Opt_nomigration,
> + Opt_statflush, Opt_nostatflush,
>
> /* Mount options that take integer arguments */
> Opt_port,
> @@ -151,6 +152,8 @@ static const match_table_t nfs_mount_option_tokens = {
> { Opt_nofscache, "nofsc" },
> { Opt_migration, "migration" },
> { Opt_nomigration, "nomigration" },
> + { Opt_statflush, "statflush" },
> + { Opt_nostatflush, "nostatflush" },
>
> { Opt_port, "port=%s" },
> { Opt_rsize, "rsize=%s" },
> @@ -637,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
> { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
> { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
> { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
> + { NFS_MOUNT_NOSTATFLUSH, ",nostatflush", "" },
> { 0, NULL, NULL }
> };
> const struct proc_nfs_info *nfs_infop;
> @@ -1334,6 +1338,12 @@ static int nfs_parse_mount_options(char *raw,
> case Opt_nomigration:
> mnt->options &= ~NFS_OPTION_MIGRATION;
> break;
> + case Opt_statflush:
> + mnt->flags &= ~NFS_MOUNT_NOSTATFLUSH;
> + break;
> + case Opt_nostatflush:
> + mnt->flags |= NFS_MOUNT_NOSTATFLUSH;
> + break;
>
> /*
> * options that take numeric values
> diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h
> index e44e00616ab5..d7c6f809d25d 100644
> --- a/include/uapi/linux/nfs_mount.h
> +++ b/include/uapi/linux/nfs_mount.h
> @@ -72,7 +72,9 @@ struct nfs_mount_data {
> #define NFS_MOUNT_NORESVPORT 0x40000
> #define NFS_MOUNT_LEGACY_INTERFACE 0x80000
>
> -#define NFS_MOUNT_LOCAL_FLOCK 0x100000
> -#define NFS_MOUNT_LOCAL_FCNTL 0x200000
> +#define NFS_MOUNT_LOCAL_FLOCK 0x100000
> +#define NFS_MOUNT_LOCAL_FCNTL 0x200000
> +
> +#define NFS_MOUNT_NOSTATFLUSH 0x400000
>
> #endif
> --
> 2.14.0.rc0.dirty
>
--
Chuck Lever
On Thu, 2017-12-21 at 10:39 -0500, Chuck Lever wrote:
> IMO a mount option is at the wrong granularity. A
> mount point will be shared between applications that
> can tolerate the non-POSIX behavior and those that
> cannot, for instance.
Agreed.
The other thing to note here is that we now have an embryonic statx()
system call, which allows the application itself to decide whether or
not it needs up to date values for the atime/ctime/mtime. While we
haven't yet plumbed in the NFS side, the intention was always to use
that information to turn off the writeback flushing when possible.
Cheers
  Trond
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Thu, Dec 21 2017, Chuck Lever wrote:
> Hi Neil-
>
>
>> On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]> wrote:
>>
>>
>> When an i_op->getattr() call is made on an NFS file
>> (typically from a 'stat' family system call), NFS
>> will first flush any dirty data to the server.
>>
>> This ensures that the mtime reported is correct and stable,
>> but has a performance penalty. 'stat' is normally thought
>> to be a quick operation, and imposing this cost can be
>> surprising.
>
> To be clear, this behavior is a POSIX requirement.
Ah, that would be:
http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_09
which says:
All timestamps that are marked for update shall be updated when the
file ceases to be open by any process or before a fstat(), fstatat(),
fsync(), futimens(), lstat(), stat(), utime(), utimensat(), or
utimes() is successfully performed on the file.
Suppose that when handling a stat(), if we find there are dirty pages,
we send a SETATTR to the server to set the mtime to the current time,
and use the result for the mtime? That would avoid the costly flush, but still
give the appearance of the mtime being correctly updated.
>
>
>> I have seen problems when one process is writing a large
>> file and another process performs "ls -l" on the containing
>> directory and is blocked for as long as it take to flush
>> all the dirty data to the server, which can be minutes.
>
> Yes, a well-known annoyance that cannot be addressed
> even with a write delegation.
>
>
>> I have also seen a legacy application which frequently calls
>> "fstat" on a file that it is writing to. On a local
>> filesystem (and in the Solaris implementation of NFS) this
>> fstat call is cheap. On Linux/NFS, the causes a noticeable
>> decrease in throughput.
>
> If the preceding write is small, Linux could be using
> a FILE_SYNC write, but Solaris could be using UNSTABLE.
No, what is actually happening is that Linux is flushing out data and
Solaris isn't. There are differences in stability and in write size,
but they aren't the main contributors to the performance difference.
>
>
>> The only circumstances where an application calling 'stat()'
>> might get an mtime which is not stable are times when some
>> other process is writing to the file and the two processes
>> are not using locking to ensure consistency, or when the one
>> process is both writing and stating. In neither of these
>> cases is it reasonable to expect the mtime to be stable.
>
> I'm not convinced this is a strong enough rationale
> for claiming it is safe to disable the existing
> behavior.
>
> You've explained cases where the new behavior is
> reasonable, but do you have any examples where the
> new behavior would be a problem? There must be a
> reason why POSIX explicitly requires an up-to-date
> mtime.
I honestly cannot think of any credible scenario where the current
behaviour would be required.
If I write to a file from one NFS client, and request a stat() from
another NFS client, the stat doesn't flush any data (though it could
with NFSv4 if write delegations were being given). So depending on a
stat() returning precise timestamps when there is no other co-ordination
between processes will already fail. Why do we need to provide a
guarantee to processes running on the same client that we don't provide
to processes running on different clients?
>
> What guidance would nfs(5) give on when it is safe
> to specify the new mount option?
That is an excellent question to which I don't have an excellent answer.
nostatflush: A strict reading of the Posix specification requires that
any cached writes be flushed to the server before responding to
stat() and related operations, so that the timestamps are accurate
and stable. NFS does not guarantee this between applications on
different clients, but does when the writer and the caller of stat()
are on the same host. This flush can sometimes negatively impact
performance. Specifying the nostatflush option causes NFS to ignore
this requirement of Posix. A stat call when there is dirty data may
report a different modify timestamp from the one that would be reported
after that data was subsequently flushed to the server. There are no
known use cases where this inconsistency would cause a problem, but as
avoiding it is a Posix requirement, the default behavior is to force a
flush on every stat() call.
???
>
>
>> In the most common cases where mtime is important
>> (e.g. make), no other process has the file open, so there
>> will be no dirty data and the mtime will be stable.
>
> Isn't it also the case that make is a multi-process
> workload where one process modifies a file, then
> closes it (which triggers a flush), and then another
> process stats the file? The new mount option does
> not change the behavior of close(2), does it?
No, the mount option doesn't change the behaviour of close(2).
A separate mount option (nocto) can do that.
I think your point is that close() can be delayed by flush in the same
way that stat() can. I think that is true, but not very relevant.
stat() can be called more often than close() is likely to be, and
close() only imposes a delay on the process that did the writing, after
it has finished writing. stat() can impose a delay on arbitrary other
processes and at other times.
>
>
>> Rather than unilaterally changing this behavior of 'stat',
>> this patch adds a "nosyncflush" mount option to allow
>> sysadmins to have applications which are hurt by the current
>> behavior to disable it.
>
> IMO a mount option is at the wrong granularity. A
> mount point will be shared between applications that
> can tolerate the non-POSIX behavior and those that
> cannot, for instance.
It is a better granularity than a module parameter, which was my first
approach ;-)
Yes, if we could create a control-group which avoided flush-on-stat, and
ran the problematic programs in that control group, that might be ideal
... for some values of "ideal".
You *could* mount the same filesystem in two places with nosharecache
and with only one having nosyncflush. But you probably wouldn't.
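(Concretely, and purely as an illustration with made-up paths: mount the
export twice, e.g. "mount -o nosharecache server:/export /mnt/posix" and
"mount -o nosharecache,nostatflush server:/export /mnt/relaxed", and
point only the problematic application at the second mount point.)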
>
> I would rather see us address the underlying cause
> of the delay, which is that the GETATTR gets starved
> by an ongoing stream of WRITE requests. You could:
I don't think starving is the issue.
Of the two specific cases that I have seen,
in one the issue was that the dirty_ratio multiplied by the amount of
memory, divided by the throughput to the server, resulted in many
minutes to flush out all the dirty data. Having "ls -l" wait for that
flush was quite inconvenient. (We asked the customer to tune dirty_ratio
way down).
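(To put rough numbers on that, as an illustration rather than the
customer's actual figures: with 64GiB of RAM and the default
vm.dirty_ratio of 20, up to about 12.8GiB of dirty pages can
accumulate; pushing that to the server at ~100MB/s takes a bit over
two minutes, and that is how long the "ls -l" can block.)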
In the other (more recent) case the file being written was some sort of
data-base being restored from a dump (or something like that). So some
blocks (presumably indexing blocks) were being written multiple times.
When we disabled flush-on-sync the total number of bytes written went
down by about 20% (vague memory .. certainly a non-trivial amount)
because these index blocks were only written to the server once instead
of multiple times. So that flush-on-stat slowed down throughput in part
by pushing more data over the wire.
>
> - Make active writers wait for the GETATTR so the
> GETATTR is not starved
>
> - Start flushing dirty pages earlier so there is
> less to flush when a stat(2) is done
>
> - Ensure that GETATTR is not also waiting for a
> COMMIT
>
> Or maybe there's some other problem?
>
> I recall nfs_getattr used to grab i_mutex to hold
> up active writers. But i_mutex is gone now. Is
> there some other mechanism that can block writers
> while the client flushes the file and handles the
> GETATTR request?
I vaguely remember when i_mutex locking was added to nfs_getattr to
avoid the starvation :-)
Today nfs_getattr() calls filemap_write_and_wait() which first triggers
writes on all dirty pages, then waits for all writeback pages.
The first stage should be fairly quick, and writes during the second
stage shouldn't slow it down. If some process were calling fsync often
(e.g. using O_SYNC) that might starve nfs_getattr(), but I don't think
normal usage would.
I do agree that a "real" fix would be better than a mount option.
The only path to a "real" fix that I can think of is to provide some
alternate source of producing a credible mtime without flushing all the
data.
Might a SETATTR using SET_TO_SERVER_TIME be a workable way forward?
Thanks for your thoughts!
NeilBrown
>
>
>> Note that this option should probably *not* be used together
>> with "nocto". In that case, mtime could be unstable even
>> when no process has the file open.
>>
>> Signed-off-by: NeilBrown <[email protected]>
>> ---
>> fs/nfs/inode.c | 3 ++-
>> fs/nfs/super.c | 10 ++++++++++
>> include/uapi/linux/nfs_mount.h | 6 ++++--
>> 3 files changed, 16 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>> index b992d2382ffa..16629a34dd62 100644
>> --- a/fs/nfs/inode.c
>> +++ b/fs/nfs/inode.c
>> @@ -740,7 +740,8 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
>>
>> trace_nfs_getattr_enter(inode);
>> /* Flush out writes to the server in order to update c/mtime. */
>> - if (S_ISREG(inode->i_mode)) {
>> + if (S_ISREG(inode->i_mode) &&
>> + !(NFS_SERVER(inode)->flags & NFS_MOUNT_NOSTATFLUSH)) {
>> err = filemap_write_and_wait(inode->i_mapping);
>> if (err)
>> goto out;
>> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
>> index 29bacdc56f6a..2351c0be98f5 100644
>> --- a/fs/nfs/super.c
>> +++ b/fs/nfs/super.c
>> @@ -90,6 +90,7 @@ enum {
>> Opt_resvport, Opt_noresvport,
>> Opt_fscache, Opt_nofscache,
>> Opt_migration, Opt_nomigration,
>> + Opt_statflush, Opt_nostatflush,
>>
>> /* Mount options that take integer arguments */
>> Opt_port,
>> @@ -151,6 +152,8 @@ static const match_table_t nfs_mount_option_tokens = {
>> { Opt_nofscache, "nofsc" },
>> { Opt_migration, "migration" },
>> { Opt_nomigration, "nomigration" },
>> + { Opt_statflush, "statflush" },
>> + { Opt_nostatflush, "nostatflush" },
>>
>> { Opt_port, "port=%s" },
>> { Opt_rsize, "rsize=%s" },
>> @@ -637,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
>> { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
>> { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
>> { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
>> + { NFS_MOUNT_NOSTATFLUSH, ",nostatflush", "" },
>> { 0, NULL, NULL }
>> };
>> const struct proc_nfs_info *nfs_infop;
>> @@ -1334,6 +1338,12 @@ static int nfs_parse_mount_options(char *raw,
>> case Opt_nomigration:
>> mnt->options &= ~NFS_OPTION_MIGRATION;
>> break;
>> + case Opt_statflush:
>> + mnt->flags &= ~NFS_MOUNT_NOSTATFLUSH;
>> + break;
>> + case Opt_nostatflush:
>> + mnt->flags |= NFS_MOUNT_NOSTATFLUSH;
>> + break;
>>
>> /*
>> * options that take numeric values
>> diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h
>> index e44e00616ab5..d7c6f809d25d 100644
>> --- a/include/uapi/linux/nfs_mount.h
>> +++ b/include/uapi/linux/nfs_mount.h
>> @@ -72,7 +72,9 @@ struct nfs_mount_data {
>> #define NFS_MOUNT_NORESVPORT 0x40000
>> #define NFS_MOUNT_LEGACY_INTERFACE 0x80000
>>
>> -#define NFS_MOUNT_LOCAL_FLOCK 0x100000
>> -#define NFS_MOUNT_LOCAL_FCNTL 0x200000
>> +#define NFS_MOUNT_LOCAL_FLOCK 0x100000
>> +#define NFS_MOUNT_LOCAL_FCNTL 0x200000
>> +
>> +#define NFS_MOUNT_NOSTATFLUSH 0x400000
>>
>> #endif
>> --
>> 2.14.0.rc0.dirty
>>
>
> --
> Chuck Lever
On Thu, Dec 21 2017, Trond Myklebust wrote:
> On Thu, 2017-12-21 at 10:39 -0500, Chuck Lever wrote:
>> Hi Neil-
>>
>>
>> > On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]> wrote:
>> >
>> >
>> > When an i_op->getattr() call is made on an NFS file
>> > (typically from a 'stat' family system call), NFS
>> > will first flush any dirty data to the server.
>> >
>> > This ensures that the mtime reported is correct and stable,
>> > but has a performance penalty. 'stat' is normally thought
>> > to be a quick operation, and imposing this cost can be
>> > surprising.
>>
>> To be clear, this behavior is a POSIX requirement.
>>
>>
>> > I have seen problems when one process is writing a large
>> > file and another process performs "ls -l" on the containing
>> > directory and is blocked for as long as it take to flush
>> > all the dirty data to the server, which can be minutes.
>>
>> Yes, a well-known annoyance that cannot be addressed
>> even with a write delegation.
>>
>>
>> > I have also seen a legacy application which frequently calls
>> > "fstat" on a file that it is writing to. On a local
>> > filesystem (and in the Solaris implementation of NFS) this
>> > fstat call is cheap. On Linux/NFS, the causes a noticeable
>> > decrease in throughput.
>>
>> If the preceding write is small, Linux could be using
>> a FILE_SYNC write, but Solaris could be using UNSTABLE.
>>
>>
>> > The only circumstances where an application calling 'stat()'
>> > might get an mtime which is not stable are times when some
>> > other process is writing to the file and the two processes
>> > are not using locking to ensure consistency, or when the one
>> > process is both writing and stating. In neither of these
>> > cases is it reasonable to expect the mtime to be stable.
>>
>> I'm not convinced this is a strong enough rationale
>> for claiming it is safe to disable the existing
>> behavior.
>>
>> You've explained cases where the new behavior is
>> reasonable, but do you have any examples where the
>> new behavior would be a problem? There must be a
>> reason why POSIX explicitly requires an up-to-date
>> mtime.
>>
>> What guidance would nfs(5) give on when it is safe
>> to specify the new mount option?
>>
>>
>> > In the most common cases where mtime is important
>> > (e.g. make), no other process has the file open, so there
>> > will be no dirty data and the mtime will be stable.
>>
>> Isn't it also the case that make is a multi-process
>> workload where one process modifies a file, then
>> closes it (which triggers a flush), and then another
>> process stats the file? The new mount option does
>> not change the behavior of close(2), does it?
>>
>>
>> > Rather than unilaterally changing this behavior of 'stat',
>> > this patch adds a "nosyncflush" mount option to allow
>> > sysadmins to have applications which are hurt by the current
>> > behavior to disable it.
>>
>> IMO a mount option is at the wrong granularity. A
>> mount point will be shared between applications that
>> can tolerate the non-POSIX behavior and those that
>> cannot, for instance.
>
> Agreed.
>
> The other thing to note here is that we now have an embryonic statx()
> system call, which allows the application itself to decide whether or
> not it needs up to date values for the atime/ctime/mtime. While we
> haven't yet plumbed in the NFS side, the intention was always to use
> that information to turn off the writeback flushing when possible.
Yes, if statx() were actually working, we could change the application
to avoid the flush. But then if changing the application were an
option, I suspect that - for my current customer issue - we could just
remove the fstat() calls. I doubt they are really necessary.
I think programmers often think of stat() (and particularly fstat()) as
fairly cheap and so they use it whenever convenient. Only NFS violates
this expectation.
Also statx() is only a real solution if/when it gets widely used. Will
"ls -l" default to AT_STATX_DONT_SYNC ??
Apart from the Posix requirement (which only requires that the
timestamps be updated, not that the data be flushed), do you know of any
value gained from flushing data before stat()?
Thanks,
NeilBrown
On Fri, 2017-12-22 at 07:59 +1100, NeilBrown wrote:
> On Thu, Dec 21 2017, Trond Myklebust wrote:
>
> > On Thu, 2017-12-21 at 10:39 -0500, Chuck Lever wrote:
> > > Hi Neil-
> > >
> > >
> > > > On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]> wrote:
> > > >
> > > >
> > > > When an i_op->getattr() call is made on an NFS file
> > > > (typically from a 'stat' family system call), NFS
> > > > will first flush any dirty data to the server.
> > > >
> > > > This ensures that the mtime reported is correct and stable,
> > > > but has a performance penalty. 'stat' is normally thought
> > > > to be a quick operation, and imposing this cost can be
> > > > surprising.
> > >
> > > To be clear, this behavior is a POSIX requirement.
> > >
> > >
> > > > I have seen problems when one process is writing a large
> > > > file and another process performs "ls -l" on the containing
> > > > directory and is blocked for as long as it take to flush
> > > > all the dirty data to the server, which can be minutes.
> > >
> > > Yes, a well-known annoyance that cannot be addressed
> > > even with a write delegation.
> > >
> > >
> > > > I have also seen a legacy application which frequently calls
> > > > "fstat" on a file that it is writing to. On a local
> > > > filesystem (and in the Solaris implementation of NFS) this
> > > > fstat call is cheap. On Linux/NFS, the causes a noticeable
> > > > decrease in throughput.
> > >
> > > If the preceding write is small, Linux could be using
> > > a FILE_SYNC write, but Solaris could be using UNSTABLE.
> > >
> > >
> > > > The only circumstances where an application calling 'stat()'
> > > > might get an mtime which is not stable are times when some
> > > > other process is writing to the file and the two processes
> > > > are not using locking to ensure consistency, or when the one
> > > > process is both writing and stating. In neither of these
> > > > cases is it reasonable to expect the mtime to be stable.
> > >
> > > I'm not convinced this is a strong enough rationale
> > > for claiming it is safe to disable the existing
> > > behavior.
> > >
> > > You've explained cases where the new behavior is
> > > reasonable, but do you have any examples where the
> > > new behavior would be a problem? There must be a
> > > reason why POSIX explicitly requires an up-to-date
> > > mtime.
> > >
> > > What guidance would nfs(5) give on when it is safe
> > > to specify the new mount option?
> > >
> > >
> > > > In the most common cases where mtime is important
> > > > (e.g. make), no other process has the file open, so there
> > > > will be no dirty data and the mtime will be stable.
> > >
> > > Isn't it also the case that make is a multi-process
> > > workload where one process modifies a file, then
> > > closes it (which triggers a flush), and then another
> > > process stats the file? The new mount option does
> > > not change the behavior of close(2), does it?
> > >
> > >
> > > > Rather than unilaterally changing this behavior of 'stat',
> > > > this patch adds a "nosyncflush" mount option to allow
> > > > sysadmins to have applications which are hurt by the current
> > > > behavior to disable it.
> > >
> > > IMO a mount option is at the wrong granularity. A
> > > mount point will be shared between applications that
> > > can tolerate the non-POSIX behavior and those that
> > > cannot, for instance.
> >
> > Agreed.
> >
> > The other thing to note here is that we now have an embryonic
> > statx()
> > system call, which allows the application itself to decide whether
> > or
> > not it needs up to date values for the atime/ctime/mtime. While we
> > haven't yet plumbed in the NFS side, the intention was always to
> > use
> > that information to turn off the writeback flushing when possible.
>
> Yes, if statx() were actually working, we could change the
> application
> to avoid the flush. But then if changing the application were an
> option, I suspect that - for my current customer issue - we could
> just
> remove the fstat() calls. I doubt they are really necessary.
> I think programmers often think of stat() (and particularly fstat())
> as
> fairly cheap and so they use it whenever convenient. Only NFS
> violates
> this expectation.
>
> Also statx() is only a real solution if/when it gets widely
> used. Will
> "ls -l" default to AT_STATX_DONT_SYNC ??
>
> Apart from the Posix requirement (which only requires that the
> timestamps be updated, not that the data be flushed), do you know of
> any
> value gained from flushing data before stat()?
>
POSIX requires that timestamps change as part of the read() or write()
system call.
IOW: if I do the following
1. fd = open("foo", O_WRONLY)
2. fstat(fd, &buf1);
3. write(fd, "bar", 3)
4. fstat(fd, &buf2);
5. fstat(fd, &buf3);
then the requirement translates into the following:
* The timestamps returned in 'buf1' should differ from the timestamps
returned in 'buf2'.
* However 'buf2' and 'buf3' must contain the same values.
The flush performed by the NFS client in order to service the stat() in
(4) ensures that the timestamps get updated on the server (as part of
servicing the WRITE rpc call), which normally ensures that their values
won't change later on.
Note, however, that the client does cheat a little by only flushing the
WRITEs, and by not waiting for a COMMIT to be issued. That is generally
safe, but it means that the server could reboot before we have time to
persist the data+metadata. If that were to happen, then the replay of
the WRITEs might indeed cause the timestamps to change in (5) (in
contravention of POSIX rules).
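(Spelling that sequence out as a small test program makes it easy to
observe on an NFS mount; the file name is hypothetical and error
checking is omitted for brevity. Step 4 is where today's client
flushes.)

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat buf1, buf2, buf3;
	int fd = open("foo", O_WRONLY | O_CREAT, 0644);	/* 1 */

	if (fd < 0)
		return 1;
	fstat(fd, &buf1);				/* 2 */
	write(fd, "bar", 3);				/* 3 */
	fstat(fd, &buf2);	/* 4: the client flushes dirty pages here */
	fstat(fd, &buf3);				/* 5 */

	/* buf1 vs buf2 should show the write; buf2 and buf3 must match */
	printf("mtime1 %lld.%09ld\n", (long long)buf1.st_mtim.tv_sec, buf1.st_mtim.tv_nsec);
	printf("mtime2 %lld.%09ld\n", (long long)buf2.st_mtim.tv_sec, buf2.st_mtim.tv_nsec);
	printf("mtime3 %lld.%09ld\n", (long long)buf3.st_mtim.tv_sec, buf3.st_mtim.tv_nsec);
	close(fd);
	return 0;
}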
Cheers
Trond
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Thu, Dec 21 2017, Trond Myklebust wrote:
> On Fri, 2017-12-22 at 07:59 +1100, NeilBrown wrote:
>> On Thu, Dec 21 2017, Trond Myklebust wrote:
>>
>> > On Thu, 2017-12-21 at 10:39 -0500, Chuck Lever wrote:
>> > > Hi Neil-
>> > >
>> > >
>> > > > On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]> wrote:
>> > > >
>> > > >
>> > > > When an i_op->getattr() call is made on an NFS file
>> > > > (typically from a 'stat' family system call), NFS
>> > > > will first flush any dirty data to the server.
>> > > >
>> > > > This ensures that the mtime reported is correct and stable,
>> > > > but has a performance penalty. 'stat' is normally thought
>> > > > to be a quick operation, and imposing this cost can be
>> > > > surprising.
>> > >
>> > > To be clear, this behavior is a POSIX requirement.
>> > >
>> > >
>> > > > I have seen problems when one process is writing a large
>> > > > file and another process performs "ls -l" on the containing
>> > > > directory and is blocked for as long as it take to flush
>> > > > all the dirty data to the server, which can be minutes.
>> > >
>> > > Yes, a well-known annoyance that cannot be addressed
>> > > even with a write delegation.
>> > >
>> > >
>> > > > I have also seen a legacy application which frequently calls
>> > > > "fstat" on a file that it is writing to. On a local
>> > > > filesystem (and in the Solaris implementation of NFS) this
>> > > > fstat call is cheap. On Linux/NFS, the causes a noticeable
>> > > > decrease in throughput.
>> > >
>> > > If the preceding write is small, Linux could be using
>> > > a FILE_SYNC write, but Solaris could be using UNSTABLE.
>> > >
>> > >
>> > > > The only circumstances where an application calling 'stat()'
>> > > > might get an mtime which is not stable are times when some
>> > > > other process is writing to the file and the two processes
>> > > > are not using locking to ensure consistency, or when the one
>> > > > process is both writing and stating. In neither of these
>> > > > cases is it reasonable to expect the mtime to be stable.
>> > >
>> > > I'm not convinced this is a strong enough rationale
>> > > for claiming it is safe to disable the existing
>> > > behavior.
>> > >
>> > > You've explained cases where the new behavior is
>> > > reasonable, but do you have any examples where the
>> > > new behavior would be a problem? There must be a
>> > > reason why POSIX explicitly requires an up-to-date
>> > > mtime.
>> > >
>> > > What guidance would nfs(5) give on when it is safe
>> > > to specify the new mount option?
>> > >
>> > >
>> > > > In the most common cases where mtime is important
>> > > > (e.g. make), no other process has the file open, so there
>> > > > will be no dirty data and the mtime will be stable.
>> > >
>> > > Isn't it also the case that make is a multi-process
>> > > workload where one process modifies a file, then
>> > > closes it (which triggers a flush), and then another
>> > > process stats the file? The new mount option does
>> > > not change the behavior of close(2), does it?
>> > >
>> > >
>> > > > Rather than unilaterally changing this behavior of 'stat',
>> > > > this patch adds a "nosyncflush" mount option to allow
>> > > > sysadmins to have applications which are hurt by the current
>> > > > behavior to disable it.
>> > >
>> > > IMO a mount option is at the wrong granularity. A
>> > > mount point will be shared between applications that
>> > > can tolerate the non-POSIX behavior and those that
>> > > cannot, for instance.
>> >
>> > Agreed.
>> >
>> > The other thing to note here is that we now have an embryonic
>> > statx()
>> > system call, which allows the application itself to decide whether
>> > or
>> > not it needs up to date values for the atime/ctime/mtime. While we
>> > haven't yet plumbed in the NFS side, the intention was always to
>> > use
>> > that information to turn off the writeback flushing when possible.
>>
>> Yes, if statx() were actually working, we could change the
>> application
>> to avoid the flush. But then if changing the application were an
>> option, I suspect that - for my current customer issue - we could
>> just
>> remove the fstat() calls. I doubt they are really necessary.
>> I think programmers often think of stat() (and particularly fstat())
>> as
>> fairly cheap and so they use it whenever convenient. Only NFS
>> violates
>> this expectation.
>>
>> Also statx() is only a real solution if/when it gets widely
>> used. Will
>> "ls -l" default to AT_STATX_DONT_SYNC ??
>>
>> Apart from the Posix requirement (which only requires that the
>> timestamps be updated, not that the data be flushed), do you know of
>> any
>> value gained from flushing data before stat()?
>>
>
> POSIX requires that timestamps change as part of the read() or write()
> system call.
Does it require that they don't change at other times?
I see the (arguable) deviation from POSIX that I propose to be
completely in line with the close-to-open consistency semantics that NFS
already uses, which are weaker than what POSIX might suggest.
As you didn't exactly answer the question, would it be fair to say that
you don't know of any reason to flush-before-stat except that POSIX
seems to require it?
Thanks,
NeilBrown
>
> IOW: if I do the following
>
> 1. fd = open("foo", O_WRONLY)
> 2. fstat(fd, &buf1);
> 3. write(fd, "bar", 3)
> 4. fstat(fd, &buf2);
> 5. fstat(fd, &buf3);
>
> then the requirement translates into the following:
> * The timestamps returned in 'buf1' should differ from the timestamps
> returned in 'buf2'.
> * However 'buf2' and 'buf3' must contain the same values.
>
> The flush performed by the NFS client in order to service the stat() in
> (4) ensures that the timestamps get updated on the server (as part of
> servicing the WRITE rpc call), which normally ensures that their values
> won't change later on.
>
> Note, however, that the client does cheat a little by only flushing the
> WRITEs, and by not waiting for a COMMIT to be issued. That is generally
> safe, but it means that the server could reboot before we have time to
> persist the data+metadata. If that were to happen, then the replay of
> the WRITEs might indeed cause the timestamps to change in (5) (in
> contravention of POSIX rules).
>
> Cheers
> Trond
>
> --
> Trond Myklebust
> Linux NFS client maintainer, PrimaryData
> [email protected]
On Fri, 2017-12-22 at 09:35 +1100, NeilBrown wrote:
> On Thu, Dec 21 2017, Trond Myklebust wrote:
>
> > On Fri, 2017-12-22 at 07:59 +1100, NeilBrown wrote:
> > > On Thu, Dec 21 2017, Trond Myklebust wrote:
> > >
> > > > On Thu, 2017-12-21 at 10:39 -0500, Chuck Lever wrote:
> > > > > Hi Neil-
> > > > >
> > > > >
> > > > > > On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]>
> > > > > > wrote:
> > > > > >
> > > > > >
> > > > > > When an i_op->getattr() call is made on an NFS file
> > > > > > (typically from a 'stat' family system call), NFS
> > > > > > will first flush any dirty data to the server.
> > > > > >
> > > > > > This ensures that the mtime reported is correct and stable,
> > > > > > but has a performance penalty. 'stat' is normally thought
> > > > > > to be a quick operation, and imposing this cost can be
> > > > > > surprising.
> > > > >
> > > > > To be clear, this behavior is a POSIX requirement.
> > > > >
> > > > >
> > > > > > I have seen problems when one process is writing a large
> > > > > > file and another process performs "ls -l" on the containing
> > > > > > directory and is blocked for as long as it take to flush
> > > > > > all the dirty data to the server, which can be minutes.
> > > > >
> > > > > Yes, a well-known annoyance that cannot be addressed
> > > > > even with a write delegation.
> > > > >
> > > > >
> > > > > > I have also seen a legacy application which frequently
> > > > > > calls
> > > > > > "fstat" on a file that it is writing to. On a local
> > > > > > filesystem (and in the Solaris implementation of NFS) this
> > > > > > fstat call is cheap. On Linux/NFS, the causes a noticeable
> > > > > > decrease in throughput.
> > > > >
> > > > > If the preceding write is small, Linux could be using
> > > > > a FILE_SYNC write, but Solaris could be using UNSTABLE.
> > > > >
> > > > >
> > > > > > The only circumstances where an application calling
> > > > > > 'stat()'
> > > > > > might get an mtime which is not stable are times when some
> > > > > > other process is writing to the file and the two processes
> > > > > > are not using locking to ensure consistency, or when the
> > > > > > one
> > > > > > process is both writing and stating. In neither of these
> > > > > > cases is it reasonable to expect the mtime to be stable.
> > > > >
> > > > > I'm not convinced this is a strong enough rationale
> > > > > for claiming it is safe to disable the existing
> > > > > behavior.
> > > > >
> > > > > You've explained cases where the new behavior is
> > > > > reasonable, but do you have any examples where the
> > > > > new behavior would be a problem? There must be a
> > > > > reason why POSIX explicitly requires an up-to-date
> > > > > mtime.
> > > > >
> > > > > What guidance would nfs(5) give on when it is safe
> > > > > to specify the new mount option?
> > > > >
> > > > >
> > > > > > In the most common cases where mtime is important
> > > > > > (e.g. make), no other process has the file open, so there
> > > > > > will be no dirty data and the mtime will be stable.
> > > > >
> > > > > Isn't it also the case that make is a multi-process
> > > > > workload where one process modifies a file, then
> > > > > closes it (which triggers a flush), and then another
> > > > > process stats the file? The new mount option does
> > > > > not change the behavior of close(2), does it?
> > > > >
> > > > >
> > > > > > Rather than unilaterally changing this behavior of 'stat',
> > > > > > this patch adds a "nosyncflush" mount option to allow
> > > > > > sysadmins to have applications which are hurt by the
> > > > > > current
> > > > > > behavior to disable it.
> > > > >
> > > > > IMO a mount option is at the wrong granularity. A
> > > > > mount point will be shared between applications that
> > > > > can tolerate the non-POSIX behavior and those that
> > > > > cannot, for instance.
> > > >
> > > > Agreed.
> > > >
> > > > The other thing to note here is that we now have an embryonic
> > > > statx()
> > > > system call, which allows the application itself to decide
> > > > whether
> > > > or
> > > > not it needs up to date values for the atime/ctime/mtime. While
> > > > we
> > > > haven't yet plumbed in the NFS side, the intention was always
> > > > to
> > > > use
> > > > that information to turn off the writeback flushing when
> > > > possible.
> > >
> > > Yes, if statx() were actually working, we could change the
> > > application
> > > to avoid the flush. But then if changing the application were an
> > > option, I suspect that - for my current customer issue - we could
> > > just
> > > remove the fstat() calls. I doubt they are really necessary.
> > > I think programmers often think of stat() (and particularly
> > > fstat())
> > > as
> > > fairly cheap and so they use it whenever convenient. Only NFS
> > > violates
> > > this expectation.
> > >
> > > Also statx() is only a real solution if/when it gets widely
> > > used. Will
> > > "ls -l" default to AT_STATX_DONT_SYNC ??
> > >
> > > Apart from the Posix requirement (which only requires that the
> > > timestamps be updated, not that the data be flushed), do you know
> > > of
> > > any
> > > value gained from flushing data before stat()?
> > >
> >
> > POSIX requires that timestamps change as part of the read() or
> > write()
> > system call.
>
> Does it require that they don't change at other times?
Yes.
> I see the (arguable) deviation from POSIX that I propose to be
> completely inline with the close-to-open consistency semantics that
> NFS
> already uses, which are weaker than what POSIX might suggest.
>
> As you didn't exactly answer the question, would it be fair to say
> that
> you don't know of any reason to flush-before-stat except that POSIX
> seems to require it?
The reason is to emulate POSIX semantics. That is the only reason, and
that is why when we added a statx() function, I asked that we allow the
application to specify that it doesn't need these POSIX semantics.
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
> On Dec 21, 2017, at 3:51 PM, NeilBrown <[email protected]> wrote:
>
> On Thu, Dec 21 2017, Chuck Lever wrote:
>
>> Hi Neil-
>>
>>
>>> On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]> wrote:
>>>
>>>
>>> When an i_op->getattr() call is made on an NFS file
>>> (typically from a 'stat' family system call), NFS
>>> will first flush any dirty data to the server.
>>>
>>> This ensures that the mtime reported is correct and stable,
>>> but has a performance penalty. 'stat' is normally thought
>>> to be a quick operation, and imposing this cost can be
>>> surprising.
>>
>> To be clear, this behavior is a POSIX requirement.
>
> Ah, that would be:
>
> http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_09
>
> which says:
>
> All timestamps that are marked for update shall be updated when the
> file ceases to be open by any process or before a fstat(), fstatat(),
> fsync(), futimens(), lstat(), stat(), utime(), utimensat(), or
> utimes() is successfully performed on the file.
>
> Suppose that when handling a stat(), if we find there are dirty pages,
> we send a SETATTR to the server to set the mtime to the current time,
> and use the result for the mtime? That would avoid the costly flush, but still
> give the appearance of the mtime being correctly updated.
>
>>
>>
>>> I have seen problems when one process is writing a large
>>> file and another process performs "ls -l" on the containing
>>> directory and is blocked for as long as it take to flush
>>> all the dirty data to the server, which can be minutes.
>>
>> Yes, a well-known annoyance that cannot be addressed
>> even with a write delegation.
>>
>>
>>> I have also seen a legacy application which frequently calls
>>> "fstat" on a file that it is writing to. On a local
>>> filesystem (and in the Solaris implementation of NFS) this
>>> fstat call is cheap. On Linux/NFS, the causes a noticeable
>>> decrease in throughput.
>>
>> If the preceding write is small, Linux could be using
>> a FILE_SYNC write, but Solaris could be using UNSTABLE.
>
> No, what is actually happening is that Linux is flushing out data and
> Solaris isn't. There are differences in stability and in write size,
> but they aren't the main contributors to the performance difference.
>
>>
>>
>>> The only circumstances where an application calling 'stat()'
>>> might get an mtime which is not stable are times when some
>>> other process is writing to the file and the two processes
>>> are not using locking to ensure consistency, or when the one
>>> process is both writing and stating. In neither of these
>>> cases is it reasonable to expect the mtime to be stable.
>>
>> I'm not convinced this is a strong enough rationale
>> for claiming it is safe to disable the existing
>> behavior.
>>
>> You've explained cases where the new behavior is
>> reasonable, but do you have any examples where the
>> new behavior would be a problem? There must be a
>> reason why POSIX explicitly requires an up-to-date
>> mtime.
>
> I honestly cannot think of any credible scenario where the current
> behaviour would be required.
I can't either, but:
- We have years of this behavior in place
- It is required by an aged standard
I don't mind creative thinking, but perhaps we do
need a higher bar than "can you remember why we do
this? <shrug>" before mitigating this particular
behavior. :-)
> If I write to a file from one NFS client, and request a stat() from
> another NFS client, the stat doesn't flush any data (though it could
> with NFSv4 if write delegations were being given). So depending on a
> stat() returning precise timestamps when there is no other co-ordination
> between processes will already fail. Why do we need to provide a
> guarantee to processes running on the same client that we don't provide
> to processes running on different clients?
When multiple clients are in play, as you pointed out
earlier, we do require explicit locking for correct
behavior. This usage scenario is clearly one that can
be done only in the context of a distributed file
system (ie., not on non-clustered local file systems).
Thus explicit locking is a tolerable and expected
requirement.
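(As a simplified illustration of the kind of explicit locking meant
here, a writer on one client might wrap its updates as below; on NFS,
releasing the lock flushes dirty data, and a later lock taken on
another client revalidates that client's cache. Error handling is
omitted.)

/* Whole-file POSIX lock around an update. */
#include <fcntl.h>
#include <unistd.h>

static void lock_whole_file(int fd, short type)
{
	struct flock fl = { .l_type = type, .l_whence = SEEK_SET };
	/* l_start and l_len left at 0: lock the whole file */
	fcntl(fd, F_SETLKW, &fl);
}

void locked_update(int fd, const void *buf, size_t len)
{
	lock_whole_file(fd, F_WRLCK);
	write(fd, buf, len);
	lock_whole_file(fd, F_UNLCK);	/* unlock: dirty data goes to the server */
}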
>> What guidance would nfs(5) give on when it is safe
>> to specify the new mount option?
>
> That is an excellent question to which I don't have an excellent answer.
>
> nostatflush: A strict reading of the Posix specification requires that
> any cached writes be flushed to the server before responding to
> stat() and related operations, so that the timestamps are accurate
> and stable. NFS does not guarantee this between applications on
> different clients, but does when the writer and the caller of stat()
> are on the same host. This flush can sometimes negatively impact
> performance. Specifying the nostatflush option causes NFS to ignore
> this requirement of Posix. A stat call when there is dirty data may
> report a different modify timestamp to the one that would be reported
> after that data was subsequently flushed to the server. There are no
> known use cases where this inconsistency would cause a problem, but as
> avoiding it is a Posix requirement, the default behavior to force a
> flush on every stat() call.
>
> ???
>
>
>
>>
>>
>>> In the most common cases where mtime is important
>>> (e.g. make), no other process has the file open, so there
>>> will be no dirty data and the mtime will be stable.
>>
>> Isn't it also the case that make is a multi-process
>> workload where one process modifies a file, then
>> closes it (which triggers a flush), and then another
>> process stats the file? The new mount option does
>> not change the behavior of close(2), does it?
>
> No, the mount option doesn't change the behaviour of close(2).
> A separate mount option (nocto) can do that.
>
> I think your point is that close() can be delayed by flush in the same
> way that stat() can. I think that is true, but not very relevant.
Agreed, it isn't relevant. So I'm wondering why we
should consider "make" at all in this case.
> stat() can be called more often that close() is likely to be, and
> close() only imposes a delay on the process that did the writing, after
> it has finished writing.
I don't believe that's true. The file's data cache is
shared across all applications on a client, and a
writeback on close is done so that a subsequent GETATTR
reflects information about the file that can be used to
verify the cached data the next time an application
on this client opens this file (close-to-open).
For NFSv4, of course, the client must flush dirty
data before it relinquishes the valid state ID at
CLOSE time.
> stat() can impose a delay on arbitrary other
> processes and at other times.
Eliminating unexpected latency is a noble goal. However,
write(2) can occasionally take a long time when the
application has unintentionally exceeded a dirty limit
for example.
More frequent writeback is part of the "sharing tax"
that network file systems have to pay. It can be
reduced somewhat in NFSv4 with delegation, though a
file's current mtime is still managed by the physical
file system on the NFS server.
>>> Rather than unilaterally changing this behavior of 'stat',
>>> this patch adds a "nosyncflush" mount option to allow
>>> sysadmins to have applications which are hurt by the current
>>> behavior to disable it.
>>
>> IMO a mount option is at the wrong granularity. A
>> mount point will be shared between applications that
>> can tolerate the non-POSIX behavior and those that
>> cannot, for instance.
>
> It is a better granularity than a module parameter, which was my first
> approach ;-)
>
> Yes, if we could create a control-group which avoided flush-on-stat, and
> ran the problematic programs in that control group, that might be ideal
> ... for some values of "ideal".
>
> You *could* mount the same filesystem in two places with nosharecache
> and with only one having nosyncflush. But you probably wouldn't.
>> I would rather see us address the underlying cause
>> of the delay, which is that the GETATTR gets starved
>> by an ongoing stream of WRITE requests. You could:
>
> I don't think starving is the issue.
> Of the two specific cases that I have seen,
> in one the issue was that the dirty_ratio multiplied by the amount of
> memory, divided by the throughput to the server, resulted in many
> minutes to flush out all the dirty data. Having "ls -l" wait for that
> flush was quite inconvenient. (We asked the customer to tune dirty_ratio
> way down).
This is the classic remedy for this problem. The default
dirty_ratio setting is antique for contemporary hardware.
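For instance, capping the amount of dirty page cache in absolute
terms rather than as a ratio of memory (the values below are purely
illustrative, not a recommendation -- they need to be sized to the
client's memory and the link to the server):

    sysctl -w vm.dirty_background_bytes=67108864
    sysctl -w vm.dirty_bytes=268435456

That bounds how much data a stat()-triggered flush can ever have to
push, at the cost of more frequent writeback.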
> In the other (more recent) case the file being written was some sort of
> data-base being restored from a dump (or something like that). So some
> blocks (presumably indexing blocks) were being written multiple times.
> When we disabled flush-on-sync
flush-on-sync? That seems like a good thing! Do you mean
flush-on-getattr?
> the total number of bytes written went
> down by about 20% (vague memory .. certainly a non-trivial amount)
> because these index blocks were only written to the server once instead
> of multiple times. So that flush-on-stat slowed down throughput in part
> by pushing more data over the wire.
That behavior is disappointing, but arguments that the
application should be modified to address this issue (use
statx or avoid stat(2)) are convincing to me.
>> - Make active writers wait for the GETATTR so the
>> GETATTR is not starved
>>
>> - Start flushing dirty pages earlier so there is
>> less to flush when a stat(2) is done
>>
>> - Ensure that GETATTR is not also waiting for a
>> COMMIT
>>
>> Or maybe there's some other problem?
>>
>> I recall nfs_getattr used to grab i_mutex to hold
>> up active writers. But i_mutex is gone now. Is
>> there some other mechanism that can block writers
>> while the client flushes the file and handles the
>> GETATTR request?
>
> I vaguely remember when i_mutex locking was added to nfs_getattr to
> avoid the starvation :-)
> Today nfs_getattr() calls filemap_write_and_wait() which first triggers
> writes on all dirty pages, then waits for all writeback pages.
> The first stage should be fairly quick, and writes during the second
> stage shouldn't slow it down. If some process were calling fsync often
> (e.g. using O_SYNC) that might starve nfs_getattr(), but I don't think
> normal usage would.
>
> I do agree that a "real" fix would be better than a mount option.
> The only path to a "real" fix that I can think of is to provide some
> alternate source of producing a credible mtime without flushing all the
> data.
Perhaps, but flushing sooner will get us most of the way
there IMO, and seems more straightforward than playing
merry mtime games.
Jens Axboe had some writeback improvements that might
make it possible to start writeback sooner, depending on
the aggregate writeback throughput limit of the backing
storage. Not sure these were ever applied to the NFS
client.
The downside to starting writeback sooner is that an
application that uses a file as temporary rather than
persistent storage might be penalized (similar to the
second scenario you mention above).
> Might a SETATTR using SET_TO_SERVER_TIME be a workable way forward?
I thought about this, but here we risk the following odd
behavior:
1. The application calls write(2), and the client caches
the dirtied pages.
2. The application calls stat(2), the client explicitly
updates the mtime, then does a GETATTR returning mtime "A"
to the application.
3. The client later flushes the dirty data, and the
server updates the mtime again to mtime "B"
4. The application does another stat(2), the client
emits another GETATTR, and sees mtime "B" even though
there have been no additional write(2) calls on that
file.
That might be surprising to applications that, like the
client itself, rely on the file's mtime to detect outside
modification of the file. And it certainly violates POSIX.
Perhaps we could add a mechanism to the NFS protocol that
allows a client to send a WRITE that does not update the
file's mtime while holding a write delegation. In other
words, introduce a mode where the client rather than the
server manages a file's mtime.
Maybe pNFS block layout would allow this already?
> Thanks for your thoughts!
>
> NeilBrown
>
>>
>>
>>> Note that this option should probably *not* be used together
>>> with "nocto". In that case, mtime could be unstable even
>>> when no process has the file open.
>>>
>>> Signed-off-by: NeilBrown <[email protected]>
>>> ---
>>> fs/nfs/inode.c | 3 ++-
>>> fs/nfs/super.c | 10 ++++++++++
>>> include/uapi/linux/nfs_mount.h | 6 ++++--
>>> 3 files changed, 16 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>>> index b992d2382ffa..16629a34dd62 100644
>>> --- a/fs/nfs/inode.c
>>> +++ b/fs/nfs/inode.c
>>> @@ -740,7 +740,8 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
>>>
>>> trace_nfs_getattr_enter(inode);
>>> /* Flush out writes to the server in order to update c/mtime. */
>>> - if (S_ISREG(inode->i_mode)) {
>>> + if (S_ISREG(inode->i_mode) &&
>>> + !(NFS_SERVER(inode)->flags & NFS_MOUNT_NOSTATFLUSH)) {
>>> err = filemap_write_and_wait(inode->i_mapping);
>>> if (err)
>>> goto out;
>>> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
>>> index 29bacdc56f6a..2351c0be98f5 100644
>>> --- a/fs/nfs/super.c
>>> +++ b/fs/nfs/super.c
>>> @@ -90,6 +90,7 @@ enum {
>>> Opt_resvport, Opt_noresvport,
>>> Opt_fscache, Opt_nofscache,
>>> Opt_migration, Opt_nomigration,
>>> + Opt_statflush, Opt_nostatflush,
>>>
>>> /* Mount options that take integer arguments */
>>> Opt_port,
>>> @@ -151,6 +152,8 @@ static const match_table_t nfs_mount_option_tokens = {
>>> { Opt_nofscache, "nofsc" },
>>> { Opt_migration, "migration" },
>>> { Opt_nomigration, "nomigration" },
>>> + { Opt_statflush, "statflush" },
>>> + { Opt_nostatflush, "nostatflush" },
>>>
>>> { Opt_port, "port=%s" },
>>> { Opt_rsize, "rsize=%s" },
>>> @@ -637,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
>>> { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
>>> { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
>>> { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
>>> + { NFS_MOUNT_NOSTATFLUSH, ",nostatflush", "" },
>>> { 0, NULL, NULL }
>>> };
>>> const struct proc_nfs_info *nfs_infop;
>>> @@ -1334,6 +1338,12 @@ static int nfs_parse_mount_options(char *raw,
>>> case Opt_nomigration:
>>> mnt->options &= ~NFS_OPTION_MIGRATION;
>>> break;
>>> + case Opt_statflush:
>>> + mnt->flags &= ~NFS_MOUNT_NOSTATFLUSH;
>>> + break;
>>> + case Opt_nostatflush:
>>> + mnt->flags |= NFS_MOUNT_NOSTATFLUSH;
>>> + break;
>>>
>>> /*
>>> * options that take numeric values
>>> diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h
>>> index e44e00616ab5..d7c6f809d25d 100644
>>> --- a/include/uapi/linux/nfs_mount.h
>>> +++ b/include/uapi/linux/nfs_mount.h
>>> @@ -72,7 +72,9 @@ struct nfs_mount_data {
>>> #define NFS_MOUNT_NORESVPORT 0x40000
>>> #define NFS_MOUNT_LEGACY_INTERFACE 0x80000
>>>
>>> -#define NFS_MOUNT_LOCAL_FLOCK 0x100000
>>> -#define NFS_MOUNT_LOCAL_FCNTL 0x200000
>>> +#define NFS_MOUNT_LOCAL_FLOCK 0x100000
>>> +#define NFS_MOUNT_LOCAL_FCNTL 0x200000
>>> +
>>> +#define NFS_MOUNT_NOSTATFLUSH 0x400000
>>>
>>> #endif
>>> --
>>> 2.14.0.rc0.dirty
>>>
>>
>> --
>> Chuck Lever
--
Chuck Lever
On Fri, 2017-12-22 at 07:59 +1100, NeilBrown wrote:
> On Thu, Dec 21 2017, Trond Myklebust wrote:
>
> > On Thu, 2017-12-21 at 10:39 -0500, Chuck Lever wrote:
> > > Hi Neil-
> > >
> > >
> > > > On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]> wrote:
> > > >
> > > >
> > > > When an i_op->getattr() call is made on an NFS file
> > > > (typically from a 'stat' family system call), NFS
> > > > will first flush any dirty data to the server.
> > > >
> > > > This ensures that the mtime reported is correct and stable,
> > > > but has a performance penalty. 'stat' is normally thought
> > > > to be a quick operation, and imposing this cost can be
> > > > surprising.
> > >
> > > To be clear, this behavior is a POSIX requirement.
> > >
> > >
> > > > I have seen problems when one process is writing a large
> > > > file and another process performs "ls -l" on the containing
> > > > directory and is blocked for as long as it take to flush
> > > > all the dirty data to the server, which can be minutes.
> > >
> > > Yes, a well-known annoyance that cannot be addressed
> > > even with a write delegation.
> > >
> > >
> > > > I have also seen a legacy application which frequently calls
> > > > "fstat" on a file that it is writing to. On a local
> > > > filesystem (and in the Solaris implementation of NFS) this
> > > > fstat call is cheap. On Linux/NFS, the causes a noticeable
> > > > decrease in throughput.
> > >
> > > If the preceding write is small, Linux could be using
> > > a FILE_SYNC write, but Solaris could be using UNSTABLE.
> > >
> > >
> > > > The only circumstances where an application calling 'stat()'
> > > > might get an mtime which is not stable are times when some
> > > > other process is writing to the file and the two processes
> > > > are not using locking to ensure consistency, or when the one
> > > > process is both writing and stating. In neither of these
> > > > cases is it reasonable to expect the mtime to be stable.
> > >
> > > I'm not convinced this is a strong enough rationale
> > > for claiming it is safe to disable the existing
> > > behavior.
> > >
> > > You've explained cases where the new behavior is
> > > reasonable, but do you have any examples where the
> > > new behavior would be a problem? There must be a
> > > reason why POSIX explicitly requires an up-to-date
> > > mtime.
> > >
> > > What guidance would nfs(5) give on when it is safe
> > > to specify the new mount option?
> > >
> > >
> > > > In the most common cases where mtime is important
> > > > (e.g. make), no other process has the file open, so there
> > > > will be no dirty data and the mtime will be stable.
> > >
> > > Isn't it also the case that make is a multi-process
> > > workload where one process modifies a file, then
> > > closes it (which triggers a flush), and then another
> > > process stats the file? The new mount option does
> > > not change the behavior of close(2), does it?
> > >
> > >
> > > > Rather than unilaterally changing this behavior of 'stat',
> > > > this patch adds a "nosyncflush" mount option to allow
> > > > sysadmins to have applications which are hurt by the current
> > > > behavior to disable it.
> > >
> > > IMO a mount option is at the wrong granularity. A
> > > mount point will be shared between applications that
> > > can tolerate the non-POSIX behavior and those that
> > > cannot, for instance.
> >
> > Agreed.
> >
> > The other thing to note here is that we now have an embryonic statx()
> > system call, which allows the application itself to decide whether or
> > not it needs up to date values for the atime/ctime/mtime. While we
> > haven't yet plumbed in the NFS side, the intention was always to use
> > that information to turn off the writeback flushing when possible.
>
> Yes, if statx() were actually working, we could change the application
> to avoid the flush. But then if changing the application were an
> option, I suspect that - for my current customer issue - we could just
> remove the fstat() calls. I doubt they are really necessary.
> I think programmers often think of stat() (and particularly fstat()) as
> fairly cheap and so they use it whenever convenient. Only NFS violates
> this expectation.
>
> Also statx() is only a real solution if/when it gets widely used. Will
> "ls -l" default to AT_STATX_DONT_SYNC ??
>
Maybe. Eventually, I could see glibc converting normal stat/fstat/etc.
to use a statx() syscall under the hood (similar to how stat syscalls on
32-bit arches will use stat64 in most cases).
With that, we could look at any number of ways to sneak a "don't flush"
flag into the call. Maybe an environment variable that causes the stat
syscall wrapper to add it? I think there are possibilities there that
don't necessarily require recompiling applications.
> Apart from the Posix requirement (which only requires that the
> timestamps be updated, not that the data be flushed), do you know of any
> value gained from flushing data before stat()?
>
--
Jeff Layton <[email protected]>
On Sat, Dec 23 2017, Jeff Layton wrote:
> On Fri, 2017-12-22 at 07:59 +1100, NeilBrown wrote:
>> On Thu, Dec 21 2017, Trond Myklebust wrote:
>>
>> > On Thu, 2017-12-21 at 10:39 -0500, Chuck Lever wrote:
>> > > Hi Neil-
>> > >
>> > >
>> > > > On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]> wrote:
>> > > >
>> > > >
>> > > > When an i_op->getattr() call is made on an NFS file
>> > > > (typically from a 'stat' family system call), NFS
>> > > > will first flush any dirty data to the server.
>> > > >
>> > > > This ensures that the mtime reported is correct and stable,
>> > > > but has a performance penalty. 'stat' is normally thought
>> > > > to be a quick operation, and imposing this cost can be
>> > > > surprising.
>> > >
>> > > To be clear, this behavior is a POSIX requirement.
>> > >
>> > >
>> > > > I have seen problems when one process is writing a large
>> > > > file and another process performs "ls -l" on the containing
>> > > > directory and is blocked for as long as it take to flush
>> > > > all the dirty data to the server, which can be minutes.
>> > >
>> > > Yes, a well-known annoyance that cannot be addressed
>> > > even with a write delegation.
>> > >
>> > >
>> > > > I have also seen a legacy application which frequently calls
>> > > > "fstat" on a file that it is writing to. On a local
>> > > > filesystem (and in the Solaris implementation of NFS) this
>> > > > fstat call is cheap. On Linux/NFS, the causes a noticeable
>> > > > decrease in throughput.
>> > >
>> > > If the preceding write is small, Linux could be using
>> > > a FILE_SYNC write, but Solaris could be using UNSTABLE.
>> > >
>> > >
>> > > > The only circumstances where an application calling 'stat()'
>> > > > might get an mtime which is not stable are times when some
>> > > > other process is writing to the file and the two processes
>> > > > are not using locking to ensure consistency, or when the one
>> > > > process is both writing and stating. In neither of these
>> > > > cases is it reasonable to expect the mtime to be stable.
>> > >
>> > > I'm not convinced this is a strong enough rationale
>> > > for claiming it is safe to disable the existing
>> > > behavior.
>> > >
>> > > You've explained cases where the new behavior is
>> > > reasonable, but do you have any examples where the
>> > > new behavior would be a problem? There must be a
>> > > reason why POSIX explicitly requires an up-to-date
>> > > mtime.
>> > >
>> > > What guidance would nfs(5) give on when it is safe
>> > > to specify the new mount option?
>> > >
>> > >
>> > > > In the most common cases where mtime is important
>> > > > (e.g. make), no other process has the file open, so there
>> > > > will be no dirty data and the mtime will be stable.
>> > >
>> > > Isn't it also the case that make is a multi-process
>> > > workload where one process modifies a file, then
>> > > closes it (which triggers a flush), and then another
>> > > process stats the file? The new mount option does
>> > > not change the behavior of close(2), does it?
>> > >
>> > >
>> > > > Rather than unilaterally changing this behavior of 'stat',
>> > > > this patch adds a "nosyncflush" mount option to allow
>> > > > sysadmins to have applications which are hurt by the current
>> > > > behavior to disable it.
>> > >
>> > > IMO a mount option is at the wrong granularity. A
>> > > mount point will be shared between applications that
>> > > can tolerate the non-POSIX behavior and those that
>> > > cannot, for instance.
>> >
>> > Agreed.
>> >
>> > The other thing to note here is that we now have an embryonic statx()
>> > system call, which allows the application itself to decide whether or
>> > not it needs up to date values for the atime/ctime/mtime. While we
>> > haven't yet plumbed in the NFS side, the intention was always to use
>> > that information to turn off the writeback flushing when possible.
>>
>> Yes, if statx() were actually working, we could change the application
>> to avoid the flush. But then if changing the application were an
>> option, I suspect that - for my current customer issue - we could just
>> remove the fstat() calls. I doubt they are really necessary.
>> I think programmers often think of stat() (and particularly fstat()) as
>> fairly cheap and so they use it whenever convenient. Only NFS violates
>> this expectation.
>>
>> Also statx() is only a real solution if/when it gets widely used. Will
>> "ls -l" default to AT_STATX_DONT_SYNC ??
>>
>
> Maybe. Eventually, I could see glibc converting normal stat/fstat/etc.
> to use a statx() syscall under the hood (similar to how stat syscalls on
> 32-bit arches will use stat64 in most cases).
>
> With that, we could look at any number of ways to sneak a "don't flush"
> flag into the call. Maybe an environment variable that causes the stat
> syscall wrapper to add it? I think there are possibilities there that
> don't necessarily require recompiling applications.
Thanks - interesting ideas.
One possibility would be an LD_PRELOAD which implements fstat() using
statx().
That doesn't address the "ls -l is needlessly slow" problem, but it
would address the "legacy application calls fstat too often" problem.
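Something like the following, very roughly (an untested sketch only;
it assumes a glibc new enough to expose the statx() wrapper, glosses
over the fact that on older glibc the symbol to interpose is
__fxstat() rather than fstat(), and the file name and build command
are just illustrative):

    /* fstat-nosync.c:
     *   gcc -shared -fPIC -o fstat-nosync.so fstat-nosync.c -ldl
     * then run the legacy application with LD_PRELOAD=./fstat-nosync.so
     */
    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <fcntl.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>

    int fstat(int fd, struct stat *st)
    {
            struct statx stx;

            /* Ask only for cached attributes so the NFS client does
             * not flush dirty data before answering. */
            if (statx(fd, "", AT_EMPTY_PATH | AT_STATX_DONT_SYNC,
                      STATX_BASIC_STATS, &stx) < 0) {
                    /* Fall back to the real fstat() on any failure. */
                    int (*real_fstat)(int, struct stat *) =
                            dlsym(RTLD_NEXT, "fstat");
                    return real_fstat(fd, st);
            }

            /* Translate the statx result into a struct stat. */
            memset(st, 0, sizeof(*st));
            st->st_dev = makedev(stx.stx_dev_major, stx.stx_dev_minor);
            st->st_ino = stx.stx_ino;
            st->st_mode = stx.stx_mode;
            st->st_nlink = stx.stx_nlink;
            st->st_uid = stx.stx_uid;
            st->st_gid = stx.stx_gid;
            st->st_size = stx.stx_size;
            st->st_blksize = stx.stx_blksize;
            st->st_blocks = stx.stx_blocks;
            st->st_atim.tv_sec = stx.stx_atime.tv_sec;
            st->st_atim.tv_nsec = stx.stx_atime.tv_nsec;
            st->st_mtim.tv_sec = stx.stx_mtime.tv_sec;
            st->st_mtim.tv_nsec = stx.stx_mtime.tv_nsec;
            st->st_ctim.tv_sec = stx.stx_ctime.tv_sec;
            st->st_ctim.tv_nsec = stx.stx_ctime.tv_nsec;
            return 0;
    }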
This isn't an option for the "enterprise kernel" the customer is using
(statx? what is statx?), but having a clear view of a credible
upstream solution is very helpful.
So thanks - and thanks a lot to Trond and Chuck for your input. It
helped clarify my thoughts a lot.
Is anyone working on proper statx support for NFS, or is it a case of
"that shouldn't be hard and we should do that, but it isn't a high
priority for anyone" ??
Thanks,
NeilBrown
>
>> Apart from the Posix requirement (which only requires that the
>> timestamps be updated, not that the data be flushed), do you know of any
>> value gained from flushing data before stat()?
>>
> --
> Jeff Layton <[email protected]>
Hi Neil,
On Tue, 2018-01-02 at 10:29 +1100, NeilBrown wrote:
> On Sat, Dec 23 2017, Jeff Layton wrote:
>
> > On Fri, 2017-12-22 at 07:59 +1100, NeilBrown wrote:
> > > On Thu, Dec 21 2017, Trond Myklebust wrote:
> > >
> > > > On Thu, 2017-12-21 at 10:39 -0500, Chuck Lever wrote:
> > > > > Hi Neil-
> > > > >
> > > > >
> > > > > > On Dec 20, 2017, at 9:57 PM, NeilBrown <[email protected]>
> > > > > > wrote:
> > > > > >
> > > > > >
> > > > > > When an i_op->getattr() call is made on an NFS file
> > > > > > (typically from a 'stat' family system call), NFS
> > > > > > will first flush any dirty data to the server.
> > > > > >
> > > > > > This ensures that the mtime reported is correct and stable,
> > > > > > but has a performance penalty. 'stat' is normally thought
> > > > > > to be a quick operation, and imposing this cost can be
> > > > > > surprising.
> > > > >
> > > > > To be clear, this behavior is a POSIX requirement.
> > > > >
> > > > >
> > > > > > I have seen problems when one process is writing a large
> > > > > > file and another process performs "ls -l" on the containing
> > > > > > directory and is blocked for as long as it take to flush
> > > > > > all the dirty data to the server, which can be minutes.
> > > > >
> > > > > Yes, a well-known annoyance that cannot be addressed
> > > > > even with a write delegation.
> > > > >
> > > > >
> > > > > > I have also seen a legacy application which frequently
> > > > > > calls
> > > > > > "fstat" on a file that it is writing to. On a local
> > > > > > filesystem (and in the Solaris implementation of NFS) this
> > > > > > fstat call is cheap. On Linux/NFS, the causes a noticeable
> > > > > > decrease in throughput.
> > > > >
> > > > > If the preceding write is small, Linux could be using
> > > > > a FILE_SYNC write, but Solaris could be using UNSTABLE.
> > > > >
> > > > >
> > > > > > The only circumstances where an application calling
> > > > > > 'stat()'
> > > > > > might get an mtime which is not stable are times when some
> > > > > > other process is writing to the file and the two processes
> > > > > > are not using locking to ensure consistency, or when the
> > > > > > one
> > > > > > process is both writing and stating. In neither of these
> > > > > > cases is it reasonable to expect the mtime to be stable.
> > > > >
> > > > > I'm not convinced this is a strong enough rationale
> > > > > for claiming it is safe to disable the existing
> > > > > behavior.
> > > > >
> > > > > You've explained cases where the new behavior is
> > > > > reasonable, but do you have any examples where the
> > > > > new behavior would be a problem? There must be a
> > > > > reason why POSIX explicitly requires an up-to-date
> > > > > mtime.
> > > > >
> > > > > What guidance would nfs(5) give on when it is safe
> > > > > to specify the new mount option?
> > > > >
> > > > >
> > > > > > In the most common cases where mtime is important
> > > > > > (e.g. make), no other process has the file open, so there
> > > > > > will be no dirty data and the mtime will be stable.
> > > > >
> > > > > Isn't it also the case that make is a multi-process
> > > > > workload where one process modifies a file, then
> > > > > closes it (which triggers a flush), and then another
> > > > > process stats the file? The new mount option does
> > > > > not change the behavior of close(2), does it?
> > > > >
> > > > >
> > > > > > Rather than unilaterally changing this behavior of 'stat',
> > > > > > this patch adds a "nosyncflush" mount option to allow
> > > > > > sysadmins to have applications which are hurt by the
> > > > > > current
> > > > > > behavior to disable it.
> > > > >
> > > > > IMO a mount option is at the wrong granularity. A
> > > > > mount point will be shared between applications that
> > > > > can tolerate the non-POSIX behavior and those that
> > > > > cannot, for instance.
> > > >
> > > > Agreed.
> > > >
> > > > The other thing to note here is that we now have an embryonic
> > > > statx()
> > > > system call, which allows the application itself to decide
> > > > whether or
> > > > not it needs up to date values for the atime/ctime/mtime. While
> > > > we
> > > > haven't yet plumbed in the NFS side, the intention was always
> > > > to use
> > > > that information to turn off the writeback flushing when
> > > > possible.
> > >
> > > Yes, if statx() were actually working, we could change the
> > > application
> > > to avoid the flush. But then if changing the application were an
> > > option, I suspect that - for my current customer issue - we could
> > > just
> > > remove the fstat() calls. I doubt they are really necessary.
> > > I think programmers often think of stat() (and particularly
> > > fstat()) as
> > > fairly cheap and so they use it whenever convenient. Only NFS
> > > violates
> > > this expectation.
> > >
> > > Also statx() is only a real solution if/when it gets widely
> > > used. Will
> > > "ls -l" default to AT_STATX_DONT_SYNC ??
> > >
> >
> > Maybe. Eventually, I could see glibc converting normal
> > stat/fstat/etc.
> > to use a statx() syscall under the hood (similar to how stat
> > syscalls on
> > 32-bit arches will use stat64 in most cases).
> >
> > With that, we could look at any number of ways to sneak a "don't
> > flush"
> > flag into the call. Maybe an environment variable that causes the
> > stat
> > syscall wrapper to add it? I think there are possibilities there
> > that
> > don't necessarily require recompiling applications.
>
> Thanks - interesting ideas.
>
> One possibility would be an LD_PRELOAD which implements fstat() using
> statx().
> That doesn't address the "ls -l is needlessly slow" problem, but it
> would address the "legacy application calls fstat too often" problem.
>
> This isn't an option for the "enterprise kernel" the customer is
> using
> (statx? what is statx?), but having a clear view of a credible
> upstream solution is very helpful.
>
> So thanks - and thanks a lot to Trond and Chuck for your input. It
> helped clarify my thoughts a lot.
>
> Is anyone working on proper statx support for NFS, or is it a case of
> "that shouldn't be hard and we should do that, but it isn't a high
> priority for anyone" ??
How about something like the following?
Cheers
Trond
8<--------------------------------------------------------
From 755b6771deb8d793c90f56fddf7070d7c2ea87b5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <[email protected]>
Date: Thu, 4 Jan 2018 17:46:09 -0500
Subject: [PATCH] Support statx() mask and query flags parameters
Support the query flags AT_STATX_FORCE_SYNC by forcing an attribute
revalidation, and AT_STATX_DONT_SYNC by returning cached attributes
only.
Use the mask to optimise away server revalidation for attributes
that are not being requested by the user.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/inode.c | 40 ++++++++++++++++++++++++++++++++++------
1 file changed, 34 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b112002dbdb2..a703b1d1500d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -735,12 +735,22 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
+ unsigned long cache_validity;
+ bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
+ bool dont_sync = !force_sync && query_flags & AT_STATX_DONT_SYNC;
+ bool need_atime = !dont_sync;
+ bool need_cmtime = !dont_sync;
+ bool reval = force_sync;
int err = 0;
+ if (!(request_mask & STATX_ATIME))
+ need_atime = false;
+ if (!(request_mask & (STATX_CTIME|STATX_MTIME)))
+ need_cmtime = false;
+
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
- if (S_ISREG(inode->i_mode)) {
+ if (S_ISREG(inode->i_mode) && need_cmtime) {
err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto out;
@@ -757,9 +767,22 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
*/
if ((path->mnt->mnt_flags & MNT_NOATIME) ||
((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
- need_atime = 0;
-
- if (need_atime || nfs_need_revalidate_inode(inode)) {
+ need_atime = false;
+
+ /* Check for whether the cached attributes are invalid */
+ cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+ if (need_cmtime)
+ reval |= cache_validity & NFS_INO_REVAL_PAGECACHE;
+ if (need_atime)
+ reval |= cache_validity & NFS_INO_INVALID_ATIME;
+ if (request_mask & (STATX_MODE|STATX_NLINK|STATX_UID|STATX_GID|
+ STATX_ATIME|STATX_MTIME|STATX_CTIME|
+ STATX_SIZE|STATX_BLOCKS))
+ reval |= cache_validity & NFS_INO_INVALID_ATTR;
+ if (dont_sync)
+ reval = false;
+
+ if (reval) {
struct nfs_server *server = NFS_SERVER(inode);
if (!(server->flags & NFS_MOUNT_NOAC))
@@ -767,13 +790,18 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
else
nfs_readdirplus_parent_cache_hit(path->dentry);
err = __nfs_revalidate_inode(server, inode);
- } else
+ } else if (!dont_sync)
nfs_readdirplus_parent_cache_hit(path->dentry);
if (!err) {
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
+ /* Return only the requested attrs if others may be stale */
+ if (!reval && cache_validity & (NFS_INO_REVAL_PAGECACHE|
+ NFS_INO_INVALID_ATIME|
+ NFS_INO_INVALID_ATTR))
+ stat->result_mask &= request_mask;
}
out:
trace_nfs_getattr_exit(inode, err);
--
2.14.3
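An application that has been converted could then ask for only what it
needs, something along these lines (illustrative only; it assumes a
libc that exposes the statx() wrapper):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (argc < 2)
			return 1;
		/* Take whatever the client has cached; don't force a
		 * flush or a GETATTR just for this query. */
		if (statx(AT_FDCWD, argv[1], AT_STATX_DONT_SYNC,
			  STATX_SIZE | STATX_MTIME, &stx) != 0) {
			perror("statx");
			return 1;
		}
		/* stx_mask reports which requested fields are valid. */
		printf("size=%llu mtime=%lld (mask=0x%x)\n",
		       (unsigned long long)stx.stx_size,
		       (long long)stx.stx_mtime.tv_sec, stx.stx_mask);
		return 0;
	}

And with the patch above, a caller that does not request
STATX_CTIME/STATX_MTIME avoids the writeback flush entirely, even
without AT_STATX_DONT_SYNC.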
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]