In the years since the max readahead size was hard-coded in NFS, a
number of things have happened:
- Users can now set the value directly using /sys/class/bdi (see the
sketch after this list).
- NFS max supported block sizes have increased 16-fold, from 64K to 1MB.
- Disk access latencies are orders of magnitude lower thanks to SSDs
and NVMe.
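For example, a minimal sketch of tuning it by hand, assuming an NFS
mount whose bdi happens to be named 0:53 (the MAJOR:MINOR id varies per
mount and is illustrative only):

    # Show the current readahead size, in KB, for this bdi.
    cat /sys/class/bdi/0:53/read_ahead_kb
    # Override it, e.g. back down to the 128 KB default proposed here.
    echo 128 > /sys/class/bdi/0:53/read_ahead_kb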
In particular, note that if the server advertises 1MB as the optimal
read size, the old heuristic of rsize * NFS_MAX_READAHEAD
(i.e. 1MB * (RPC_DEF_SLOT_TABLE - 1) = 1MB * 15) sets the readahead
size to 15MB.
Let's therefore adjust down, and default to VM_READAHEAD_PAGES.
However, let's inform the VM about our preferred block size so that it
can choose to round up in cases where that makes sense.
Reported-by: Alkis Georgopoulos <[email protected]>
Signed-off-by: Trond Myklebust <[email protected]>
---
 fs/nfs/internal.h | 8 --------
 fs/nfs/super.c    | 9 ++++++++-
 2 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e64f810223be..447a3c17fa8e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops;
 
 struct nfs_string;
 
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- *        their needs. People that do NFS over a slow network, might for
- *        instance want to reduce it to something closer to 1 for improved
- *        interactive response.
- */
-#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
-
 static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
 {
 	if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 703f595dce90..c96194e28692 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2627,6 +2627,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
 }
 EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
 
+static void nfs_set_readahead(struct backing_dev_info *bdi,
+			      unsigned long iomax_pages)
+{
+	bdi->ra_pages = VM_READAHEAD_PAGES;
+	bdi->io_pages = iomax_pages;
+}
+
 struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 		int flags, const char *dev_name,
 		struct nfs_mount_info *mount_info,
@@ -2669,7 +2676,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 			mntroot = ERR_PTR(error);
 			goto error_splat_super;
 		}
-		s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+		nfs_set_readahead(s->s_bdi, server->rpages);
 		server->super = s;
 	}
 
--
2.21.0
Thank you Trond, you're awesome!
I don't know if it's appropriate, but I thought I'd send some recent
benchmarks about this:
Netbooting a system over 100 Mbps with tcp,timeo=600,rsize=1M,wsize=1M,
then running `rm -rf .mozilla; echo 3 > /proc/sys/vm/drop_caches; firefox`:
| Readahead | Boot time (s) | Boot data (MB) | Firefox start (s) | Firefox data (MB) |
|-----------|---------------|----------------|-------------------|-------------------|
| 4 KB      | 34            | 158            | 27                | 120               |
| 128 KB    | 36            | 355            | 27                | 247               |
| 1 MB      | 83            | 1210           | 60                | 661               |
If I understand correctly, the new default is 128 KB, which feels like
a great generic default. For a remote / or /home shared by multiple
clients, 4 KB might be more appropriate, so software that focuses on
that use case, like LTSP or klibc nfsmount, may adjust readahead via
the /sys/devices/virtual/bdi interface (see the sketch below).
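For instance, a minimal sketch of such a tweak, assuming the NFS root
is mounted at / and that util-linux's `mountpoint -d` is available to
print the MAJOR:MINOR id that names the mount's bdi:

    # Drop readahead to 4 KB for the NFS root filesystem.
    # (mountpoint -d prints the MAJ:MIN device id of the mount, which
    # is also the name of its bdi directory under sysfs.)
    echo 4 > /sys/devices/virtual/bdi/$(mountpoint -d /)/read_ahead_kb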
Thanks again,
Alkis Georgopoulos
LTSP developer