Linux kernel defined the default read ahead to 128KiB, and this is
making read perform poorly. To mitigate it, add readahead as a mount
option that is handled by userspace, with some refactoring included.
Signed-off-by: Thiago Rafael Becker <[email protected]>
---
utils/mount/mount.c | 114 ++++++++++++++++++++++++++++++++++++++++----
utils/mount/nfs.man | 3 ++
2 files changed, 108 insertions(+), 9 deletions(-)
diff --git a/utils/mount/mount.c b/utils/mount/mount.c
index b98f9e00..15076ca8 100644
--- a/utils/mount/mount.c
+++ b/utils/mount/mount.c
@@ -56,9 +56,11 @@ int nomtab;
int verbose;
int sloppy;
int string;
+static int readahead_kb = 0;
#define FOREGROUND (0)
#define BACKGROUND (1)
+#define READAHEAD_VALUE_LEN 24
static struct option longopts[] = {
{ "fake", 0, 0, 'f' },
@@ -292,6 +294,16 @@ static int add_mtab(char *spec, char *mount_point, char *fstype,
return result;
}
+static void append_extra_opt(const char *opt, char *extra_opts, size_t len) {
+ len -= strlen(extra_opts);
+
+ if (*extra_opts && --len > 0)
+ strcat(extra_opts, ",");
+
+ if ((len -= strlen(opt)) > 0)
+ strcat(extra_opts, opt);
+}
+
static void parse_opt(const char *opt, int *mask, char *extra_opts, size_t len)
{
const struct opt_map *om;
@@ -306,13 +318,37 @@ static void parse_opt(const char *opt, int *mask, char *extra_opts, size_t len)
}
}
- len -= strlen(extra_opts);
+ append_extra_opt(opt, extra_opts, len);
+}
- if (*extra_opts && --len > 0)
- strcat(extra_opts, ",");
+static void parse_opt_val(const char *opt, const char *val, char *extra_opts, size_t len)
+{
+ size_t ov_len;
+ char *opt_val;
- if ((len -= strlen(opt)) > 0)
- strcat(extra_opts, opt);
+ /* readahead is a special value that is handled by userspace */
+ if (!strcmp(opt, "readahead")) {
+ char *endptr = NULL;
+ const char *expected_endptr = val + strlen(val);
+
+ readahead_kb = strtol(val, &endptr, 10);
+
+ if (endptr != expected_endptr) {
+ nfs_error(_("%s: invalid readahead value %s"),
+ progname, val);
+ readahead_kb = 0;
+ }
+ return;
+ }
+
+ /* Send the option to the kernel. */
+ ov_len = strlen(opt) + strlen(val) + 3;
+ opt_val = malloc(sizeof(char) * ov_len);
+ snprintf(opt_val, ov_len, ",%s=%s", opt, val);
+
+ append_extra_opt(opt_val, extra_opts, len);
+
+ free(opt_val);
}
/*
@@ -325,7 +361,7 @@ static void parse_opts(const char *options, int *flags, char **extra_opts)
{
if (options != NULL) {
char *opts = xstrdup(options);
- char *opt, *p;
+ char *opt, *p, *val = NULL;
size_t len = strlen(opts) + 1; /* include room for a null */
int open_quote = 0;
@@ -341,17 +377,75 @@ static void parse_opts(const char *options, int *flags, char **extra_opts)
continue; /* still in a quoted block */
if (*p == ',')
*p = '\0'; /* terminate the option item */
+ if (*p == '=') { /* key=val option */
+ if (!val) {
+ *p = '\0'; /* terminate key */
+ val = ++p; /* start the value */
+ }
+ }
/* end of option item or last item */
if (*p == '\0' || *(p + 1) == '\0') {
- parse_opt(opt, flags, *extra_opts, len);
- opt = NULL;
+ if (val) {
+ parse_opt_val(opt, val, *extra_opts, len);
+ } else
+ parse_opt(opt, flags, *extra_opts, len);
+ opt = val = NULL;
}
}
free(opts);
}
}
+/*
+ * Set the read ahead value for the mount point. On failure, uses the default kernel
+ * read ahead value (for new mounts) or the current value (for remounts).
+ */
+static void set_readahead(const char *mount_point) {
+ int error;
+ struct statx mp_stat;
+ char *mount_point_readahead_file, value[READAHEAD_VALUE_LEN];
+ size_t len;
+ int fp;
+
+ /* If readahead_kb is unset, or set to 0, do not change the value */
+ if (!readahead_kb)
+ return;
+
+ if ((error = statx(0, mount_point, 0, 0, &mp_stat)) != 0) {
+ goto out_error;
+ }
+
+ if (!(mount_point_readahead_file = malloc(PATH_MAX))) {
+ error = -ENOMEM;
+ goto out_error;
+ }
+
+ snprintf(mount_point_readahead_file, PATH_MAX, "/sys/class/bdi/%d:%d/read_ahead_kb",
+ mp_stat.stx_dev_major, mp_stat.stx_dev_minor);
+
+ len = snprintf(value, READAHEAD_VALUE_LEN, "%d", readahead_kb);
+
+ if ((fp = open(mount_point_readahead_file, O_WRONLY)) < 0) {
+ error = errno;
+ goto out_free;
+ }
+
+ if ((error = write(fp, value, len)) < 0)
+ goto out_close;
+
+ close(fp);
+ return;
+
+out_close:
+ close(fp);
+out_free:
+ free(mount_point_readahead_file);
+out_error:
+ nfs_error(_("%s: unable to set readahead value, using default kernel value (error = %d)\n"),
+ progname, error);
+}
+
static int try_mount(char *spec, char *mount_point, int flags,
char *fs_type, char **extra_opts, char *mount_opts,
int fake, int bg)
@@ -373,8 +467,10 @@ static int try_mount(char *spec, char *mount_point, int flags,
if (ret)
return ret;
- if (!fake)
+ if (!fake) {
+ set_readahead(mount_point);
print_one(spec, mount_point, fs_type, mount_opts);
+ }
return add_mtab(spec, mount_point, fs_type, flags, *extra_opts);
}
diff --git a/utils/mount/nfs.man b/utils/mount/nfs.man
index f1b76936..9832a377 100644
--- a/utils/mount/nfs.man
+++ b/utils/mount/nfs.man
@@ -561,6 +561,9 @@ The
.B sloppy
option is an alternative to specifying
.BR mount.nfs " -s " option.
+.TP 1.5i
+.B readahead=n
+Set the read ahead of the mount to n KiB. This is handled entirely in userspace and will not appear on mountinfo. If unset or set to 0, it will not set the a value, using the current value (for a remount) or the kernel default for a new mount.
.SS "Options for NFS versions 2 and 3 only"
Use these options, along with the options in the above subsection,
--
2.31.1
On Tue, 2021-08-03 at 10:07 -0300, Thiago Rafael Becker wrote:
> Linux kernel defined the default read ahead to 128KiB, and this is
> making read perform poorly. To mitigate it, add readahead as a mount
> option that is handled by userspace, with some refactoring included.
>
> Signed-off-by: Thiago Rafael Becker <[email protected]>
> ---
> utils/mount/mount.c | 114 ++++++++++++++++++++++++++++++++++++++++--
> --
> utils/mount/nfs.man | 3 ++
> 2 files changed, 108 insertions(+), 9 deletions(-)
>
> diff --git a/utils/mount/mount.c b/utils/mount/mount.c
> index b98f9e00..15076ca8 100644
> --- a/utils/mount/mount.c
> +++ b/utils/mount/mount.c
> @@ -56,9 +56,11 @@ int nomtab;
> int verbose;
> int sloppy;
> int string;
> +static int readahead_kb = 0;
>
> #define FOREGROUND (0)
> #define BACKGROUND (1)
> +#define READAHEAD_VALUE_LEN 24
>
> static struct option longopts[] = {
> { "fake", 0, 0, 'f' },
> @@ -292,6 +294,16 @@ static int add_mtab(char *spec, char
> *mount_point, char *fstype,
> return result;
> }
>
> +static void append_extra_opt(const char *opt, char *extra_opts,
> size_t len) {
> + len -= strlen(extra_opts);
> +
> + if (*extra_opts && --len > 0)
> + strcat(extra_opts, ",");
> +
> + if ((len -= strlen(opt)) > 0)
> + strcat(extra_opts, opt);
> +}
> +
> static void parse_opt(const char *opt, int *mask, char *extra_opts,
> size_t len)
> {
> const struct opt_map *om;
> @@ -306,13 +318,37 @@ static void parse_opt(const char *opt, int
> *mask, char *extra_opts, size_t len)
> }
> }
>
> - len -= strlen(extra_opts);
> + append_extra_opt(opt, extra_opts, len);
> +}
>
> - if (*extra_opts && --len > 0)
> - strcat(extra_opts, ",");
> +static void parse_opt_val(const char *opt, const char *val, char
> *extra_opts, size_t len)
> +{
> + size_t ov_len;
> + char *opt_val;
>
> - if ((len -= strlen(opt)) > 0)
> - strcat(extra_opts, opt);
> + /* readahead is a special value that is handled by userspace
> */
> + if (!strcmp(opt, "readahead")) {
> + char *endptr = NULL;
> + const char *expected_endptr = val + strlen(val);
> +
> + readahead_kb = strtol(val, &endptr, 10);
> +
> + if (endptr != expected_endptr) {
> + nfs_error(_("%s: invalid readahead value
> %s"),
> + progname, val);
> + readahead_kb = 0;
> + }
> + return;
> + }
> +
> + /* Send the option to the kernel. */
> + ov_len = strlen(opt) + strlen(val) + 3;
> + opt_val = malloc(sizeof(char) * ov_len);
> + snprintf(opt_val, ov_len, ",%s=%s", opt, val);
> +
> + append_extra_opt(opt_val, extra_opts, len);
> +
> + free(opt_val);
> }
>
> /*
> @@ -325,7 +361,7 @@ static void parse_opts(const char *options, int
> *flags, char **extra_opts)
> {
> if (options != NULL) {
> char *opts = xstrdup(options);
> - char *opt, *p;
> + char *opt, *p, *val = NULL;
> size_t len = strlen(opts) + 1; /* include room for a
> null */
> int open_quote = 0;
>
> @@ -341,17 +377,75 @@ static void parse_opts(const char *options, int
> *flags, char **extra_opts)
> continue; /* still in a quoted
> block */
> if (*p == ',')
> *p = '\0'; /* terminate the
> option item */
> + if (*p == '=') { /* key=val option */
> + if (!val) {
> + *p = '\0'; /* terminate
> key */
> + val = ++p; /* start the
> value */
> + }
> + }
>
> /* end of option item or last item */
> if (*p == '\0' || *(p + 1) == '\0') {
> - parse_opt(opt, flags, *extra_opts,
> len);
> - opt = NULL;
> + if (val) {
> + parse_opt_val(opt, val,
> *extra_opts, len);
> + } else
> + parse_opt(opt, flags,
> *extra_opts, len);
> + opt = val = NULL;
> }
> }
> free(opts);
> }
> }
>
> +/*
> + * Set the read ahead value for the mount point. On failure, uses
> the default kernel
> + * read ahead value (for new mounts) or the current value (for
> remounts).
> + */
> +static void set_readahead(const char *mount_point) {
> + int error;
> + struct statx mp_stat;
> + char *mount_point_readahead_file, value[READAHEAD_VALUE_LEN];
> + size_t len;
> + int fp;
> +
> + /* If readahead_kb is unset, or set to 0, do not change the
> value */
> + if (!readahead_kb)
> + return;
> +
> + if ((error = statx(0, mount_point, 0, 0, &mp_stat)) != 0) {
> + goto out_error;
> + }
> +
> + if (!(mount_point_readahead_file = malloc(PATH_MAX))) {
> + error = -ENOMEM;
> + goto out_error;
> + }
> +
> + snprintf(mount_point_readahead_file, PATH_MAX,
> "/sys/class/bdi/%d:%d/read_ahead_kb",
> + mp_stat.stx_dev_major,
> mp_stat.stx_dev_minor);
> +
> + len = snprintf(value, READAHEAD_VALUE_LEN, "%d",
> readahead_kb);
> +
> + if ((fp = open(mount_point_readahead_file, O_WRONLY)) < 0) {
> + error = errno;
> + goto out_free;
> + }
> +
> + if ((error = write(fp, value, len)) < 0)
> + goto out_close;
> +
> + close(fp);
> + return;
> +
> +out_close:
> + close(fp);
> +out_free:
> + free(mount_point_readahead_file);
> +out_error:
> + nfs_error(_("%s: unable to set readahead value, using default
> kernel value (error = %d)\n"),
> + progname, error);
> +}
> +
> static int try_mount(char *spec, char *mount_point, int flags,
> char *fs_type, char **extra_opts, char
> *mount_opts,
> int fake, int bg)
> @@ -373,8 +467,10 @@ static int try_mount(char *spec, char
> *mount_point, int flags,
> if (ret)
> return ret;
>
> - if (!fake)
> + if (!fake) {
> + set_readahead(mount_point);
> print_one(spec, mount_point, fs_type, mount_opts);
> + }
>
> return add_mtab(spec, mount_point, fs_type, flags,
> *extra_opts);
> }
> diff --git a/utils/mount/nfs.man b/utils/mount/nfs.man
> index f1b76936..9832a377 100644
> --- a/utils/mount/nfs.man
> +++ b/utils/mount/nfs.man
> @@ -561,6 +561,9 @@ The
> .B sloppy
> option is an alternative to specifying
> .BR mount.nfs " -s " option.
> +.TP 1.5i
> +.B readahead=n
> +Set the read ahead of the mount to n KiB. This is handled entirely
> in userspace and will not appear on mountinfo. If unset or set to 0,
> it will not set the a value, using the current value (for a remount)
> or the kernel default for a new mount.
>
> .SS "Options for NFS versions 2 and 3 only"
> Use these options, along with the options in the above subsection,
There is already a method for doing this: you set up an appropriate
entry in /etc/udev/rules.d/. Adding a mount option, particularly one
that is NFS only and is handled in userspace rather than by the kernel
parser, just causes fragmentation and confusion.
If you want to help users configure readahead, I'd suggest focussing on
a utility to help them set up the udev rule correctly.
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]
On Tue, Aug 03, 2021 at 03:15:54PM +0000, Trond Myklebust wrote:
> There is already a method for doing this: you set up an appropriate
> entry in /etc/udev/rules.d/. Adding a mount option, particularly one
> that is NFS only and is handled in userspace rather than by the kernel
> parser, just causes fragmentation and confusion.
>
> If you want to help users configure readahead, I'd suggest focussing on
> a utility to help them set up the udev rule correctly.
I prototyped a tool that runs on udev when a new bdi is added, you can
see it at https://github.com/trbecker/readahead-utils. If this is good
enough, it can have a user friendly tool that can be used to set the
read ahead, check current values and manipulate the configuration file.
I have a few limitations I see in this approach.
This creates a different place to look into when debugging systems.
Having the mount options would be more "in the face" of the engineer
doing the debugging.
This complicates configurations on clusters, and I find having the
option better.
I'm not sure how this would interact with containers, if this would
create issues between the container and the host system.
Advantages of this approach are the possibility to have defaults by fs
type, device type, server (in case of a networked filesysem), wildcards.
Some are good because they allow the user to improve memory usage
(reducing the readahead for random access devices, increasing it for
spinning devices and network devices, ...), some allow a set and forget
configuration for directories that are used for hosting filesystems
(automount, container directories, ...), which can't be achieved by a
mount option.
I agree with the fragmentation. Having this go into the kernel parser as
a mount option seems to be a better way. Again, this is more granular
and easier to find than having a separate application that runs on udev.
I want opinions on these different approaches.
Best,
Thiago
Hey Trond!
On 8/3/21 11:15 AM, Trond Myklebust wrote:
> On Tue, 2021-08-03 at 10:07 -0300, Thiago Rafael Becker wrote:
>> Linux kernel defined the default read ahead to 128KiB, and this is
>> making read perform poorly. To mitigate it, add readahead as a mount
>> option that is handled by userspace, with some refactoring included.
>>
>> Signed-off-by: Thiago Rafael Becker <[email protected]>
>> ---
>> utils/mount/mount.c | 114 ++++++++++++++++++++++++++++++++++++++++--
>> --
>> utils/mount/nfs.man | 3 ++
>> 2 files changed, 108 insertions(+), 9 deletions(-)
>>
>> diff --git a/utils/mount/mount.c b/utils/mount/mount.c
>> index b98f9e00..15076ca8 100644
>> --- a/utils/mount/mount.c
>> +++ b/utils/mount/mount.c
>> @@ -56,9 +56,11 @@ int nomtab;
>> int verbose;
>> int sloppy;
>> int string;
>> +static int readahead_kb = 0;
>>
>> #define FOREGROUND (0)
>> #define BACKGROUND (1)
>> +#define READAHEAD_VALUE_LEN 24
>>
>> static struct option longopts[] = {
>> { "fake", 0, 0, 'f' },
>> @@ -292,6 +294,16 @@ static int add_mtab(char *spec, char
>> *mount_point, char *fstype,
>> return result;
>> }
>>
>> +static void append_extra_opt(const char *opt, char *extra_opts,
>> size_t len) {
>> + len -= strlen(extra_opts);
>> +
>> + if (*extra_opts && --len > 0)
>> + strcat(extra_opts, ",");
>> +
>> + if ((len -= strlen(opt)) > 0)
>> + strcat(extra_opts, opt);
>> +}
>> +
>> static void parse_opt(const char *opt, int *mask, char *extra_opts,
>> size_t len)
>> {
>> const struct opt_map *om;
>> @@ -306,13 +318,37 @@ static void parse_opt(const char *opt, int
>> *mask, char *extra_opts, size_t len)
>> }
>> }
>>
>> - len -= strlen(extra_opts);
>> + append_extra_opt(opt, extra_opts, len);
>> +}
>>
>> - if (*extra_opts && --len > 0)
>> - strcat(extra_opts, ",");
>> +static void parse_opt_val(const char *opt, const char *val, char
>> *extra_opts, size_t len)
>> +{
>> + size_t ov_len;
>> + char *opt_val;
>>
>> - if ((len -= strlen(opt)) > 0)
>> - strcat(extra_opts, opt);
>> + /* readahead is a special value that is handled by userspace
>> */
>> + if (!strcmp(opt, "readahead")) {
>> + char *endptr = NULL;
>> + const char *expected_endptr = val + strlen(val);
>> +
>> + readahead_kb = strtol(val, &endptr, 10);
>> +
>> + if (endptr != expected_endptr) {
>> + nfs_error(_("%s: invalid readahead value
>> %s"),
>> + progname, val);
>> + readahead_kb = 0;
>> + }
>> + return;
>> + }
>> +
>> + /* Send the option to the kernel. */
>> + ov_len = strlen(opt) + strlen(val) + 3;
>> + opt_val = malloc(sizeof(char) * ov_len);
>> + snprintf(opt_val, ov_len, ",%s=%s", opt, val);
>> +
>> + append_extra_opt(opt_val, extra_opts, len);
>> +
>> + free(opt_val);
>> }
>>
>> /*
>> @@ -325,7 +361,7 @@ static void parse_opts(const char *options, int
>> *flags, char **extra_opts)
>> {
>> if (options != NULL) {
>> char *opts = xstrdup(options);
>> - char *opt, *p;
>> + char *opt, *p, *val = NULL;
>> size_t len = strlen(opts) + 1; /* include room for a
>> null */
>> int open_quote = 0;
>>
>> @@ -341,17 +377,75 @@ static void parse_opts(const char *options, int
>> *flags, char **extra_opts)
>> continue; /* still in a quoted
>> block */
>> if (*p == ',')
>> *p = '\0'; /* terminate the
>> option item */
>> + if (*p == '=') { /* key=val option */
>> + if (!val) {
>> + *p = '\0'; /* terminate
>> key */
>> + val = ++p; /* start the
>> value */
>> + }
>> + }
>>
>> /* end of option item or last item */
>> if (*p == '\0' || *(p + 1) == '\0') {
>> - parse_opt(opt, flags, *extra_opts,
>> len);
>> - opt = NULL;
>> + if (val) {
>> + parse_opt_val(opt, val,
>> *extra_opts, len);
>> + } else
>> + parse_opt(opt, flags,
>> *extra_opts, len);
>> + opt = val = NULL;
>> }
>> }
>> free(opts);
>> }
>> }
>>
>> +/*
>> + * Set the read ahead value for the mount point. On failure, uses
>> the default kernel
>> + * read ahead value (for new mounts) or the current value (for
>> remounts).
>> + */
>> +static void set_readahead(const char *mount_point) {
>> + int error;
>> + struct statx mp_stat;
>> + char *mount_point_readahead_file, value[READAHEAD_VALUE_LEN];
>> + size_t len;
>> + int fp;
>> +
>> + /* If readahead_kb is unset, or set to 0, do not change the
>> value */
>> + if (!readahead_kb)
>> + return;
>> +
>> + if ((error = statx(0, mount_point, 0, 0, &mp_stat)) != 0) {
>> + goto out_error;
>> + }
>> +
>> + if (!(mount_point_readahead_file = malloc(PATH_MAX))) {
>> + error = -ENOMEM;
>> + goto out_error;
>> + }
>> +
>> + snprintf(mount_point_readahead_file, PATH_MAX,
>> "/sys/class/bdi/%d:%d/read_ahead_kb",
>> + mp_stat.stx_dev_major,
>> mp_stat.stx_dev_minor);
>> +
>> + len = snprintf(value, READAHEAD_VALUE_LEN, "%d",
>> readahead_kb);
>> +
>> + if ((fp = open(mount_point_readahead_file, O_WRONLY)) < 0) {
>> + error = errno;
>> + goto out_free;
>> + }
>> +
>> + if ((error = write(fp, value, len)) < 0)
>> + goto out_close;
>> +
>> + close(fp);
>> + return;
>> +
>> +out_close:
>> + close(fp);
>> +out_free:
>> + free(mount_point_readahead_file);
>> +out_error:
>> + nfs_error(_("%s: unable to set readahead value, using default
>> kernel value (error = %d)\n"),
>> + progname, error);
>> +}
>> +
>> static int try_mount(char *spec, char *mount_point, int flags,
>> char *fs_type, char **extra_opts, char
>> *mount_opts,
>> int fake, int bg)
>> @@ -373,8 +467,10 @@ static int try_mount(char *spec, char
>> *mount_point, int flags,
>> if (ret)
>> return ret;
>>
>> - if (!fake)
>> + if (!fake) {
>> + set_readahead(mount_point);
>> print_one(spec, mount_point, fs_type, mount_opts);
>> + }
>>
>> return add_mtab(spec, mount_point, fs_type, flags,
>> *extra_opts);
>> }
>> diff --git a/utils/mount/nfs.man b/utils/mount/nfs.man
>> index f1b76936..9832a377 100644
>> --- a/utils/mount/nfs.man
>> +++ b/utils/mount/nfs.man
>> @@ -561,6 +561,9 @@ The
>> .B sloppy
>> option is an alternative to specifying
>> .BR mount.nfs " -s " option.
>> +.TP 1.5i
>> +.B readahead=n
>> +Set the read ahead of the mount to n KiB. This is handled entirely
>> in userspace and will not appear on mountinfo. If unset or set to 0,
>> it will not set the a value, using the current value (for a remount)
>> or the kernel default for a new mount.
>>
>> .SS "Options for NFS versions 2 and 3 only"
>> Use these options, along with the options in the above subsection,
>
>
> There is already a method for doing this: you set up an appropriate
> entry in /etc/udev/rules.d/. Adding a mount option, particularly one
> that is NFS only and is handled in userspace rather than by the kernel
> parser, just causes fragmentation and confusion.
>
> If you want to help users configure readahead, I'd suggest focussing on
> a utility to help them set up the udev rule correctly.I'm leaning toward taking this patch... True one can add a udev rule,
but setting a mount option is much simpler and straightforward...
Plus with nfsmount.conf, it can be per fs,server,or global.
You know I am not a fan of taking new mount options, but I have
bugs open that show the degrading of read performance from one
release over another... If we can eliminate that degradation with an
mount option... I really don't see a problem with that...
steved.
Hello, all
On Thu, Aug 05, 2021 at 02:23:29PM -0400, Steve Dickson wrote:
> Hey Trond!
> but setting a mount option is much simpler and straightforward...
> Plus with nfsmount.conf, it can be per fs,server,or global.
>
> You know I am not a fan of taking new mount options, but I have
> bugs open that show the degrading of read performance from one
> release over another... If we can eliminate that degradation with an mount
> option... I really don't see a problem with that...
>
> steved.
>
CEPH and CIFS have the rasize mount option that sets the readahead
size for that file system, so there are precedents for having this
option for NFS as well.