Recent changes in the linux kernel caused NFS readahead to default to
128 from the previous default of 15 * rsize. This causes performance
penalties to some read-heavy workloads, which can be fixed by
tuning the readahead for that given mount.
Specifically, the read troughput on a sec=krb5p mount drops by 50-75%
when comparing the default readahead with a readahead of 15360.
Previous discussions:
https://lore.kernel.org/linux-nfs/[email protected]/
I attempted to add a non-kernel option to mount.nfs, and it was
rejected.
https://lore.kernel.org/linux-nfs/[email protected]/
Attempted to add a mount option to the kernel, rejected as well.
I had started a separate tool to set the readahead of BDIs, but the
scope is specifically for NFS, so I would like to get the community
feeling for having this in nfs-utils.
This patch series introduces nfs-readahead-udev, a utility to
automatically set NFS readahead when NFS is mounted. The utility is
triggered by udev when a new BDI is added, returns to udev the value of
the readahead that should be used.
The tool currently supports setting read ahead per mountpoint, nfs major
version, or by a global default value.
v2:
- explain the motivation
v3:
- adopt already available facilities
- nfsrahead is now configured in nfs.conf
v4:
- retry getting the device if it fails
- assorted fixes and improvements
Thiago Becker (7):
Create nfsrahead
nfsrahead: configure udev
nfsrahead: only set readahead for nfs devices.
nfsrahead: add logging
nfsrahead: get the information from the config file.
nfsrahead: User documentation
nfsrahead: retry getting the device if it fails.
.gitignore | 2 +
configure.ac | 1 +
systemd/nfs.conf.man | 11 ++
tools/Makefile.am | 2 +-
tools/nfsrahead/99-nfs.rules.in | 1 +
tools/nfsrahead/Makefile.am | 16 +++
tools/nfsrahead/main.c | 183 ++++++++++++++++++++++++++++++++
tools/nfsrahead/nfsrahead.man | 72 +++++++++++++
8 files changed, 287 insertions(+), 1 deletion(-)
create mode 100644 tools/nfsrahead/99-nfs.rules.in
create mode 100644 tools/nfsrahead/Makefile.am
create mode 100644 tools/nfsrahead/main.c
create mode 100644 tools/nfsrahead/nfsrahead.man
--
2.35.1
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1946283
Signed-off-by: Thiago Becker <[email protected]>
---
tools/nfsrahead/Makefile.am | 1 +
tools/nfsrahead/main.c | 40 ++++++++++++++++++++++++++++++++-----
2 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/tools/nfsrahead/Makefile.am b/tools/nfsrahead/Makefile.am
index 0daddc4b..60a1102a 100644
--- a/tools/nfsrahead/Makefile.am
+++ b/tools/nfsrahead/Makefile.am
@@ -1,6 +1,7 @@
libexec_PROGRAMS = nfsrahead
nfsrahead_SOURCES = main.c
nfsrahead_LDFLAGS= -lmount
+nfsrahead_LDADD = ../../support/nfs/libnfsconf.la
udev_rulesdir = /usr/lib/udev/rules.d/
udev_rules_DATA = 99-nfs.rules
diff --git a/tools/nfsrahead/main.c b/tools/nfsrahead/main.c
index 8fe6d648..b630b92f 100644
--- a/tools/nfsrahead/main.c
+++ b/tools/nfsrahead/main.c
@@ -2,14 +2,19 @@
#include <string.h>
#include <stdlib.h>
#include <errno.h>
+#include <unistd.h>
#include <libmount/libmount.h>
#include <sys/sysmacros.h>
+#include "xlog.h"
+
#ifndef MOUNTINFO_PATH
#define MOUNTINFO_PATH "/proc/self/mountinfo"
#endif
+#define CONF_NAME "nfsrahead"
+
/* Device information from the system */
struct device_info {
char *device_number;
@@ -108,25 +113,50 @@ static int get_device_info(const char *device_number, struct device_info *device
return ret;
}
+#define L_DEFAULT (L_WARNING | L_ERROR | L_FATAL)
+
int main(int argc, char **argv)
{
int ret = 0;
struct device_info device;
- unsigned int readahead = 128;
-
- if (argc != 2) {
- return -EINVAL;
+ unsigned int readahead = 128, verbose = 0, log_stderr = 0;
+ char opt;
+
+ while((opt = getopt(argc, argv, "dF")) != -1) {
+ switch (opt) {
+ case 'd':
+ verbose = 1;
+ break;
+ case 'F':
+ log_stderr = 1;
+ break;
+ }
}
- if ((ret = get_device_info(argv[1], &device)) != 0) {
+ xlog_stderr(log_stderr);
+ xlog_syslog(~log_stderr);
+ xlog_config(L_DEFAULT | (L_NOTICE & verbose), 1);
+ xlog_open(CONF_NAME);
+
+ // xlog_err causes the system to exit
+ if ((argc - optind) != 1)
+ xlog_err("expected the device number of a BDI; is udev ok?");
+
+ if ((ret = get_device_info(argv[optind], &device)) != 0) {
+ xlog(L_ERROR, "unable to find device %s\n", argv[optind]);
goto out;
}
if (strncmp("nfs", device.fstype, 3) != 0) {
+ xlog(L_NOTICE,
+ "not setting readahead for non supported fstype %s on device %s\n",
+ device.fstype, argv[optind]);
ret = -EINVAL;
goto out;
}
+ xlog(L_WARNING, "setting %s readahead to %d\n", device.mountpoint, readahead);
+
printf("%d\n", readahead);
out:
--
2.35.1
Limit setting the readahead for nfs devices.
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1946283
Signed-off-by: Thiago Becker <[email protected]>
---
tools/nfsrahead/99-nfs.rules.in | 2 +-
tools/nfsrahead/Makefile.am | 1 +
tools/nfsrahead/main.c | 128 ++++++++++++++++++++++++++++++++
3 files changed, 130 insertions(+), 1 deletion(-)
diff --git a/tools/nfsrahead/99-nfs.rules.in b/tools/nfsrahead/99-nfs.rules.in
index 7d55b407..648813c5 100644
--- a/tools/nfsrahead/99-nfs.rules.in
+++ b/tools/nfsrahead/99-nfs.rules.in
@@ -1 +1 @@
-SUBSYSTEM=="bdi", ACTION=="add", PROGRAM="_libexecdir_/nfsrahead", ATTR{read_ahead_kb}="%c"
+SUBSYSTEM=="bdi", ACTION=="add", PROGRAM="_libexecdir_/nfsrahead %k", ATTR{read_ahead_kb}="%c"
diff --git a/tools/nfsrahead/Makefile.am b/tools/nfsrahead/Makefile.am
index 58a2ea29..0daddc4b 100644
--- a/tools/nfsrahead/Makefile.am
+++ b/tools/nfsrahead/Makefile.am
@@ -1,5 +1,6 @@
libexec_PROGRAMS = nfsrahead
nfsrahead_SOURCES = main.c
+nfsrahead_LDFLAGS= -lmount
udev_rulesdir = /usr/lib/udev/rules.d/
udev_rules_DATA = 99-nfs.rules
diff --git a/tools/nfsrahead/main.c b/tools/nfsrahead/main.c
index 0359aced..8fe6d648 100644
--- a/tools/nfsrahead/main.c
+++ b/tools/nfsrahead/main.c
@@ -1,7 +1,135 @@
#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include <libmount/libmount.h>
+#include <sys/sysmacros.h>
+
+#ifndef MOUNTINFO_PATH
+#define MOUNTINFO_PATH "/proc/self/mountinfo"
+#endif
+
+/* Device information from the system */
+struct device_info {
+ char *device_number;
+ dev_t dev;
+ char *mountpoint;
+ char *fstype;
+};
+
+/* Convert a string in the format n:m to a device number */
+static dev_t dev_from_arg(const char *device_number)
+{
+ char *s = strdup(device_number), *p;
+ char *maj_s, *min_s;
+ unsigned int maj, min;
+ dev_t dev;
+
+ maj_s = p = s;
+ for ( ; *p != ':'; p++)
+ ;
+
+ *p = '\0';
+ min_s = p + 1;
+
+ maj = strtol(maj_s, NULL, 10);
+ min = strtol(min_s, NULL, 10);
+
+ dev = makedev(maj, min);
+
+ free(s);
+ return dev;
+}
+
+#define sfree(ptr) if (ptr) free(ptr)
+
+/* device_info maintenance */
+static void init_device_info(struct device_info *di, const char *device_number)
+{
+ di->device_number = strdup(device_number);
+ di->dev = dev_from_arg(device_number);
+ di->mountpoint = NULL;
+ di->fstype = NULL;
+}
+
+
+static void free_device_info(struct device_info *di)
+{
+ sfree(di->mountpoint);
+ sfree(di->fstype);
+ sfree(di->device_number);
+}
+
+static int get_mountinfo(const char *device_number, struct device_info *device_info, const char *mountinfo_path)
+{
+ int ret = 0;
+ struct libmnt_table *mnttbl;
+ struct libmnt_fs *fs;
+ char *target;
+
+ init_device_info(device_info, device_number);
+
+ mnttbl = mnt_new_table();
+
+ if ((ret = mnt_table_parse_file(mnttbl, mountinfo_path)) < 0)
+ goto out_free_tbl;
+
+ if ((fs = mnt_table_find_devno(mnttbl, device_info->dev, MNT_ITER_FORWARD)) == NULL) {
+ ret = ENOENT;
+ goto out_free_tbl;
+ }
+
+ if ((target = (char *)mnt_fs_get_target(fs)) == NULL) {
+ ret = ENOENT;
+ goto out_free_fs;
+ }
+
+ device_info->mountpoint = strdup(target);
+ target = (char *)mnt_fs_get_fstype(fs);
+ if (target)
+ device_info->fstype = strdup(target);
+
+out_free_fs:
+ mnt_free_fs(fs);
+out_free_tbl:
+ mnt_free_table(mnttbl);
+ free(device_info->device_number);
+ device_info->device_number = NULL;
+ return ret;
+}
+
+static int get_device_info(const char *device_number, struct device_info *device_info)
+{
+ int ret = ENOENT;
+ for (int retry_count = 0; retry_count < 10 && ret != 0; retry_count++)
+ ret = get_mountinfo(device_number, device_info, MOUNTINFO_PATH);
+
+ return ret;
+}
int main(int argc, char **argv)
{
+ int ret = 0;
+ struct device_info device;
unsigned int readahead = 128;
+
+ if (argc != 2) {
+ return -EINVAL;
+ }
+
+ if ((ret = get_device_info(argv[1], &device)) != 0) {
+ goto out;
+ }
+
+ if (strncmp("nfs", device.fstype, 3) != 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
printf("%d\n", readahead);
+
+out:
+ free_device_info(&device);
+ return ret;
}
--
2.35.1
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1946283
Signed-off-by: Thiago Becker <[email protected]>
---
tools/nfsrahead/main.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/tools/nfsrahead/main.c b/tools/nfsrahead/main.c
index b630b92f..1a312ac9 100644
--- a/tools/nfsrahead/main.c
+++ b/tools/nfsrahead/main.c
@@ -8,12 +8,14 @@
#include <sys/sysmacros.h>
#include "xlog.h"
+#include "conffile.h"
#ifndef MOUNTINFO_PATH
#define MOUNTINFO_PATH "/proc/self/mountinfo"
#endif
#define CONF_NAME "nfsrahead"
+#define NFS_DEFAULT_READAHEAD 128
/* Device information from the system */
struct device_info {
@@ -113,6 +115,14 @@ static int get_device_info(const char *device_number, struct device_info *device
return ret;
}
+static int conf_get_readahead(const char *kind) {
+ int readahead = 0;
+
+ if((readahead = conf_get_num(CONF_NAME, kind, -1)) == -1)
+ readahead = conf_get_num(CONF_NAME, "default", NFS_DEFAULT_READAHEAD);
+
+ return readahead;
+}
#define L_DEFAULT (L_WARNING | L_ERROR | L_FATAL)
int main(int argc, char **argv)
@@ -133,6 +143,8 @@ int main(int argc, char **argv)
}
}
+ conf_init_file(NFS_CONFFILE);
+
xlog_stderr(log_stderr);
xlog_syslog(~log_stderr);
xlog_config(L_DEFAULT | (L_NOTICE & verbose), 1);
@@ -155,6 +167,8 @@ int main(int argc, char **argv)
goto out;
}
+ readahead = conf_get_readahead(device.fstype);
+
xlog(L_WARNING, "setting %s readahead to %d\n", device.mountpoint, readahead);
printf("%d\n", readahead);
--
2.35.1
Add the man page for nfsrahead, and add the new section to nfs.conf.
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1946283
Signed-off-by: Thiago Becker <[email protected]>
---
systemd/nfs.conf.man | 11 ++++++
tools/nfsrahead/Makefile.am | 3 ++
tools/nfsrahead/nfsrahead.man | 72 +++++++++++++++++++++++++++++++++++
3 files changed, 86 insertions(+)
create mode 100644 tools/nfsrahead/nfsrahead.man
diff --git a/systemd/nfs.conf.man b/systemd/nfs.conf.man
index be487a11..e74083e9 100644
--- a/systemd/nfs.conf.man
+++ b/systemd/nfs.conf.man
@@ -294,6 +294,17 @@ Only
.B debug=
is recognized.
+.TP
+.B nfsrahead
+Recognized values:
+.BR nfs ,
+.BR nfsv4 ,
+.BR default .
+
+See
+.BR nfsrahead (5)
+for deatils.
+
.SH FILES
.TP 10n
.I /etc/nfs.conf
diff --git a/tools/nfsrahead/Makefile.am b/tools/nfsrahead/Makefile.am
index 60a1102a..845ea0d5 100644
--- a/tools/nfsrahead/Makefile.am
+++ b/tools/nfsrahead/Makefile.am
@@ -3,6 +3,9 @@ nfsrahead_SOURCES = main.c
nfsrahead_LDFLAGS= -lmount
nfsrahead_LDADD = ../../support/nfs/libnfsconf.la
+man5_MANS = nfsrahead.man
+EXTRA_DIST = $(man5_MANS)
+
udev_rulesdir = /usr/lib/udev/rules.d/
udev_rules_DATA = 99-nfs.rules
diff --git a/tools/nfsrahead/nfsrahead.man b/tools/nfsrahead/nfsrahead.man
new file mode 100644
index 00000000..5488f633
--- /dev/null
+++ b/tools/nfsrahead/nfsrahead.man
@@ -0,0 +1,72 @@
+.\" Manpage for nfsrahead.
+.nh
+.ad l
+.TH man 5 "08 Mar 2022" "1.0" "nfsrahead man page"
+.SH NAME
+
+nfsrahead \- Configure the readahead for NFS mounts
+
+.SH SYNOPSIS
+
+nfsrahead [-F] [-d] <device>
+
+.SH DESCRIPTION
+
+\fInfsrahead\fR is a tool intended to be used with udev to set the \fIread_ahead_kb\fR parameter of NFS mounts, according to the configuration file (see \fICONFIGURATION\fR). \fIdevice\fR is the device number for the NFS backing device as provided by the kernel.
+
+.SH OPTIONS
+.TP
+.B -F
+Send messages to
+.I stderr
+instead of
+.I syslog
+
+.TP
+.B -d
+Increase the debugging level.
+
+.SH CONFIGURATION
+.I nfsrahead
+is configured in
+.IR /etc/nfs.conf ,
+in the section titled
+.IR nfsrahead .
+It accepts the following configurations.
+
+.TP
+.B nfs=<value>
+The readahead value applied to NFSv3 mounts.
+
+.TP
+.B nfs4=<value>
+The readahead value applied to NFSv4 mounts.
+
+.TP
+.B default=<value>
+The default configuration when none of the configurations above is set.
+
+.SH EXAMPLE CONFIGURATION
+[nfsrahead]
+.br
+nfs=15000 # readahead of 15000 for NFSv3 mounts
+.br
+nfs4=16000 # readahead of 16000 for NFSv4 mounts
+.br
+default=128 # default is 128
+
+.SH SEE ALSO
+
+.BR mount.nfs (8),
+.BR nfs (5),
+.BR nfs.conf (5),
+.BR udev (7),
+.BR bcc-readahead (8)
+
+.SH BUGS
+
+No known bugs.
+
+.SH AUTHOR
+
+Thiago Rafael Becker <[email protected]>
--
2.35.1
Sometimes the mountinfo entry is not available when nfsrahead is called,
leading to failure to set the readahead. Retry getting the device before
failing.
Signed-off-by: Thiago Becker <[email protected]>
---
tools/nfsrahead/main.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/tools/nfsrahead/main.c b/tools/nfsrahead/main.c
index 1a312ac9..b3af3aa8 100644
--- a/tools/nfsrahead/main.c
+++ b/tools/nfsrahead/main.c
@@ -127,7 +127,7 @@ static int conf_get_readahead(const char *kind) {
int main(int argc, char **argv)
{
- int ret = 0;
+ int ret = 0, retry;
struct device_info device;
unsigned int readahead = 128, verbose = 0, log_stderr = 0;
char opt;
@@ -154,7 +154,11 @@ int main(int argc, char **argv)
if ((argc - optind) != 1)
xlog_err("expected the device number of a BDI; is udev ok?");
- if ((ret = get_device_info(argv[optind], &device)) != 0) {
+ for (retry = 0; retry <= 10; retry++ )
+ if ((ret = get_device_info(argv[optind], &device)) == 0)
+ break;
+
+ if (ret != 0) {
xlog(L_ERROR, "unable to find device %s\n", argv[optind]);
goto out;
}
--
2.35.1
Set the udev rule to call the readahead utility.
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1946283
Signed-off-by: Thiago Becker <[email protected]>
---
.gitignore | 1 +
tools/nfsrahead/99-nfs.rules.in | 1 +
tools/nfsrahead/Makefile.am | 8 ++++++++
3 files changed, 10 insertions(+)
create mode 100644 tools/nfsrahead/99-nfs.rules.in
diff --git a/.gitignore b/.gitignore
index 38ab1d39..df791a83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,6 +62,7 @@ tools/locktest/testlk
tools/getiversion/getiversion
tools/nfsconf/nfsconf
tools/nfsrahead/nfsrahead
+tools/nfsrahead/99-nfs_bdi.rules
support/export/mount.h
support/export/mount_clnt.c
support/export/mount_xdr.c
diff --git a/tools/nfsrahead/99-nfs.rules.in b/tools/nfsrahead/99-nfs.rules.in
new file mode 100644
index 00000000..7d55b407
--- /dev/null
+++ b/tools/nfsrahead/99-nfs.rules.in
@@ -0,0 +1 @@
+SUBSYSTEM=="bdi", ACTION=="add", PROGRAM="_libexecdir_/nfsrahead", ATTR{read_ahead_kb}="%c"
diff --git a/tools/nfsrahead/Makefile.am b/tools/nfsrahead/Makefile.am
index edff7921..58a2ea29 100644
--- a/tools/nfsrahead/Makefile.am
+++ b/tools/nfsrahead/Makefile.am
@@ -1,3 +1,11 @@
libexec_PROGRAMS = nfsrahead
nfsrahead_SOURCES = main.c
+udev_rulesdir = /usr/lib/udev/rules.d/
+udev_rules_DATA = 99-nfs.rules
+
+99-nfs.rules: 99-nfs.rules.in $(builddefs)
+ $(SED) "s|_libexecdir_|@libexecdir@|g" 99-nfs.rules.in > $@
+
+clean-local:
+ $(RM) 99-nfs.rules
--
2.35.1
On 4/1/22 11:32 AM, Thiago Becker wrote:
> Recent changes in the linux kernel caused NFS readahead to default to
> 128 from the previous default of 15 * rsize. This causes performance
> penalties to some read-heavy workloads, which can be fixed by
> tuning the readahead for that given mount.
>
> Specifically, the read troughput on a sec=krb5p mount drops by 50-75%
> when comparing the default readahead with a readahead of 15360.
>
> Previous discussions:
> https://lore.kernel.org/linux-nfs/[email protected]/
> I attempted to add a non-kernel option to mount.nfs, and it was
> rejected.
>
> https://lore.kernel.org/linux-nfs/[email protected]/
> Attempted to add a mount option to the kernel, rejected as well.
>
> I had started a separate tool to set the readahead of BDIs, but the
> scope is specifically for NFS, so I would like to get the community
> feeling for having this in nfs-utils.
>
> This patch series introduces nfs-readahead-udev, a utility to
> automatically set NFS readahead when NFS is mounted. The utility is
> triggered by udev when a new BDI is added, returns to udev the value of
> the readahead that should be used.
>
> The tool currently supports setting read ahead per mountpoint, nfs major
> version, or by a global default value.
>
> v2:
> - explain the motivation
>
> v3:
> - adopt already available facilities
> - nfsrahead is now configured in nfs.conf
>
> v4:
> - retry getting the device if it fails
> - assorted fixes and improvements
>
> Thiago Becker (7):
> Create nfsrahead
> nfsrahead: configure udev
> nfsrahead: only set readahead for nfs devices.
> nfsrahead: add logging
> nfsrahead: get the information from the config file.
> nfsrahead: User documentation
> nfsrahead: retry getting the device if it fails.
>
> .gitignore | 2 +
> configure.ac | 1 +
> systemd/nfs.conf.man | 11 ++
> tools/Makefile.am | 2 +-
> tools/nfsrahead/99-nfs.rules.in | 1 +
> tools/nfsrahead/Makefile.am | 16 +++
> tools/nfsrahead/main.c | 183 ++++++++++++++++++++++++++++++++
> tools/nfsrahead/nfsrahead.man | 72 +++++++++++++
> 8 files changed, 287 insertions(+), 1 deletion(-)
> create mode 100644 tools/nfsrahead/99-nfs.rules.in
> create mode 100644 tools/nfsrahead/Makefile.am
> create mode 100644 tools/nfsrahead/main.c
> create mode 100644 tools/nfsrahead/nfsrahead.man
>
Committed... (tag nfs-utils-2-6-2-rc4)
steved.