From: Mattias Nissler <[email protected]>
For mounts that have the new "nosymfollow" option, don't follow symlinks
when resolving paths. The new option is similar in spirit to the
existing "nodev", "noexec", and "nosuid" options, as well as to the
LOOKUP_NO_SYMLINKS resolve flag in the openat2(2) syscall. Various BSD
variants have been supporting the "nosymfollow" mount option for a long
time with equivalent implementations.
Note that symlinks may still be created on file systems mounted with
the "nosymfollow" option present. readlink() remains functional, so
user space code that is aware of symlinks can still choose to follow
them explicitly.
Setting the "nosymfollow" mount option helps prevent privileged
writers from modifying files unintentionally in case there is an
unexpected link along the accessed path. The "nosymfollow" option is
thus useful as a defensive measure for systems that need to deal with
untrusted file systems in privileged contexts.
More information on the history and motivation for this patch can be
found here:
https://sites.google.com/a/chromium.org/dev/chromium-os/chromiumos-design-docs/hardening-against-malicious-stateful-data#TOC-Restricting-symlink-traversal
Signed-off-by: Mattias Nissler <[email protected]>
Signed-off-by: Ross Zwisler <[email protected]>
---
Changes since v4 [1]:
* Rebased changes on top of the newly merged openat2(2) changes. This
changed the error value when trying to traverse symlinks on a
restricted filesystem from -EACCESS to -ELOOP, which is what
LOOKUP_NO_SYMLINKS uses.
[1]: https://lkml.org/lkml/2020/1/30/933
---
fs/namei.c | 3 ++-
fs/namespace.c | 2 ++
fs/proc_namespace.c | 1 +
fs/statfs.c | 2 ++
include/linux/mount.h | 3 ++-
include/linux/statfs.h | 1 +
include/uapi/linux/mount.h | 1 +
7 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 4167109297e0f..93af3d4353543 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1122,7 +1122,8 @@ const char *get_link(struct nameidata *nd)
int error;
const char *res;
- if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
+ if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
+ unlikely(nd->path.mnt->mnt_flags & MNT_NOSYMFOLLOW))
return ERR_PTR(-ELOOP);
if (!(nd->flags & LOOKUP_RCU)) {
diff --git a/fs/namespace.c b/fs/namespace.c
index 5e1bf611a9eb6..240421e02940d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3109,6 +3109,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
+ if (flags & MS_NOSYMFOLLOW)
+ mnt_flags |= MNT_NOSYMFOLLOW;
/* The default atime for remount is preservation */
if ((flags & MS_REMOUNT) &&
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 273ee82d8aa97..91a552f617406 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -70,6 +70,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
{ MNT_NOATIME, ",noatime" },
{ MNT_NODIRATIME, ",nodiratime" },
{ MNT_RELATIME, ",relatime" },
+ { MNT_NOSYMFOLLOW, ",nosymfollow" },
{ 0, NULL }
};
const struct proc_fs_info *fs_infop;
diff --git a/fs/statfs.c b/fs/statfs.c
index 2616424012ea7..59f33752c1311 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -29,6 +29,8 @@ static int flags_by_mnt(int mnt_flags)
flags |= ST_NODIRATIME;
if (mnt_flags & MNT_RELATIME)
flags |= ST_RELATIME;
+ if (mnt_flags & MNT_NOSYMFOLLOW)
+ flags |= ST_NOSYMFOLLOW;
return flags;
}
diff --git a/include/linux/mount.h b/include/linux/mount.h
index bf8cc4108b8f9..ff2d132c21f5d 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,6 +30,7 @@ struct fs_context;
#define MNT_NODIRATIME 0x10
#define MNT_RELATIME 0x20
#define MNT_READONLY 0x40 /* does the user want this to be r/o? */
+#define MNT_NOSYMFOLLOW 0x80
#define MNT_SHRINKABLE 0x100
#define MNT_WRITE_HOLD 0x200
@@ -46,7 +47,7 @@ struct fs_context;
#define MNT_SHARED_MASK (MNT_UNBINDABLE)
#define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
| MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
- | MNT_READONLY)
+ | MNT_READONLY | MNT_NOSYMFOLLOW)
#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
diff --git a/include/linux/statfs.h b/include/linux/statfs.h
index 9bc69edb8f188..fac4356ea1bfc 100644
--- a/include/linux/statfs.h
+++ b/include/linux/statfs.h
@@ -40,6 +40,7 @@ struct kstatfs {
#define ST_NOATIME 0x0400 /* do not update access times */
#define ST_NODIRATIME 0x0800 /* do not update directory access times */
#define ST_RELATIME 0x1000 /* update atime relative to mtime/ctime */
+#define ST_NOSYMFOLLOW 0x2000 /* do not follow symlinks */
struct dentry;
extern int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid);
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 96a0240f23fed..c268ea586dbf4 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -34,6 +34,7 @@
#define MS_I_VERSION (1<<23) /* Update inode I_version field */
#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
+#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
/* These sb flags are internal to the kernel */
#define MS_SUBMOUNT (1<<26)
--
2.25.0.341.g760bfbb309-goog
> --- a/include/uapi/linux/mount.h
> +++ b/include/uapi/linux/mount.h
> @@ -34,6 +34,7 @@
> #define MS_I_VERSION (1<<23) /* Update inode I_version field */
> #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> +#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
Doesn't this conflict with MS_SUBMOUNT below?
>
> /* These sb flags are internal to the kernel */
> #define MS_SUBMOUNT (1<<26)
On Tue, Feb 4, 2020 at 2:53 PM Raul Rangel <[email protected]> wrote:
> > --- a/include/uapi/linux/mount.h
> > +++ b/include/uapi/linux/mount.h
> > @@ -34,6 +34,7 @@
> > #define MS_I_VERSION (1<<23) /* Update inode I_version field */
> > #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> > #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > +#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
> Doesn't this conflict with MS_SUBMOUNT below?
> >
> > /* These sb flags are internal to the kernel */
> > #define MS_SUBMOUNT (1<<26)
Yep. Thanks for the catch, v6 on it's way.
On Tue, Feb 4, 2020 at 3:11 PM Ross Zwisler <[email protected]> wrote:
> On Tue, Feb 4, 2020 at 2:53 PM Raul Rangel <[email protected]> wrote:
> > > --- a/include/uapi/linux/mount.h
> > > +++ b/include/uapi/linux/mount.h
> > > @@ -34,6 +34,7 @@
> > > #define MS_I_VERSION (1<<23) /* Update inode I_version field */
> > > #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> > > #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > > +#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
> > Doesn't this conflict with MS_SUBMOUNT below?
> > >
> > > /* These sb flags are internal to the kernel */
> > > #define MS_SUBMOUNT (1<<26)
>
> Yep. Thanks for the catch, v6 on it's way.
It actually looks like most of the flags which are internal to the
kernel are actually unused (MS_SUBMOUNT, MS_NOREMOTELOCK, MS_NOSEC,
MS_BORN and MS_ACTIVE). Several are unused completely, and the rest
are just part of the AA_MS_IGNORE_MASK which masks them off in the
apparmor LSM, but I'm pretty sure they couldn't have been set anyway.
I'll just take over (1<<26) for MS_NOSYMFOLLOW, and remove the rest in
a second patch.
If someone thinks these flags are actually used by something and I'm
just missing it, please let me know.
On Tue, Feb 04, 2020 at 04:49:48PM -0700, Ross Zwisler wrote:
> On Tue, Feb 4, 2020 at 3:11 PM Ross Zwisler <[email protected]> wrote:
> > On Tue, Feb 4, 2020 at 2:53 PM Raul Rangel <[email protected]> wrote:
> > > > --- a/include/uapi/linux/mount.h
> > > > +++ b/include/uapi/linux/mount.h
> > > > @@ -34,6 +34,7 @@
> > > > #define MS_I_VERSION (1<<23) /* Update inode I_version field */
> > > > #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> > > > #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > > > +#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
> > > Doesn't this conflict with MS_SUBMOUNT below?
> > > >
> > > > /* These sb flags are internal to the kernel */
> > > > #define MS_SUBMOUNT (1<<26)
> >
> > Yep. Thanks for the catch, v6 on it's way.
>
> It actually looks like most of the flags which are internal to the
> kernel are actually unused (MS_SUBMOUNT, MS_NOREMOTELOCK, MS_NOSEC,
> MS_BORN and MS_ACTIVE). Several are unused completely, and the rest
> are just part of the AA_MS_IGNORE_MASK which masks them off in the
> apparmor LSM, but I'm pretty sure they couldn't have been set anyway.
>
> I'll just take over (1<<26) for MS_NOSYMFOLLOW, and remove the rest in
> a second patch.
>
> If someone thinks these flags are actually used by something and I'm
> just missing it, please let me know.
Afraid you did miss it ...
/*
* sb->s_flags. Note that these mirror the equivalent MS_* flags where
* represented in both.
*/
...
#define SB_SUBMOUNT (1<<26)
It's not entirely clear to me why they need to be the same, but I haven't
been paying close attention to the separation of superblock and mount
flags, so someone else can probably explain the why of it.
On 2020-02-04, Matthew Wilcox <[email protected]> wrote:
> On Tue, Feb 04, 2020 at 04:49:48PM -0700, Ross Zwisler wrote:
> > On Tue, Feb 4, 2020 at 3:11 PM Ross Zwisler <[email protected]> wrote:
> > > On Tue, Feb 4, 2020 at 2:53 PM Raul Rangel <[email protected]> wrote:
> > > > > --- a/include/uapi/linux/mount.h
> > > > > +++ b/include/uapi/linux/mount.h
> > > > > @@ -34,6 +34,7 @@
> > > > > #define MS_I_VERSION (1<<23) /* Update inode I_version field */
> > > > > #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> > > > > #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > > > > +#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
> > > > Doesn't this conflict with MS_SUBMOUNT below?
> > > > >
> > > > > /* These sb flags are internal to the kernel */
> > > > > #define MS_SUBMOUNT (1<<26)
> > >
> > > Yep. Thanks for the catch, v6 on it's way.
> >
> > It actually looks like most of the flags which are internal to the
> > kernel are actually unused (MS_SUBMOUNT, MS_NOREMOTELOCK, MS_NOSEC,
> > MS_BORN and MS_ACTIVE). Several are unused completely, and the rest
> > are just part of the AA_MS_IGNORE_MASK which masks them off in the
> > apparmor LSM, but I'm pretty sure they couldn't have been set anyway.
> >
> > I'll just take over (1<<26) for MS_NOSYMFOLLOW, and remove the rest in
> > a second patch.
> >
> > If someone thinks these flags are actually used by something and I'm
> > just missing it, please let me know.
>
> Afraid you did miss it ...
>
> /*
> * sb->s_flags. Note that these mirror the equivalent MS_* flags where
> * represented in both.
> */
> ...
> #define SB_SUBMOUNT (1<<26)
>
> It's not entirely clear to me why they need to be the same, but I haven't
> been paying close attention to the separation of superblock and mount
> flags, so someone else can probably explain the why of it.
I could be wrong, but I believe this is historic and originates from the
kernel setting certain flags internally (similar to the whole O_* flag,
"internal" O_* flag, and FMODE_NOTIFY mixup).
Also, one of the arguments for the new mount API was that we'd run out
MS_* bits so it's possible that you have to enable this new mount option
in the new mount API only. (Though Howells is the right person to talk
to on this point.)
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
On Tue, Feb 4, 2020 at 8:45 PM Aleksa Sarai <[email protected]> wrote:
> On 2020-02-04, Matthew Wilcox <[email protected]> wrote:
> > On Tue, Feb 04, 2020 at 04:49:48PM -0700, Ross Zwisler wrote:
> > > On Tue, Feb 4, 2020 at 3:11 PM Ross Zwisler <[email protected]> wrote:
> > > > On Tue, Feb 4, 2020 at 2:53 PM Raul Rangel <[email protected]> wrote:
> > > > > > --- a/include/uapi/linux/mount.h
> > > > > > +++ b/include/uapi/linux/mount.h
> > > > > > @@ -34,6 +34,7 @@
> > > > > > #define MS_I_VERSION (1<<23) /* Update inode I_version field */
> > > > > > #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> > > > > > #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > > > > > +#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
> > > > > Doesn't this conflict with MS_SUBMOUNT below?
> > > > > >
> > > > > > /* These sb flags are internal to the kernel */
> > > > > > #define MS_SUBMOUNT (1<<26)
> > > >
> > > > Yep. Thanks for the catch, v6 on it's way.
> > >
> > > It actually looks like most of the flags which are internal to the
> > > kernel are actually unused (MS_SUBMOUNT, MS_NOREMOTELOCK, MS_NOSEC,
> > > MS_BORN and MS_ACTIVE). Several are unused completely, and the rest
> > > are just part of the AA_MS_IGNORE_MASK which masks them off in the
> > > apparmor LSM, but I'm pretty sure they couldn't have been set anyway.
> > >
> > > I'll just take over (1<<26) for MS_NOSYMFOLLOW, and remove the rest in
> > > a second patch.
> > >
> > > If someone thinks these flags are actually used by something and I'm
> > > just missing it, please let me know.
> >
> > Afraid you did miss it ...
> >
> > /*
> > * sb->s_flags. Note that these mirror the equivalent MS_* flags where
> > * represented in both.
> > */
> > ...
> > #define SB_SUBMOUNT (1<<26)
> >
> > It's not entirely clear to me why they need to be the same, but I haven't
> > been paying close attention to the separation of superblock and mount
> > flags, so someone else can probably explain the why of it.
>
> I could be wrong, but I believe this is historic and originates from the
> kernel setting certain flags internally (similar to the whole O_* flag,
> "internal" O_* flag, and FMODE_NOTIFY mixup).
>
> Also, one of the arguments for the new mount API was that we'd run out
> MS_* bits so it's possible that you have to enable this new mount option
> in the new mount API only. (Though Howells is the right person to talk
> to on this point.)
As far as I can tell, SB_SUBMOUNT doesn't actually have any dependence on
MS_SUBMOUNT. Nothing ever sets or checks MS_SUBMOUNT from within the kernel,
and whether or not it's set from userspace has no bearing on how SB_SUBMOUNT
is used. SB_SUBMOUNT is set independently inside of the kernel in
vfs_submount().
I agree that their association seems to be historical, introduced in this
commit from David Howells:
e462ec50cb5fa VFS: Differentiate mount flags (MS_*) from internal superblock flags
In that commit message David notes:
(1) Some MS_* flags get translated to MNT_* flags (such as MS_NODEV ->
MNT_NODEV) without passing this on to the filesystem, but some
filesystems set such flags anyway.
I think this is sort of what we are trying to do with MS_NOSYMFOLLOW: have a
userspace flag that translates to MNT_NOSYMFOLLOW, but which doesn't need an
associated SB_* flag. Is it okay to reclaim the bit currently owned by
MS_SUBMOUNT and use it for MS_NOSYMFOLLOW.
A second option would be to choose one of the unused MS_* values from the
middle of the range, such as 256 or 512. Looking back as far as git will let
me, I don't think that these flags have been used for MS_* values at least
since v2.6.12:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/include/linux/fs.h?id=1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
I think maybe these used to be S_WRITE and S_APPEND, which weren't filesystem
mount flags?
https://sites.uclouvain.be/SystInfo/usr/include/sys/mount.h.html
A third option would be to create this flag using the new mount system:
https://lwn.net/Articles/753473/
https://lwn.net/Articles/759499/
My main concern with this option is that for Chrome OS we'd like to be able to
backport whatever solution we come up with to a variety of older kernels, and
if we go with the new mount system this would require us to backport the
entire new mount system to those kernels, which I think is infeasible.
David, what are your thoughts on this? Of these three options for supporting
a new MS_NOSYMFOLLOW flag:
1) reclaim the bit currently used by MS_SUBMOUNT
2) use a smaller unused value for the flag, 256 or 512
3) implement the new flag only in the new mount system
do you think either #1 or #2 are workable? If so, which would you prefer?
On Thu, Feb 06, 2020 at 12:10:45PM -0700, Ross Zwisler wrote:
> On Tue, Feb 4, 2020 at 8:45 PM Aleksa Sarai <[email protected]> wrote:
> > On 2020-02-04, Matthew Wilcox <[email protected]> wrote:
> > > On Tue, Feb 04, 2020 at 04:49:48PM -0700, Ross Zwisler wrote:
> > > > On Tue, Feb 4, 2020 at 3:11 PM Ross Zwisler <[email protected]> wrote:
> > > > > On Tue, Feb 4, 2020 at 2:53 PM Raul Rangel <[email protected]> wrote:
> > > > > > > --- a/include/uapi/linux/mount.h
> > > > > > > +++ b/include/uapi/linux/mount.h
> > > > > > > @@ -34,6 +34,7 @@
> > > > > > > #define MS_I_VERSION (1<<23) /* Update inode I_version field */
> > > > > > > #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> > > > > > > #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > > > > > > +#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
> > > > > > Doesn't this conflict with MS_SUBMOUNT below?
> > > > > > >
> > > > > > > /* These sb flags are internal to the kernel */
> > > > > > > #define MS_SUBMOUNT (1<<26)
> > > > >
> > > > > Yep. Thanks for the catch, v6 on it's way.
> > > >
> > > > It actually looks like most of the flags which are internal to the
> > > > kernel are actually unused (MS_SUBMOUNT, MS_NOREMOTELOCK, MS_NOSEC,
> > > > MS_BORN and MS_ACTIVE). Several are unused completely, and the rest
> > > > are just part of the AA_MS_IGNORE_MASK which masks them off in the
> > > > apparmor LSM, but I'm pretty sure they couldn't have been set anyway.
> > > >
> > > > I'll just take over (1<<26) for MS_NOSYMFOLLOW, and remove the rest in
> > > > a second patch.
> > > >
> > > > If someone thinks these flags are actually used by something and I'm
> > > > just missing it, please let me know.
> > >
> > > Afraid you did miss it ...
> > >
> > > /*
> > > * sb->s_flags. Note that these mirror the equivalent MS_* flags where
> > > * represented in both.
> > > */
> > > ...
> > > #define SB_SUBMOUNT (1<<26)
> > >
> > > It's not entirely clear to me why they need to be the same, but I haven't
> > > been paying close attention to the separation of superblock and mount
> > > flags, so someone else can probably explain the why of it.
> >
> > I could be wrong, but I believe this is historic and originates from the
> > kernel setting certain flags internally (similar to the whole O_* flag,
> > "internal" O_* flag, and FMODE_NOTIFY mixup).
> >
> > Also, one of the arguments for the new mount API was that we'd run out
> > MS_* bits so it's possible that you have to enable this new mount option
> > in the new mount API only. (Though Howells is the right person to talk
> > to on this point.)
>
> As far as I can tell, SB_SUBMOUNT doesn't actually have any dependence on
> MS_SUBMOUNT. Nothing ever sets or checks MS_SUBMOUNT from within the kernel,
> and whether or not it's set from userspace has no bearing on how SB_SUBMOUNT
> is used. SB_SUBMOUNT is set independently inside of the kernel in
> vfs_submount().
>
> I agree that their association seems to be historical, introduced in this
> commit from David Howells:
>
> e462ec50cb5fa VFS: Differentiate mount flags (MS_*) from internal superblock flags
>
> In that commit message David notes:
>
> (1) Some MS_* flags get translated to MNT_* flags (such as MS_NODEV ->
> MNT_NODEV) without passing this on to the filesystem, but some
> filesystems set such flags anyway.
>
> I think this is sort of what we are trying to do with MS_NOSYMFOLLOW: have a
> userspace flag that translates to MNT_NOSYMFOLLOW, but which doesn't need an
> associated SB_* flag. Is it okay to reclaim the bit currently owned by
> MS_SUBMOUNT and use it for MS_NOSYMFOLLOW.
>
> A second option would be to choose one of the unused MS_* values from the
> middle of the range, such as 256 or 512. Looking back as far as git will let
> me, I don't think that these flags have been used for MS_* values at least
> since v2.6.12:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/include/linux/fs.h?id=1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
>
> I think maybe these used to be S_WRITE and S_APPEND, which weren't filesystem
> mount flags?
>
> https://sites.uclouvain.be/SystInfo/usr/include/sys/mount.h.html
>
> A third option would be to create this flag using the new mount system:
>
> https://lwn.net/Articles/753473/
> https://lwn.net/Articles/759499/
>
> My main concern with this option is that for Chrome OS we'd like to be able to
> backport whatever solution we come up with to a variety of older kernels, and
> if we go with the new mount system this would require us to backport the
> entire new mount system to those kernels, which I think is infeasible.
>
> David, what are your thoughts on this? Of these three options for supporting
> a new MS_NOSYMFOLLOW flag:
>
> 1) reclaim the bit currently used by MS_SUBMOUNT
> 2) use a smaller unused value for the flag, 256 or 512
> 3) implement the new flag only in the new mount system
>
> do you think either #1 or #2 are workable? If so, which would you prefer?
Gentle ping on this - do either of the options using the existing mount API
seem possible? Would it be useful for me to send out example patches in one
of those directions? Or is it out of the question, and I should spend my time
on making patches using the new mount system? Thanks!
On 2020-02-13, Ross Zwisler <[email protected]> wrote:
> On Thu, Feb 06, 2020 at 12:10:45PM -0700, Ross Zwisler wrote:
> > On Tue, Feb 4, 2020 at 8:45 PM Aleksa Sarai <[email protected]> wrote:
> > > On 2020-02-04, Matthew Wilcox <[email protected]> wrote:
> > > > On Tue, Feb 04, 2020 at 04:49:48PM -0700, Ross Zwisler wrote:
> > > > > On Tue, Feb 4, 2020 at 3:11 PM Ross Zwisler <[email protected]> wrote:
> > > > > > On Tue, Feb 4, 2020 at 2:53 PM Raul Rangel <[email protected]> wrote:
> > > > > > > > --- a/include/uapi/linux/mount.h
> > > > > > > > +++ b/include/uapi/linux/mount.h
> > > > > > > > @@ -34,6 +34,7 @@
> > > > > > > > #define MS_I_VERSION (1<<23) /* Update inode I_version field */
> > > > > > > > #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> > > > > > > > #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > > > > > > > +#define MS_NOSYMFOLLOW (1<<26) /* Do not follow symlinks */
> > > > > > > Doesn't this conflict with MS_SUBMOUNT below?
> > > > > > > >
> > > > > > > > /* These sb flags are internal to the kernel */
> > > > > > > > #define MS_SUBMOUNT (1<<26)
> > > > > >
> > > > > > Yep. Thanks for the catch, v6 on it's way.
> > > > >
> > > > > It actually looks like most of the flags which are internal to the
> > > > > kernel are actually unused (MS_SUBMOUNT, MS_NOREMOTELOCK, MS_NOSEC,
> > > > > MS_BORN and MS_ACTIVE). Several are unused completely, and the rest
> > > > > are just part of the AA_MS_IGNORE_MASK which masks them off in the
> > > > > apparmor LSM, but I'm pretty sure they couldn't have been set anyway.
> > > > >
> > > > > I'll just take over (1<<26) for MS_NOSYMFOLLOW, and remove the rest in
> > > > > a second patch.
> > > > >
> > > > > If someone thinks these flags are actually used by something and I'm
> > > > > just missing it, please let me know.
> > > >
> > > > Afraid you did miss it ...
> > > >
> > > > /*
> > > > * sb->s_flags. Note that these mirror the equivalent MS_* flags where
> > > > * represented in both.
> > > > */
> > > > ...
> > > > #define SB_SUBMOUNT (1<<26)
> > > >
> > > > It's not entirely clear to me why they need to be the same, but I haven't
> > > > been paying close attention to the separation of superblock and mount
> > > > flags, so someone else can probably explain the why of it.
> > >
> > > I could be wrong, but I believe this is historic and originates from the
> > > kernel setting certain flags internally (similar to the whole O_* flag,
> > > "internal" O_* flag, and FMODE_NOTIFY mixup).
> > >
> > > Also, one of the arguments for the new mount API was that we'd run out
> > > MS_* bits so it's possible that you have to enable this new mount option
> > > in the new mount API only. (Though Howells is the right person to talk
> > > to on this point.)
> >
> > As far as I can tell, SB_SUBMOUNT doesn't actually have any dependence on
> > MS_SUBMOUNT. Nothing ever sets or checks MS_SUBMOUNT from within the kernel,
> > and whether or not it's set from userspace has no bearing on how SB_SUBMOUNT
> > is used. SB_SUBMOUNT is set independently inside of the kernel in
> > vfs_submount().
> >
> > I agree that their association seems to be historical, introduced in this
> > commit from David Howells:
> >
> > e462ec50cb5fa VFS: Differentiate mount flags (MS_*) from internal superblock flags
> >
> > In that commit message David notes:
> >
> > (1) Some MS_* flags get translated to MNT_* flags (such as MS_NODEV ->
> > MNT_NODEV) without passing this on to the filesystem, but some
> > filesystems set such flags anyway.
> >
> > I think this is sort of what we are trying to do with MS_NOSYMFOLLOW: have a
> > userspace flag that translates to MNT_NOSYMFOLLOW, but which doesn't need an
> > associated SB_* flag. Is it okay to reclaim the bit currently owned by
> > MS_SUBMOUNT and use it for MS_NOSYMFOLLOW.
> >
> > A second option would be to choose one of the unused MS_* values from the
> > middle of the range, such as 256 or 512. Looking back as far as git will let
> > me, I don't think that these flags have been used for MS_* values at least
> > since v2.6.12:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/include/linux/fs.h?id=1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
> >
> > I think maybe these used to be S_WRITE and S_APPEND, which weren't filesystem
> > mount flags?
> >
> > https://sites.uclouvain.be/SystInfo/usr/include/sys/mount.h.html
> >
> > A third option would be to create this flag using the new mount system:
> >
> > https://lwn.net/Articles/753473/
> > https://lwn.net/Articles/759499/
> >
> > My main concern with this option is that for Chrome OS we'd like to be able to
> > backport whatever solution we come up with to a variety of older kernels, and
> > if we go with the new mount system this would require us to backport the
> > entire new mount system to those kernels, which I think is infeasible.
> >
> > David, what are your thoughts on this? Of these three options for supporting
> > a new MS_NOSYMFOLLOW flag:
> >
> > 1) reclaim the bit currently used by MS_SUBMOUNT
> > 2) use a smaller unused value for the flag, 256 or 512
> > 3) implement the new flag only in the new mount system
> >
> > do you think either #1 or #2 are workable? If so, which would you prefer?
>
> Gentle ping on this - do either of the options using the existing mount API
> seem possible? Would it be useful for me to send out example patches in one
> of those directions? Or is it out of the question, and I should spend my time
> on making patches using the new mount system? Thanks!
I think (1) or (2) sound reasonable, but I'm not really the right person
to ask.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
On Fri, Feb 21, 2020 at 12:21:42PM +1100, Aleksa Sarai wrote:
> On 2020-02-13, Ross Zwisler <[email protected]> wrote:
> > On Thu, Feb 06, 2020 at 12:10:45PM -0700, Ross Zwisler wrote:
<>
> > > As far as I can tell, SB_SUBMOUNT doesn't actually have any dependence on
> > > MS_SUBMOUNT. Nothing ever sets or checks MS_SUBMOUNT from within the kernel,
> > > and whether or not it's set from userspace has no bearing on how SB_SUBMOUNT
> > > is used. SB_SUBMOUNT is set independently inside of the kernel in
> > > vfs_submount().
> > >
> > > I agree that their association seems to be historical, introduced in this
> > > commit from David Howells:
> > >
> > > e462ec50cb5fa VFS: Differentiate mount flags (MS_*) from internal superblock flags
> > >
> > > In that commit message David notes:
> > >
> > > (1) Some MS_* flags get translated to MNT_* flags (such as MS_NODEV ->
> > > MNT_NODEV) without passing this on to the filesystem, but some
> > > filesystems set such flags anyway.
> > >
> > > I think this is sort of what we are trying to do with MS_NOSYMFOLLOW: have a
> > > userspace flag that translates to MNT_NOSYMFOLLOW, but which doesn't need an
> > > associated SB_* flag. Is it okay to reclaim the bit currently owned by
> > > MS_SUBMOUNT and use it for MS_NOSYMFOLLOW.
> > >
> > > A second option would be to choose one of the unused MS_* values from the
> > > middle of the range, such as 256 or 512. Looking back as far as git will let
> > > me, I don't think that these flags have been used for MS_* values at least
> > > since v2.6.12:
> > >
> > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/include/linux/fs.h?id=1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
> > >
> > > I think maybe these used to be S_WRITE and S_APPEND, which weren't filesystem
> > > mount flags?
> > >
> > > https://sites.uclouvain.be/SystInfo/usr/include/sys/mount.h.html
> > >
> > > A third option would be to create this flag using the new mount system:
> > >
> > > https://lwn.net/Articles/753473/
> > > https://lwn.net/Articles/759499/
> > >
> > > My main concern with this option is that for Chrome OS we'd like to be able to
> > > backport whatever solution we come up with to a variety of older kernels, and
> > > if we go with the new mount system this would require us to backport the
> > > entire new mount system to those kernels, which I think is infeasible.
> > >
> > > David, what are your thoughts on this? Of these three options for supporting
> > > a new MS_NOSYMFOLLOW flag:
> > >
> > > 1) reclaim the bit currently used by MS_SUBMOUNT
> > > 2) use a smaller unused value for the flag, 256 or 512
> > > 3) implement the new flag only in the new mount system
> > >
> > > do you think either #1 or #2 are workable? If so, which would you prefer?
> >
> > Gentle ping on this - do either of the options using the existing mount API
> > seem possible? Would it be useful for me to send out example patches in one
> > of those directions? Or is it out of the question, and I should spend my time
> > on making patches using the new mount system? Thanks!
>
> I think (1) or (2) sound reasonable, but I'm not really the right person
> to ask.
Cool, I appreciate the feedback. :) I'll go ahead and implement #2 and send
it out, along with example man page updates.