LinuxLists.cc - [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

2024-05-20 21:36:35

Subject: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

Now that we have stabilised the unique 64-bit mount ID interface in
statx, we can now provide a race-free way for name_to_handle_at(2) to
provide a file handle and corresponding mount without needing to worry
about racing with /proc/mountinfo parsing.

As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
that doesn't make sense for name_to_handle_at(2).

Signed-off-by: Aleksa Sarai <[email protected]>
---
fs/fhandle.c | 27 +++++++++++++++++++--------
include/uapi/linux/fcntl.h | 2 ++
2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 8a7f86c2139a..6bc7ffccff8c 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -16,7 +16,8 @@

static long do_sys_name_to_handle(const struct path *path,
struct file_handle __user *ufh,
- int __user *mnt_id, int fh_flags)
+ void __user *mnt_id, bool unique_mntid,
+ int fh_flags)
{
long retval;
struct file_handle f_handle;
@@ -69,10 +70,16 @@ static long do_sys_name_to_handle(const struct path *path,
} else
retval = 0;
/* copy the mount id */
- if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
- copy_to_user(ufh, handle,
- struct_size(handle, f_handle, handle_bytes)))
- retval = -EFAULT;
+ if (unique_mntid)
+ retval = put_user(real_mount(path->mnt)->mnt_id_unique,
+ (u64 __user *) mnt_id);
+ else
+ retval = put_user(real_mount(path->mnt)->mnt_id,
+ (int __user *) mnt_id);
+ /* copy the handle */
+ if (!retval)
+ retval = copy_to_user(ufh, handle,
+ struct_size(handle, f_handle, handle_bytes));
kfree(handle);
return retval;
}
@@ -83,6 +90,7 @@ static long do_sys_name_to_handle(const struct path *path,
* @name: name that should be converted to handle.
* @handle: resulting file handle
* @mnt_id: mount id of the file system containing the file
+ * (u64 if AT_HANDLE_UNIQUE_MNT_ID, otherwise int)
* @flag: flag value to indicate whether to follow symlink or not
* and whether a decodable file handle is required.
*
@@ -92,7 +100,7 @@ static long do_sys_name_to_handle(const struct path *path,
* value required.
*/
SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
- struct file_handle __user *, handle, int __user *, mnt_id,
+ struct file_handle __user *, handle, void __user *, mnt_id,
int, flag)
{
struct path path;
@@ -100,7 +108,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
int fh_flags;
int err;

- if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID))
+ if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
+ AT_HANDLE_UNIQUE_MNT_ID))
return -EINVAL;

lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
@@ -109,7 +118,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
lookup_flags |= LOOKUP_EMPTY;
err = user_path_at(dfd, name, lookup_flags, &path);
if (!err) {
- err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags);
+ err = do_sys_name_to_handle(&path, handle, mnt_id,
+ flag & AT_HANDLE_UNIQUE_MNT_ID,
+ fh_flags);
path_put(&path);
}
return err;
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index c0bcc185fa48..fda970f92fba 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -118,6 +118,8 @@
#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to
compare object identity and may not
be usable to open_by_handle_at(2) */
+#define AT_HANDLE_UNIQUE_MNT_ID AT_STATX_FORCE_SYNC /* returned mount id is
+ the u64 unique mount id */
#if defined(__KERNEL__)
#define AT_GETATTR_NOSEC 0x80000000
#endif

---
base-commit: 584bbf439d0fa83d728ec49f3a38c581bdc828b4
change-id: 20240515-exportfs-u64-mount-id-9ebb5c58b53c

Best regards,
--
Aleksa Sarai <[email protected]>

2024-05-20 21:53:20

by Jeff Layton

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On Mon, 2024-05-20 at 17:35 -0400, Aleksa Sarai wrote:
> Now that we have stabilised the unique 64-bit mount ID interface in
> statx, we can now provide a race-free way for name_to_handle_at(2) to
> provide a file handle and corresponding mount without needing to worry
> about racing with /proc/mountinfo parsing.
>
> As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> that doesn't make sense for name_to_handle_at(2).
>
> Signed-off-by: Aleksa Sarai <[email protected]>
> ---
> fs/fhandle.c | 27 +++++++++++++++++++--------
> include/uapi/linux/fcntl.h | 2 ++
> 2 files changed, 21 insertions(+), 8 deletions(-)
>
> diff --git a/fs/fhandle.c b/fs/fhandle.c
> index 8a7f86c2139a..6bc7ffccff8c 100644
> --- a/fs/fhandle.c
> +++ b/fs/fhandle.c
> @@ -16,7 +16,8 @@
>
> static long do_sys_name_to_handle(const struct path *path,
> struct file_handle __user *ufh,
> - int __user *mnt_id, int fh_flags)
> + void __user *mnt_id, bool unique_mntid,
> + int fh_flags)
> {
> long retval;
> struct file_handle f_handle;
> @@ -69,10 +70,16 @@ static long do_sys_name_to_handle(const struct path *path,
> } else
> retval = 0;
> /* copy the mount id */
> - if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
> - copy_to_user(ufh, handle,
> - struct_size(handle, f_handle, handle_bytes)))
> - retval = -EFAULT;
> + if (unique_mntid)
> + retval = put_user(real_mount(path->mnt)->mnt_id_unique,
> + (u64 __user *) mnt_id);
> + else
> + retval = put_user(real_mount(path->mnt)->mnt_id,
> + (int __user *) mnt_id);
> + /* copy the handle */
> + if (!retval)
> + retval = copy_to_user(ufh, handle,
> + struct_size(handle, f_handle, handle_bytes));
> kfree(handle);
> return retval;
> }
> @@ -83,6 +90,7 @@ static long do_sys_name_to_handle(const struct path *path,
> * @name: name that should be converted to handle.
> * @handle: resulting file handle
> * @mnt_id: mount id of the file system containing the file
> + * (u64 if AT_HANDLE_UNIQUE_MNT_ID, otherwise int)
> * @flag: flag value to indicate whether to follow symlink or not
> * and whether a decodable file handle is required.
> *
> @@ -92,7 +100,7 @@ static long do_sys_name_to_handle(const struct path *path,
> * value required.
> */
> SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
> - struct file_handle __user *, handle, int __user *, mnt_id,
> + struct file_handle __user *, handle, void __user *, mnt_id,
>

Changing the syscall signature like this is rather nasty. The new flag
seems like it should safely gate the difference, but I still have some
concerns about misuse and people passing in too small a buffer for the
mnt_id.

> int, flag)
> {
> struct path path;
> @@ -100,7 +108,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
> int fh_flags;
> int err;
>
> - if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID))
> + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
> + AT_HANDLE_UNIQUE_MNT_ID))
> return -EINVAL;
>
> lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
> @@ -109,7 +118,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
> lookup_flags |= LOOKUP_EMPTY;
> err = user_path_at(dfd, name, lookup_flags, &path);
> if (!err) {
> - err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags);
> + err = do_sys_name_to_handle(&path, handle, mnt_id,
> + flag & AT_HANDLE_UNIQUE_MNT_ID,
> + fh_flags);
> path_put(&path);
> }
> return err;
> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> index c0bcc185fa48..fda970f92fba 100644
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> @@ -118,6 +118,8 @@
> #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to
> compare object identity and may not
> be usable to open_by_handle_at(2) */
> +#define AT_HANDLE_UNIQUE_MNT_ID AT_STATX_FORCE_SYNC /* returned mount id is
> + the u64 unique mount id */
> #if defined(__KERNEL__)
> #define AT_GETATTR_NOSEC 0x80000000
> #endif
>
> ---
> base-commit: 584bbf439d0fa83d728ec49f3a38c581bdc828b4
> change-id: 20240515-exportfs-u64-mount-id-9ebb5c58b53c
>
> Best regards,

--
Jeff Layton <[email protected]>

2024-05-20 22:35:36

by Aleksa Sarai

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On 2024-05-20, Jeff Layton <[email protected]> wrote:
> On Mon, 2024-05-20 at 17:35 -0400, Aleksa Sarai wrote:
> > Now that we have stabilised the unique 64-bit mount ID interface in
> > statx, we can now provide a race-free way for name_to_handle_at(2) to
> > provide a file handle and corresponding mount without needing to worry
> > about racing with /proc/mountinfo parsing.
> >
> > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> > that doesn't make sense for name_to_handle_at(2).
> >
> > Signed-off-by: Aleksa Sarai <[email protected]>
> > ---
> > fs/fhandle.c | 27 +++++++++++++++++++--------
> > include/uapi/linux/fcntl.h | 2 ++
> > 2 files changed, 21 insertions(+), 8 deletions(-)
> >
> > diff --git a/fs/fhandle.c b/fs/fhandle.c
> > index 8a7f86c2139a..6bc7ffccff8c 100644
> > --- a/fs/fhandle.c
> > +++ b/fs/fhandle.c
> > @@ -16,7 +16,8 @@
> >
> > static long do_sys_name_to_handle(const struct path *path,
> > struct file_handle __user *ufh,
> > - int __user *mnt_id, int fh_flags)
> > + void __user *mnt_id, bool unique_mntid,
> > + int fh_flags)
> > {
> > long retval;
> > struct file_handle f_handle;
> > @@ -69,10 +70,16 @@ static long do_sys_name_to_handle(const struct path *path,
> > } else
> > retval = 0;
> > /* copy the mount id */
> > - if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
> > - copy_to_user(ufh, handle,
> > - struct_size(handle, f_handle, handle_bytes)))
> > - retval = -EFAULT;
> > + if (unique_mntid)
> > + retval = put_user(real_mount(path->mnt)->mnt_id_unique,
> > + (u64 __user *) mnt_id);
> > + else
> > + retval = put_user(real_mount(path->mnt)->mnt_id,
> > + (int __user *) mnt_id);
> > + /* copy the handle */
> > + if (!retval)
> > + retval = copy_to_user(ufh, handle,
> > + struct_size(handle, f_handle, handle_bytes));
> > kfree(handle);
> > return retval;
> > }
> > @@ -83,6 +90,7 @@ static long do_sys_name_to_handle(const struct path *path,
> > * @name: name that should be converted to handle.
> > * @handle: resulting file handle
> > * @mnt_id: mount id of the file system containing the file
> > + * (u64 if AT_HANDLE_UNIQUE_MNT_ID, otherwise int)
> > * @flag: flag value to indicate whether to follow symlink or not
> > * and whether a decodable file handle is required.
> > *
> > @@ -92,7 +100,7 @@ static long do_sys_name_to_handle(const struct path *path,
> > * value required.
> > */
> > SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
> > - struct file_handle __user *, handle, int __user *, mnt_id,
> > + struct file_handle __user *, handle, void __user *, mnt_id,
> >
>
> Changing the syscall signature like this is rather nasty. The new flag
> seems like it should safely gate the difference, but I still have some
> concerns about misuse and people passing in too small a buffer for the
> mnt_id.

Yeah, it's a little ugly, but a name_to_handle_at2 feels like overkill
for such a minor change. I'm also not sure there's a huge risk of users
accidentally passing AT_HANDLE_UNIQUE_MNT_ID with an (int *).

> > int, flag)
> > {
> > struct path path;
> > @@ -100,7 +108,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
> > int fh_flags;
> > int err;
> >
> > - if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID))
> > + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
> > + AT_HANDLE_UNIQUE_MNT_ID))
> > return -EINVAL;
> >
> > lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
> > @@ -109,7 +118,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
> > lookup_flags |= LOOKUP_EMPTY;
> > err = user_path_at(dfd, name, lookup_flags, &path);
> > if (!err) {
> > - err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags);
> > + err = do_sys_name_to_handle(&path, handle, mnt_id,
> > + flag & AT_HANDLE_UNIQUE_MNT_ID,
> > + fh_flags);
> > path_put(&path);
> > }
> > return err;
> > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> > index c0bcc185fa48..fda970f92fba 100644
> > --- a/include/uapi/linux/fcntl.h
> > +++ b/include/uapi/linux/fcntl.h
> > @@ -118,6 +118,8 @@
> > #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to
> > compare object identity and may not
> > be usable to open_by_handle_at(2) */
> > +#define AT_HANDLE_UNIQUE_MNT_ID AT_STATX_FORCE_SYNC /* returned mount id is
> > + the u64 unique mount id */
> > #if defined(__KERNEL__)
> > #define AT_GETATTR_NOSEC 0x80000000
> > #endif
> >
> > ---
> > base-commit: 584bbf439d0fa83d728ec49f3a38c581bdc828b4
> > change-id: 20240515-exportfs-u64-mount-id-9ebb5c58b53c
> >
> > Best regards,
>
> --
> Jeff Layton <[email protected]>
>

--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

Attachments:

(No filename) (4.96 kB)
signature.asc (235.00 B)
Download all attachments

2024-05-21 05:04:26

by Amir Goldstein

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On Tue, May 21, 2024 at 1:28 AM Aleksa Sarai <[email protected]> wrote:
>
> On 2024-05-20, Jeff Layton <[email protected]> wrote:
> > On Mon, 2024-05-20 at 17:35 -0400, Aleksa Sarai wrote:
> > > Now that we have stabilised the unique 64-bit mount ID interface in
> > > statx, we can now provide a race-free way for name_to_handle_at(2) to
> > > provide a file handle and corresponding mount without needing to worry
> > > about racing with /proc/mountinfo parsing.

Both statx() and name_to_handle_at() support AT_EMPTY_PATH, so
there is a race-free way to get a file handle and unique mount id
for statmount().

What do you mean by /proc/mountinfo parsing?

> > >
> > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> > > that doesn't make sense for name_to_handle_at(2).

Christian is probably regretting merging AT_HANDLE_FID now ;-)

Seriously, I would rearrange the AT_* flags namespace this way to
explicitly declare the overloaded per-syscall AT_* flags and possibly
prepare for the upcoming setxattrat(2) syscall [1].

[1] https://lore.kernel.org/linux-fsdevel/[email protected]/

The reason I would avoid overloading the AT_STATX_* flags is that
they describe a generic behavior that could potentially be relevant to
other syscalls in the future, e.g.:
renameat2(..., AT_RENAME_TEMPFILE | AT_FORCE_SYNC);

But then again, I don't understand why you need to extend name_to_handle_at()
at all for your purpose...

Thanks,
Amir.

--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
[...]
+
+#define AT_PRIVATE_FLAGS 0x2ff /* Per-syscall flags mask. */
+
+/* Common flags for *at() syscalls */
#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */
-#define AT_EACCESS 0x200 /* Test access permitted for
- effective IDs, not real IDs. */
-#define AT_REMOVEDIR 0x200 /* Remove directory instead of
- unlinking file. */
#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal
automount traversal */
#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */

+/* Flags for statx(2) */
#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation
required from statx() */
#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to
be sync'd with the server */
[...]

#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */

-/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */
-#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to
+/* Flags for name_to_handle_at(2) */
+#define AT_HANDLE_FID 0x200 /* file handle is needed to
compare object identity and may not
be usable to open_by_handle_at(2) */
+/* Flags for faccessat(2) */
+#define AT_EACCESS 0x200 /* Test access permitted for
+ effective IDs, not real IDs. */
+/* Flags for unlinkat(2) */
+#define AT_REMOVEDIR 0x200 /* Remove directory instead of
+ unlinking file. */
+
+/* Flags for renameat2(2) (should match legacy RENAME_* flags) */
+#define AT_RENAME_NOREPLACE 0x001 /* Don't overwrite target */
+#define AT_RENAME_EXCHANGE 0x002 /* Exchange source and dest */
+#define AT_RENAME_WHITEOUT 0x004 /* Whiteout source */
+#define AT_RENAME_TEMPFILE 0x008 /* Source file is O_TMPFILE */
+
+/* Flags for setxattrat(2) (should match legacy XATTR_* flags) */
+#define AT_XATTR_CREATE 0x001 /* set value, fail if
attr already exists */
+#define AT_XATTR_REPLACE 0x002 /* set value, fail if attr
does not exist */
+

2024-05-21 10:43:17

by Aleksa Sarai

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On 2024-05-21, Amir Goldstein <[email protected]> wrote:
> On Tue, May 21, 2024 at 1:28 AM Aleksa Sarai <[email protected]> wrote:
> >
> > On 2024-05-20, Jeff Layton <[email protected]> wrote:
> > > On Mon, 2024-05-20 at 17:35 -0400, Aleksa Sarai wrote:
> > > > Now that we have stabilised the unique 64-bit mount ID interface in
> > > > statx, we can now provide a race-free way for name_to_handle_at(2) to
> > > > provide a file handle and corresponding mount without needing to worry
> > > > about racing with /proc/mountinfo parsing.
>
> Both statx() and name_to_handle_at() support AT_EMPTY_PATH, so
> there is a race-free way to get a file handle and unique mount id
> for statmount().

Doing it that way would require doing an open and statx for every path
you want to get a filehandle for, tripling the number of syscalls you
need to do. This is related to the syscall overhead issue Lennart talked
about last week at LSF (though for his usecase we would need to add a
hashed filehandle in statx).

> Why do you mean /proc/mountinfo parsing?

The man page for name_to_handle_at(2) talks about needing to parse
/proc/mountinfo as well as the possible races you can hit.

> > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> > > > that doesn't make sense for name_to_handle_at(2).
>
> Christian is probably regretting merging AT_HANDLE_FID now ;-)
>
> Seriously, I would rearrange the AT_* flags namespace this way to
> explicitly declare the overloaded per-syscall AT_* flags and possibly
> prepare for the upcoming setxattrat(2) syscall [1].

I'm not sure that unifying the flag namespace is a good idea -- while it
would be nicer, burning a flag bit for an extension will be more
expensive because we would only have 32 bits for every possible
extension we ever plan to have.

FWIW, I think that statx should've had their own flag namespace like
move_mount and renameat2.

> [1] https://lore.kernel.org/linux-fsdevel/[email protected]/
>
> The reason I would avoid overloading the AT_STATX_* flags is that
> they describe a generic behavior that could potentially be relevant to
> other syscalls in the future, e.g.:
> renameat2(..., AT_RENAME_TEMPFILE | AT_FORCE_SYNC);

Yeah, you might be right that the sync-related flags aren't the right
ones to overload here.

> But then again, I don't understand why you need to extend name_to_handle_at()
> at all for your purpose...
>
> Thanks,
> Amir.
>
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> [...]
> +
> +#define AT_PRIVATE_FLAGS 0x2ff /* Per-syscall flags mask. */
> +
> +/* Common flags for *at() syscalls */
> #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */
> -#define AT_EACCESS 0x200 /* Test access permitted for
> - effective IDs, not real IDs. */
> -#define AT_REMOVEDIR 0x200 /* Remove directory instead of
> - unlinking file. */
> #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
> #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal
> automount traversal */
> #define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */
>
> +/* Flags for statx(2) */
> #define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation
> required from statx() */
> #define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */
> #define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to
> be sync'd with the server */
> [...]
>
> #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
>
> -/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */
> -#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to
> +/* Flags for name_to_handle_at(2) */
> +#define AT_HANDLE_FID 0x200 /* file handle is needed to
> compare object identity and may not
> be usable to open_by_handle_at(2) */
> +/* Flags for faccessat(2) */
> +#define AT_EACCESS 0x200 /* Test access permitted for
> + effective IDs, not real IDs. */
> +/* Flags for unlinkat(2) */
> +#define AT_REMOVEDIR 0x200 /* Remove directory instead of
> + unlinking file. */
> +
> +/* Flags for renameat2(2) (should match legacy RENAME_* flags) */
> +#define AT_RENAME_NOREPLACE 0x001 /* Don't overwrite target */
> +#define AT_RENAME_EXCHANGE 0x002 /* Exchange source and dest */
> +#define AT_RENAME_WHITEOUT 0x004 /* Whiteout source */
> +#define AT_RENAME_TEMPFILE 0x008 /* Source file is O_TMPFILE */
> +
> +/* Flags for setxattrat(2) (should match legacy XATTR_* flags) */
> +#define AT_XATTR_CREATE 0x001 /* set value, fail if
> attr already exists */
> +#define AT_XATTR_REPLACE 0x002 /* set value, fail if attr
> does not exist */
> +

--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

Attachments:

(No filename) (5.17 kB)
signature.asc (235.00 B)
Download all attachments

2024-05-21 13:46:08

by Christian Brauner

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote:
> Now that we have stabilised the unique 64-bit mount ID interface in
> statx, we can now provide a race-free way for name_to_handle_at(2) to
> provide a file handle and corresponding mount without needing to worry
> about racing with /proc/mountinfo parsing.
>
> As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> that doesn't make sense for name_to_handle_at(2).
>
> Signed-off-by: Aleksa Sarai <[email protected]>
> ---

So I think overall this is probably fine (famous last words). If it's
just about being able to retrieve the new mount id without having to
take the hit of another statx system call it's indeed a bit much to
add a revised system call for this. Although I did say earlier that I
wouldn't rule that out.

But if we'd do that then it'll be a long discussion on the form of the new
system call and the information it exposes.

For example, I lack the grey hair needed to understand why
name_to_handle_at() returns a mount id at all. The pitch in commit
990d6c2d7aee ("vfs: Add name to file handle conversion support") is that
the (old) mount id can be used to "lookup file system specific
information [...] in /proc/<pid>/mountinfo".

Granted, that's doable but it'll mean a lot of careful checking to avoid
races for mount id recycling because they're not even allocated
cyclically. With lots of containers it becomes even more of an issue. So
it's doubtful whether exposing the mount id through name_to_handle_at()
would be something that we'd still do.

So really, if this is just about a use-case where you want to spare the
additional system call for statx() and you need the mnt_id then
overloading is probably ok.

But it remains an unpleasant thing to look at.

2024-05-23 15:52:55

by Aleksa Sarai

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On 2024-05-21, Christian Brauner <[email protected]> wrote:
> On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote:
> > Now that we have stabilised the unique 64-bit mount ID interface in
> > statx, we can now provide a race-free way for name_to_handle_at(2) to
> > provide a file handle and corresponding mount without needing to worry
> > about racing with /proc/mountinfo parsing.
> >
> > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> > that doesn't make sense for name_to_handle_at(2).
> >
> > Signed-off-by: Aleksa Sarai <[email protected]>
> > ---
>
> So I think overall this is probably fine (famous last words). If it's
> just about being able to retrieve the new mount id without having to
> take the hit of another statx system call it's indeed a bit much to
> add a revised system call for this. Althoug I did say earlier that I
> wouldn't rule that out.
>
> But if we'd that then it'll be a long discussion on the form of the new
> system call and the information it exposes.
>
> For example, I lack the grey hair needed to understand why
> name_to_handle_at() returns a mount id at all. The pitch in commit
> 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that
> the (old) mount id can be used to "lookup file system specific
> information [...] in /proc/<pid>/mountinfo".

The logic was presumably to allow you to know what mount the resolved
file handle came from. If you use AT_EMPTY_PATH this is not needed
because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if
you just give name_to_handle_at() almost any path, there is no race-free
way to make sure that you know which filesystem the file handle came
from.

I don't know if that could lead to security issues (I guess an attacker
could find a way to try to manipulate the file handle you get back, and
then try to trick you into operating on the wrong filesystem with
open_by_handle_at()) but it is definitely something you'd want to avoid.

> Granted, that's doable but it'll mean a lot of careful checking to avoid
> races for mount id recycling because they're not even allocated
> cyclically. With lots of containers it becomes even more of an issue. So
> it's doubtful whether exposing the mount id through name_to_handle_at()
> would be something that we'd still do.
>
> So really, if this is just about a use-case where you want to spare the
> additional system call for statx() and you need the mnt_id then
> overloading is probably ok.
>
> But it remains an unpleasant thing to look at.
>

Yeah, I agree it's ugly.

--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

Attachments:

(No filename) (2.67 kB)
signature.asc (235.00 B)
Download all attachments

2024-05-24 12:27:39

by Christian Brauner

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On Thu, May 23, 2024 at 09:52:20AM -0600, Aleksa Sarai wrote:
> On 2024-05-21, Christian Brauner <[email protected]> wrote:
> > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote:
> > > Now that we have stabilised the unique 64-bit mount ID interface in
> > > statx, we can now provide a race-free way for name_to_handle_at(2) to
> > > provide a file handle and corresponding mount without needing to worry
> > > about racing with /proc/mountinfo parsing.
> > >
> > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> > > that doesn't make sense for name_to_handle_at(2).
> > >
> > > Signed-off-by: Aleksa Sarai <[email protected]>
> > > ---
> >
> > So I think overall this is probably fine (famous last words). If it's
> > just about being able to retrieve the new mount id without having to
> > take the hit of another statx system call it's indeed a bit much to
> > add a revised system call for this. Althoug I did say earlier that I
> > wouldn't rule that out.
> >
> > But if we'd that then it'll be a long discussion on the form of the new
> > system call and the information it exposes.
> >
> > For example, I lack the grey hair needed to understand why
> > name_to_handle_at() returns a mount id at all. The pitch in commit
> > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that
> > the (old) mount id can be used to "lookup file system specific
> > information [...] in /proc/<pid>/mountinfo".
>
> The logic was presumably to allow you to know what mount the resolved
> file handle came from. If you use AT_EMPTY_PATH this is not needed
> because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if
> you just give name_to_handle_at() almost any path, there is no race-free
> way to make sure that you know which filesystem the file handle came
> from.
>
> I don't know if that could lead to security issues (I guess an attacker
> could find a way to try to manipulate the file handle you get back, and
> then try to trick you into operating on the wrong filesystem with
> open_by_handle_at()) but it is definitely something you'd want to avoid.

So the following paragraphs are prefaced with: I'm not an expert on file
handle encoding and so might be totally wrong.

Afaiu, the uniqueness guarantee of the file handle mostly depends on:

(1) the filesystem
(2) the actual file handle encoding

Looking at file handle encoding to me it looks like it's fairly easy to
fake them in userspace (I guess that's ok if you think about them like a
path but with a weird permission model built around them.) for quite a
few filesystems.

For example, for anything that uses fs/libfs.c:generic_encode_ino32_fh()
it's easy to construct a file handle by retrieving the inode number via
stat and the generation number via FS_IOC_GETVERSION.

Encoding using the inode number and the inode generation number is
probably not very strong so it's not impossible to generate a file
handle that is not unique without knowing which filesystem to
interpret it in.

The problem is with what name_to_handle_at() returns imho. A mnt_id
doesn't pin the filesystem and the old mnt_id isn't unique. That means
the filesystem can be unmounted and go away and the mnt_id can be
recycled almost immediately for another mount but the file handle is
still there.

So to guarantee that a (weakly encoded) file handle is interpreted in
the right filesystem the file handle must either be accompanied by a
file descriptor that pins the relevant mount or have any other guarantee
that the filesystem doesn't go away (Or of course, the file handle
encodes the uuid of the filesystem or something or uses some sort of
hashing scheme.).

One of the features of file handles is that they're globally usable so
they're interesting to use as handles that can be shared. IOW, one can
send around a file handle to another process without having to pin
anything or have a file descriptor open that needs to be sent via
AF_UNIX.

But as stated above that's potentially risky so one might still have to
send around an fd together with the file handle if sender and receiver
don't share the filesystem for the handle.

However, with the unique mount id things improve quite a bit. Because
now it's possible to send around the unique mount id and the file
handle. Then one can use statmount() to figure out which filesystem this
file handle needs to be interpreted in.

>
> > Granted, that's doable but it'll mean a lot of careful checking to avoid
> > races for mount id recycling because they're not even allocated
> > cyclically. With lots of containers it becomes even more of an issue. So
> > it's doubtful whether exposing the mount id through name_to_handle_at()
> > would be something that we'd still do.
> >
> > So really, if this is just about a use-case where you want to spare the
> > additional system call for statx() and you need the mnt_id then
> > overloading is probably ok.
> >
> > But it remains an unpleasant thing to look at.
> >
>
> Yeah, I agree it's ugly.
>
> --
> Aleksa Sarai
> Senior Software Engineer (Containers)
> SUSE Linux GmbH
> <https://www.cyphar.com/>

2024-05-24 15:59:22

by Aleksa Sarai

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On 2024-05-24, Christian Brauner <[email protected]> wrote:
> On Thu, May 23, 2024 at 09:52:20AM -0600, Aleksa Sarai wrote:
> > On 2024-05-21, Christian Brauner <[email protected]> wrote:
> > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote:
> > > > Now that we have stabilised the unique 64-bit mount ID interface in
> > > > statx, we can now provide a race-free way for name_to_handle_at(2) to
> > > > provide a file handle and corresponding mount without needing to worry
> > > > about racing with /proc/mountinfo parsing.
> > > >
> > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> > > > that doesn't make sense for name_to_handle_at(2).
> > > >
> > > > Signed-off-by: Aleksa Sarai <[email protected]>
> > > > ---
> > >
> > > So I think overall this is probably fine (famous last words). If it's
> > > just about being able to retrieve the new mount id without having to
> > > take the hit of another statx system call it's indeed a bit much to
> > > add a revised system call for this. Although I did say earlier that I
> > > wouldn't rule that out.
> > >
> > > But if we'd do that then it'll be a long discussion on the form of the new
> > > system call and the information it exposes.
> > >
> > > For example, I lack the grey hair needed to understand why
> > > name_to_handle_at() returns a mount id at all. The pitch in commit
> > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that
> > > the (old) mount id can be used to "lookup file system specific
> > > information [...] in /proc/<pid>/mountinfo".
> >
> > The logic was presumably to allow you to know what mount the resolved
> > file handle came from. If you use AT_EMPTY_PATH this is not needed
> > because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if
> > you just give name_to_handle_at() almost any path, there is no race-free
> > way to make sure that you know which filesystem the file handle came
> > from.
> >
> > I don't know if that could lead to security issues (I guess an attacker
> > could find a way to try to manipulate the file handle you get back, and
> > then try to trick you into operating on the wrong filesystem with
> > open_by_handle_at()) but it is definitely something you'd want to avoid.
>
> So the following paragraphs are prefaced with: I'm not an expert on file
> handle encoding and so might be totally wrong.
>
> Afaiu, the uniqueness guarantee of the file handle mostly depends on:
>
> (1) the filesystem
> (2) the actual file handling encoding
>
> Looking at file handle encoding to me it looks like it's fairly easy to
> fake them in userspace (I guess that's ok if you think about them like a
> path but with a weird permission model built around them.) for quite a
> few filesystems.

The old Docker breakout attack did brute-force the fhandle for the root
directory of the host filesystem, so it is entirely possible.

However, the attack I was thinking of was whether a directory tree that
an attacker had mount permissions over could be manipulated such that a
privileged process doing name_to_handle_at() on a path within the tree
would get a file handle that open_by_handle_at() on a different
filesystem would result in a potentially dangerous path being opened.

For instance (M is management process, C is the malicious container
process):

C: Bind-mounts the root of the container filesystem at /foo.
M: name_to_handle_at($CONTAINER/foo)
-> gets an fhandle of / of the container filesystem
-> stores a copy of the (recycled) mount id
C: Swaps /foo with a bind-mount of the host root filesystem such that
the mount id is recycled, before M can scan /proc/self/mountinfo.
C: Triggers M to try to use the filehandle for some administrative
process.
M: open_by_handle_at(...) on the wrong mount id, getting an fd of the
host / directory. Possibly does something bad to this directory
(deleting files, passing the fd back to the container, etc).

It seems possible that this could happen, so having a unique mount id is
kind of important if you plan to use open_by_handle_at() with malicious
actors in control of the target filesystem tree.

Though, regardless of the attack you are worried about, I guess we are
in agreement that a unique mount id from name_to_handle_at() would be a
good idea if we are planning for userspace to use file handles for
everything.

> For example, for anything that uses fs/libfs.c:generic_encode_ino32_fh()
> it's easy to construct a file handle by retrieving the inode number via
> stat and the generation number via FS_IOC_GETVERSION.
>
> Encoding using the inode number and the inode generation number is
> probably not very strong so it's not impossible to generate a file
> handle that is not unique without knowing in which filesystem to
> interpret it in.
>
> The problem is with what name_to_handle_at() returns imho. A mnt_id
> doesn't pin the filesystem and the old mnt_id isn't unique. That means
> the filesystem can be unmounted and go away and the mnt_id can be
> recycled almost immediately for another mount but the file handle is
> still there.
>
> So to guarantee that a (weakly encoded) file handle is interpreted in
> the right filesystem the file handle must either be accompanied by a
> file descriptor that pins the relevant mount or have any other guarantee
> that the filesystem doesn't go away (Or of course, the file handle
> encodes the uuid of the filesystem or something or uses some sort of
> hashing scheme.).
>
> One of the features of file handles is that they're globally usable so
> they're interesting to use as handles that can be shared. IOW, one can
> send around a file handle to another process without having to pin
> anything or have a file descriptor open that needs to be sent via
> AF_UNIX.
>
> But as stated above that's potentially risky so one might still have to
> send around an fd together with the file handle if sender and receiver
> don't share the filesystem for the handle.
>
> However, with the unique mount id things improve quite a bit. Because
> now it's possible to send around the unique mount id and the file
> handle. Then one can use statmount() to figure out which filesystem this
> file handle needs to be interpreted in.

--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

Attachments:

(No filename) (6.34 kB)
signature.asc (235.00 B)
Download all attachments

2024-05-26 08:55:59

by Christoph Hellwig

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

2024-05-27 13:49:02

by Christian Brauner

[permalink] [raw]

Subject: Re: [PATCH RFC] fhandle: expose u64 mount id to name_to_handle_at(2)

On Mon, May 27, 2024 at 10:06:15AM +1000, Dave Chinner wrote:
> On Fri, May 24, 2024 at 02:25:30PM +0200, Christian Brauner wrote:
> > On Thu, May 23, 2024 at 09:52:20AM -0600, Aleksa Sarai wrote:
> > > On 2024-05-21, Christian Brauner <[email protected]> wrote:
> > > > On Mon, May 20, 2024 at 05:35:49PM -0400, Aleksa Sarai wrote:
> > > > > Now that we have stabilised the unique 64-bit mount ID interface in
> > > > > statx, we can now provide a race-free way for name_to_handle_at(2) to
> > > > > provide a file handle and corresponding mount without needing to worry
> > > > > about racing with /proc/mountinfo parsing.
> > > > >
> > > > > As with AT_HANDLE_FID, AT_HANDLE_UNIQUE_MNT_ID reuses a statx AT_* bit
> > > > > that doesn't make sense for name_to_handle_at(2).
> > > > >
> > > > > Signed-off-by: Aleksa Sarai <[email protected]>
> > > > > ---
> > > >
> > > > So I think overall this is probably fine (famous last words). If it's
> > > > just about being able to retrieve the new mount id without having to
> > > > take the hit of another statx system call it's indeed a bit much to
> > > > add a revised system call for this. Although I did say earlier that I
> > > > wouldn't rule that out.
> > > >
> > > > But if we'd do that then it'll be a long discussion on the form of the new
> > > > system call and the information it exposes.
> > > >
> > > > For example, I lack the grey hair needed to understand why
> > > > name_to_handle_at() returns a mount id at all. The pitch in commit
> > > > 990d6c2d7aee ("vfs: Add name to file handle conversion support") is that
> > > > the (old) mount id can be used to "lookup file system specific
> > > > information [...] in /proc/<pid>/mountinfo".
> > >
> > > The logic was presumably to allow you to know what mount the resolved
> > > file handle came from. If you use AT_EMPTY_PATH this is not needed
> > > because you could just fstatfs (and now statx(AT_EMPTY_PATH)), but if
> > > you just give name_to_handle_at() almost any path, there is no race-free
> > > way to make sure that you know which filesystem the file handle came
> > > from.
> > >
> > > I don't know if that could lead to security issues (I guess an attacker
> > > could find a way to try to manipulate the file handle you get back, and
> > > then try to trick you into operating on the wrong filesystem with
> > > open_by_handle_at()) but it is definitely something you'd want to avoid.
> >
> > So the following paragraphs are prefaced with: I'm not an expert on file
> > handle encoding and so might be totally wrong.
> >
> > Afaiu, the uniqueness guarantee of the file handle mostly depends on:
> >
> > (1) the filesystem
> > (2) the actual file handling encoding
> >
> > Looking at file handle encoding to me it looks like it's fairly easy to
> > fake them in userspace (I guess that's ok if you think about them like a
> > path but with a weird permission model built around them.) for quite a
> > few filesystems.
>
> This is a feature specifically used by XFS utilities like xfs_fsr
> and xfsdump to take pathless inode information retrieved by bulkstat
> operations to a file handle to enable arbitrary inodes in the
> filesystem to be opened.
>
> i.e. they scan the filesystem by walking the filesystem's internal
> inode index, not by walking paths to scan the filesystem. This is
> *much* faster than path walking, especially on seek-limited storage
> hardware.
>
> Knowing the inode number, generation and fs uuid, we can
> construct a valid filehandle for that inode, and then do whatever
> operations we need to do (defrag, backup, move offline (HSM), etc)
> without needing to know the path to that inode....

Yeah, I think you mentioned this in another context before.

> > The problem is with what name_to_handle_at() returns imho. A mnt_id
> > doesn't pin the filesystem and the old mnt_id isn't unique. That means
> > the filesystem can be unmounted and go away and the mnt_id can be
> > recycled almost immediately for another mount but the file handle is
> > still there.
>
> This is no different to the NFS server unexporting a filesystem -
> all attempts to resolve the file handle the client holds for that
> export must now fail. Hence the filehandle itself must have a
> superblock specific identifier in it.
>
> > So to guarantee that a (weakly encoded) file handle is interpreted in
> > the right filesystem the file handle must either be accompanied by a
> > file descriptor that pins the relevant mount or have any other guarantee
> > that the filesystem doesn't go away (Or of course, the file handle
> > encodes the uuid of the filesystem or something or uses some sort of
> > hashing scheme.).
>
> Yes, it does, and that's the superblock based identifier, not
> something that is mount based. i.e. the handle is valid regardless
> of where the filesystem is mounted, even if the application does not
> have visibility or permitted access to the given mount. And the
> filehandle is persistent - it must work across unmount/mount,
> reboots, change of mount location, etc.

Agreed, and no one is disputing that. The old mount id as exposed by
name_to_handle_at() is one means to get to a persistent identifier and
that is racy. But we do have a 64bit non-recycled mount id and
statmount() since v6.7 now which allow one to get a persistent identifier
for the filesystem in a race-free manner.