LinuxLists.cc - [PATCH v4 0/6] querying mount attributes

[permalink] [raw]

Subject: [PATCH v4 1/6] add unique mount ID

If a mount is released then its mnt_id can immediately be reused. This is
bad news for user interfaces that want to uniquely identify a mount.

Implementing a unique mount ID is trivial (use a 64bit counter).
Unfortunately userspace assumes 32bit size and would overflow after the
counter reaches 2^32.

Introduce a new 64bit ID alongside the old one. Initialize the counter to
2^32, this guarantees that the old and new IDs are never mixed up.

Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/mount.h | 3 ++-
fs/namespace.c | 4 ++++
fs/stat.c | 9 +++++++--
include/uapi/linux/stat.h | 1 +
4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 130c07c2f8d2..a14f762b3f29 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -72,7 +72,8 @@ struct mount {
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
#endif
- int mnt_id; /* mount identifier */
+ int mnt_id; /* mount identifier, reused */
+ u64 mnt_id_unique; /* mount ID unique until reboot */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..e02bc5f41c7b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,6 +68,9 @@ static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);

+/* Don't allow confusion with old 32bit mount ID */
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+
static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
@@ -131,6 +134,7 @@ static int mnt_alloc_id(struct mount *mnt)
if (res < 0)
return res;
mnt->mnt_id = res;
+ mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
return 0;
}

diff --git a/fs/stat.c b/fs/stat.c
index d43a5cc1bfa4..77878ae48a0f 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -243,8 +243,13 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,

error = vfs_getattr(&path, stat, request_mask, flags);

- stat->mnt_id = real_mount(path.mnt)->mnt_id;
- stat->result_mask |= STATX_MNT_ID;
+ if (request_mask & STATX_MNT_ID_UNIQUE) {
+ stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
+ stat->result_mask |= STATX_MNT_ID_UNIQUE;
+ } else {
+ stat->mnt_id = real_mount(path.mnt)->mnt_id;
+ stat->result_mask |= STATX_MNT_ID;
+ }

if (path.mnt->mnt_root == path.dentry)
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 7cab2c65d3d7..2f2ee82d5517 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -154,6 +154,7 @@ struct statx {
#define STATX_BTIME 0x00000800U /* Want/got stx_btime */
#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */
#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */
+#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */

#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */

--
2.41.0

2023-10-25 14:04:36

[permalink] [raw]

Subject: [PATCH v4 6/6] wire up syscalls for statmount/listmount

Wire up all archs.

Signed-off-by: Miklos Szeredi <[email protected]>
---
arch/alpha/kernel/syscalls/syscall.tbl | 3 +++
arch/arm/tools/syscall.tbl | 3 +++
arch/arm64/include/asm/unistd32.h | 4 ++++
arch/ia64/kernel/syscalls/syscall.tbl | 3 +++
arch/m68k/kernel/syscalls/syscall.tbl | 3 +++
arch/microblaze/kernel/syscalls/syscall.tbl | 3 +++
arch/mips/kernel/syscalls/syscall_n32.tbl | 3 +++
arch/mips/kernel/syscalls/syscall_n64.tbl | 3 +++
arch/mips/kernel/syscalls/syscall_o32.tbl | 3 +++
arch/parisc/kernel/syscalls/syscall.tbl | 3 +++
arch/powerpc/kernel/syscalls/syscall.tbl | 3 +++
arch/s390/kernel/syscalls/syscall.tbl | 3 +++
arch/sh/kernel/syscalls/syscall.tbl | 3 +++
arch/sparc/kernel/syscalls/syscall.tbl | 3 +++
arch/x86/entry/syscalls/syscall_32.tbl | 3 +++
arch/x86/entry/syscalls/syscall_64.tbl | 2 ++
arch/xtensa/kernel/syscalls/syscall.tbl | 3 +++
include/uapi/asm-generic/unistd.h | 8 +++++++-
18 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index ad37569d0507..6c23bf68eff0 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -492,3 +492,6 @@
560 common set_mempolicy_home_node sys_ni_syscall
561 common cachestat sys_cachestat
562 common fchmodat2 sys_fchmodat2
+# 563 reserved for map_shadow_stack
+564 common statmount sys_statmount
+565 common listmount sys_listmount
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index c572d6c3dee0..d110a832899b 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -466,3 +466,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 78b68311ec81..b0a994c9ff3c 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -911,6 +911,10 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
__SYSCALL(__NR_cachestat, sys_cachestat)
#define __NR_fchmodat2 452
__SYSCALL(__NR_fchmodat2, sys_fchmodat2)
+#define __NR_statmount 454
+__SYSCALL(__NR_statmount, sys_statmount)
+#define __NR_listmount 455
+__SYSCALL(__NR_listmount, sys_listmount)

/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 83d8609aec03..c5f45a8fc834 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -373,3 +373,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 259ceb125367..b9cabb746487 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -452,3 +452,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a3798c2637fd..89c4ed548ce8 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -458,3 +458,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 152034b8e0a0..a9d561698fe2 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -391,3 +391,6 @@
450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node
451 n32 cachestat sys_cachestat
452 n32 fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 n32 statmount sys_statmount
+455 n32 listmount sys_listmount
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index cb5e757f6621..80a056866da7 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -367,3 +367,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 n64 cachestat sys_cachestat
452 n64 fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 n64 statmount sys_statmount
+455 n64 listmount sys_listmount
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 1a646813afdc..139ddc691176 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -440,3 +440,6 @@
450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node
451 o32 cachestat sys_cachestat
452 o32 fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 o32 statmount sys_statmount
+455 o32 listmount sys_listmount
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index e97c175b56f9..46fa753f0d64 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -451,3 +451,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 20e50586e8a2..106937744525 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -539,3 +539,6 @@
450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 0122cc156952..57dd9202f8d7 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -455,3 +455,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount sys_statmount
+455 common listmount sys_listmount sys_listmount
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index e90d585c4d3e..0c7407a0e32c 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -455,3 +455,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4ed06c71c43f..a0fd36469478 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -498,3 +498,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..4f41977bd4c1 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -457,3 +457,6 @@
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
451 i386 cachestat sys_cachestat
452 i386 fchmodat2 sys_fchmodat2
+# 563 reserved for map_shadow_stack
+454 i386 statmount sys_statmount
+455 i386 listmount sys_listmount
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..a1b3ce7d22cc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,6 +375,8 @@
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
453 64 map_shadow_stack sys_map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount

#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index fc1a4f3c81d9..73378984702b 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -423,3 +423,6 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+# 453 reserved for map_shadow_stack
+454 common statmount sys_statmount
+455 common listmount sys_listmount
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..8df6a747e21a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -823,8 +823,14 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
#define __NR_fchmodat2 452
__SYSCALL(__NR_fchmodat2, sys_fchmodat2)

+#define __NR_statmount 454
+__SYSCALL(__NR_statmount, sys_statmount)
+
+#define __NR_listmount 455
+__SYSCALL(__NR_listmount, sys_listmount)
+
#undef __NR_syscalls
-#define __NR_syscalls 453
+#define __NR_syscalls 456

/*
* 32 bit systems traditionally used different
--
2.41.0

2023-10-25 14:05:03

[permalink] [raw]

Subject: [PATCH v4 2/6] mounts: keep list of mounts in an rbtree

When adding a mount to a namespace insert it into an rbtree rooted in the
mnt_namespace instead of a linear list.

The mnt.mnt_list is still used to set up the mount tree and for
propagation, but not after the mount has been added to a namespace. Hence
mnt_list can live in union with rb_node. Use MNT_ONRB mount flag to
validate that the mount is on the correct list.

This allows removing the cursor used for reading /proc/$PID/mountinfo. The
mnt_id_unique of the next mount can be used as an index into the seq file.

Tested by inserting 100k bind mounts, unsharing the mount namespace, and
unmounting. No performance regressions have been observed.

For the last mount in the 100k list the statmount() call was more than 100x
faster due to the mount ID lookup not having to do a linear search. This
patch makes the overhead of mount ID lookup non-observable in this range.

Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/mount.h | 24 +++---
fs/namespace.c | 190 ++++++++++++++++++++----------------------
fs/pnode.c | 2 +-
fs/proc_namespace.c | 3 -
include/linux/mount.h | 5 +-
5 files changed, 106 insertions(+), 118 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index a14f762b3f29..4a42fc68f4cc 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -8,19 +8,13 @@
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
- /*
- * Traversal and modification of .list is protected by either
- * - taking namespace_sem for write, OR
- * - taking namespace_sem for read AND taking .ns_lock.
- */
- struct list_head list;
- spinlock_t ns_lock;
+ struct rb_root mounts; /* Protected by namespace_sem */
struct user_namespace *user_ns;
struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
- unsigned int mounts; /* # of mounts in the namespace */
+ unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
} __randomize_layout;

@@ -55,7 +49,10 @@ struct mount {
struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
- struct list_head mnt_list;
+ union {
+ struct rb_node mnt_node; /* Under ns->mounts */
+ struct list_head mnt_list;
+ };
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */
@@ -128,7 +125,6 @@ struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
- struct mount cursor;
};

extern const struct seq_operations mounts_op;
@@ -147,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
return ns->seq == 0;
}

+static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+{
+ WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
+ mnt->mnt.mnt_flags &= ~MNT_ONRB;
+ rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
+ list_add_tail(&mnt->mnt_list, dt_list);
+}
+
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
diff --git a/fs/namespace.c b/fs/namespace.c
index e02bc5f41c7b..0eab47ffc76c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -732,21 +732,6 @@ struct vfsmount *lookup_mnt(const struct path *path)
return m;
}

-static inline void lock_ns_list(struct mnt_namespace *ns)
-{
- spin_lock(&ns->ns_lock);
-}
-
-static inline void unlock_ns_list(struct mnt_namespace *ns)
-{
- spin_unlock(&ns->ns_lock);
-}
-
-static inline bool mnt_is_cursor(struct mount *mnt)
-{
- return mnt->mnt.mnt_flags & MNT_CURSOR;
-}
-
/*
* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
* current mount namespace.
@@ -765,19 +750,15 @@ static inline bool mnt_is_cursor(struct mount *mnt)
bool __is_local_mountpoint(struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
- struct mount *mnt;
+ struct mount *mnt, *n;
bool is_covered = false;

down_read(&namespace_sem);
- lock_ns_list(ns);
- list_for_each_entry(mnt, &ns->list, mnt_list) {
- if (mnt_is_cursor(mnt))
- continue;
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
is_covered = (mnt->mnt_mountpoint == dentry);
if (is_covered)
break;
}
- unlock_ns_list(ns);
up_read(&namespace_sem);

return is_covered;
@@ -1024,6 +1005,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
mnt_add_count(old_parent, -1);
}

+static inline struct mount *node_to_mount(struct rb_node *node)
+{
+ return rb_entry(node, struct mount, mnt_node);
+}
+
+static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
+{
+ struct rb_node **link = &ns->mounts.rb_node;
+ struct rb_node *parent = NULL;
+
+ WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+ mnt->mnt_ns = ns;
+ while (*link) {
+ parent = *link;
+ if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ rb_link_node(&mnt->mnt_node, parent, link);
+ rb_insert_color(&mnt->mnt_node, &ns->mounts);
+ mnt->mnt.mnt_flags |= MNT_ONRB;
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -1037,12 +1042,13 @@ static void commit_tree(struct mount *mnt)
BUG_ON(parent == mnt);

list_add_tail(&head, &mnt->mnt_list);
- list_for_each_entry(m, &head, mnt_list)
- m->mnt_ns = n;
+ while (!list_empty(&head)) {
+ m = list_first_entry(&head, typeof(*m), mnt_list);
+ list_del(&m->mnt_list);

- list_splice(&head, n->list.prev);
-
- n->mounts += n->pending_mounts;
+ mnt_add_to_ns(n, m);
+ }
+ n->nr_mounts += n->pending_mounts;
n->pending_mounts = 0;

__attach_mnt(mnt, parent);
@@ -1190,7 +1196,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
}

mnt->mnt.mnt_flags = old->mnt.mnt_flags;
- mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+ mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);

atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1415,65 +1421,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
return &p->mnt;
}

-#ifdef CONFIG_PROC_FS
-static struct mount *mnt_list_next(struct mnt_namespace *ns,
- struct list_head *p)
+/*
+ * Returns the mount which either has the specified mnt_id, or has the next
+ * smallest id afer the specified one.
+ */
+static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
{
- struct mount *mnt, *ret = NULL;
+ struct rb_node *node = ns->mounts.rb_node;
+ struct mount *ret = NULL;

- lock_ns_list(ns);
- list_for_each_continue(p, &ns->list) {
- mnt = list_entry(p, typeof(*mnt), mnt_list);
- if (!mnt_is_cursor(mnt)) {
- ret = mnt;
- break;
+ while (node) {
+ struct mount *m = node_to_mount(node);
+
+ if (mnt_id <= m->mnt_id_unique) {
+ ret = node_to_mount(node);
+ if (mnt_id == m->mnt_id_unique)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
}
}
- unlock_ns_list(ns);
-
return ret;
}

+#ifdef CONFIG_PROC_FS
+
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_mounts *p = m->private;
- struct list_head *prev;

down_read(&namespace_sem);
- if (!*pos) {
- prev = &p->ns->list;
- } else {
- prev = &p->cursor.mnt_list;

- /* Read after we'd reached the end? */
- if (list_empty(prev))
- return NULL;
- }
-
- return mnt_list_next(p->ns, prev);
+ return mnt_find_id_at(p->ns, *pos);
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct proc_mounts *p = m->private;
- struct mount *mnt = v;
+ struct mount *next = NULL, *mnt = v;
+ struct rb_node *node = rb_next(&mnt->mnt_node);

++*pos;
- return mnt_list_next(p->ns, &mnt->mnt_list);
+ if (node) {
+ next = node_to_mount(node);
+ *pos = next->mnt_id_unique;
+ }
+ return next;
}

static void m_stop(struct seq_file *m, void *v)
{
- struct proc_mounts *p = m->private;
- struct mount *mnt = v;
-
- lock_ns_list(p->ns);
- if (mnt)
- list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
- else
- list_del_init(&p->cursor.mnt_list);
- unlock_ns_list(p->ns);
up_read(&namespace_sem);
}

@@ -1491,14 +1489,6 @@ const struct seq_operations mounts_op = {
.show = m_show,
};

-void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
-{
- down_read(&namespace_sem);
- lock_ns_list(ns);
- list_del(&cursor->mnt_list);
- unlock_ns_list(ns);
- up_read(&namespace_sem);
-}
#endif /* CONFIG_PROC_FS */

/**
@@ -1640,7 +1630,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
/* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt.mnt_flags |= MNT_UMOUNT;
- list_move(&p->mnt_list, &tmp_list);
+ if (p->mnt.mnt_flags & MNT_ONRB)
+ move_from_ns(p, &tmp_list);
+ else
+ list_move(&p->mnt_list, &tmp_list);
}

/* Hide the mounts from mnt_mounts */
@@ -1660,7 +1653,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
list_del_init(&p->mnt_list);
ns = p->mnt_ns;
if (ns) {
- ns->mounts--;
+ ns->nr_mounts--;
__touch_mnt_namespace(ns);
}
p->mnt_ns = NULL;
@@ -1786,14 +1779,16 @@ static int do_umount(struct mount *mnt, int flags)

event++;
if (flags & MNT_DETACH) {
- if (!list_empty(&mnt->mnt_list))
+ if (mnt->mnt.mnt_flags & MNT_ONRB ||
+ !list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
- if (!list_empty(&mnt->mnt_list))
+ if (mnt->mnt.mnt_flags & MNT_ONRB ||
+ !list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
@@ -2211,9 +2206,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
unsigned int mounts = 0;
struct mount *p;

- if (ns->mounts >= max)
+ if (ns->nr_mounts >= max)
return -ENOSPC;
- max -= ns->mounts;
+ max -= ns->nr_mounts;
if (ns->pending_mounts >= max)
return -ENOSPC;
max -= ns->pending_mounts;
@@ -2357,8 +2352,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
+ LIST_HEAD(head);
+
/* move from anon - the caller will destroy */
- list_del_init(&source_mnt->mnt_ns->list);
+ for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+ move_from_ns(p, &head);
+ list_del_init(&head);
}
if (beneath)
mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
@@ -2669,11 +2668,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive)

lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
- p->mnt_ns = ns;
- ns->mounts++;
+ mnt_add_to_ns(ns, p);
+ ns->nr_mounts++;
}
ns->root = mnt;
- list_add_tail(&ns->list, &mnt->mnt_list);
mntget(&mnt->mnt);
unlock_mount_hash();
namespace_unlock();
@@ -3736,9 +3734,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
- INIT_LIST_HEAD(&new_ns->list);
+ new_ns->mounts = RB_ROOT;
init_waitqueue_head(&new_ns->poll);
- spin_lock_init(&new_ns->ns_lock);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
return new_ns;
@@ -3785,7 +3782,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
unlock_mount_hash();
}
new_ns->root = new;
- list_add_tail(&new_ns->list, &new->mnt_list);

/*
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -3795,8 +3791,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
p = old;
q = new;
while (p) {
- q->mnt_ns = new_ns;
- new_ns->mounts++;
+ mnt_add_to_ns(new_ns, q);
+ new_ns->nr_mounts++;
if (new_fs) {
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
@@ -3838,10 +3834,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
mntput(m);
return ERR_CAST(ns);
}
- mnt->mnt_ns = ns;
ns->root = mnt;
- ns->mounts++;
- list_add(&mnt->mnt_list, &ns->list);
+ ns->nr_mounts++;
+ mnt_add_to_ns(ns, mnt);

err = vfs_path_lookup(m->mnt_root, m,
name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
@@ -4019,10 +4014,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
goto err_path;
}
mnt = real_mount(newmount.mnt);
- mnt->mnt_ns = ns;
ns->root = mnt;
- ns->mounts = 1;
- list_add(&mnt->mnt_list, &ns->list);
+ ns->nr_mounts = 1;
+ mnt_add_to_ns(ns, mnt);
mntget(newmount.mnt);

/* Attach to an apparent O_PATH fd with a note that we need to unmount
@@ -4693,10 +4687,9 @@ static void __init init_mount_tree(void)
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
m = real_mount(mnt);
- m->mnt_ns = ns;
ns->root = m;
- ns->mounts = 1;
- list_add(&m->mnt_list, &ns->list);
+ ns->nr_mounts = 1;
+ mnt_add_to_ns(ns, m);
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);

@@ -4823,18 +4816,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
int *new_mnt_flags)
{
int new_flags = *new_mnt_flags;
- struct mount *mnt;
+ struct mount *mnt, *n;
bool visible = false;

down_read(&namespace_sem);
- lock_ns_list(ns);
- list_for_each_entry(mnt, &ns->list, mnt_list) {
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
struct mount *child;
int mnt_flags;

- if (mnt_is_cursor(mnt))
- continue;
-
if (mnt->mnt.mnt_sb->s_type != sb->s_type)
continue;

@@ -4882,7 +4871,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
next: ;
}
found:
- unlock_ns_list(ns);
up_read(&namespace_sem);
return visible;
}
diff --git a/fs/pnode.c b/fs/pnode.c
index e4d0340393d5..a799e0315cc9 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount)
mnt->mnt.mnt_flags |= MNT_UMOUNT;
list_del_init(&mnt->mnt_child);
list_del_init(&mnt->mnt_umounting);
- list_move_tail(&mnt->mnt_list, to_umount);
+ move_from_ns(mnt, to_umount);
}

/*
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 250eb5bf7b52..73d2274d5f59 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -283,8 +283,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
p->ns = ns;
p->root = root;
p->show = show;
- INIT_LIST_HEAD(&p->cursor.mnt_list);
- p->cursor.mnt.mnt_flags = MNT_CURSOR;

return 0;

@@ -301,7 +299,6 @@ static int mounts_release(struct inode *inode, struct file *file)
struct seq_file *m = file->private_data;
struct proc_mounts *p = m->private;
path_put(&p->root);
- mnt_cursor_del(p->ns, &p->cursor);
put_mnt_ns(p->ns);
return seq_release_private(inode, file);
}
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4f40b40306d0..7952eddc835c 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -50,8 +50,7 @@ struct path;
#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )

#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
- MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
- MNT_CURSOR)
+ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB)

#define MNT_INTERNAL 0x4000

@@ -65,7 +64,7 @@ struct path;
#define MNT_SYNC_UMOUNT 0x2000000
#define MNT_MARKED 0x4000000
#define MNT_UMOUNT 0x8000000
-#define MNT_CURSOR 0x10000000
+#define MNT_ONRB 0x10000000

struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
--
2.41.0

2023-10-25 14:40:14

[permalink] [raw]

Subject: [PATCH v4 3/6] namespace: extract show_path() helper

To be used by the statmount(2) syscall as well.

Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/internal.h | 2 ++
fs/namespace.c | 9 +++++++++
fs/proc_namespace.c | 10 +++-------
3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..0c4f4cf2ff5a 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -83,6 +83,8 @@ int path_mount(const char *dev_name, struct path *path,
const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);

+int show_path(struct seq_file *m, struct dentry *root);
+
/*
* fs_struct.c
*/
diff --git a/fs/namespace.c b/fs/namespace.c
index 0eab47ffc76c..7a33ea391a02 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4672,6 +4672,15 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
}

+int show_path(struct seq_file *m, struct dentry *root)
+{
+ if (root->d_sb->s_op->show_path)
+ return root->d_sb->s_op->show_path(m, root);
+
+ seq_dentry(m, root, " \t\n\\");
+ return 0;
+}
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 73d2274d5f59..0a808951b7d3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -142,13 +142,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)

seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
MAJOR(sb->s_dev), MINOR(sb->s_dev));
- if (sb->s_op->show_path) {
- err = sb->s_op->show_path(m, mnt->mnt_root);
- if (err)
- goto out;
- } else {
- seq_dentry(m, mnt->mnt_root, " \t\n\\");
- }
+ err = show_path(m, mnt->mnt_root);
+ if (err)
+ goto out;
seq_putc(m, ' ');

/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
--
2.41.0

2023-10-25 14:40:18

[permalink] [raw]

Subject: [PATCH v4 5/6] add listmount(2) syscall

Add way to query the children of a particular mount. This is a more
flexible way to iterate the mount tree than having to parse the complete
/proc/self/mountinfo.

Allow listing either

- immediate child mounts only, or

- recursively all descendant mounts (depth first).

Lookup the mount by the new 64bit mount ID. If a mount needs to be queried
based on path, then statx(2) can be used to first query the mount ID
belonging to the path.

Return an array of new (64bit) mount ID's. Without privileges only mounts
are listed which are reachable from the task's root.

Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/namespace.c | 93 ++++++++++++++++++++++++++++++++++++++
include/linux/syscalls.h | 3 ++
include/uapi/linux/mount.h | 9 ++++
3 files changed, 105 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index a980c250a3a6..0afe2344bba6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4958,6 +4958,99 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
return ret;
}

+static struct mount *listmnt_first(struct mount *root)
+{
+ return list_first_entry_or_null(&root->mnt_mounts, struct mount, mnt_child);
+}
+
+static struct mount *listmnt_next(struct mount *curr, struct mount *root, bool recurse)
+{
+ if (recurse)
+ return next_mnt(curr, root);
+ if (!list_is_head(curr->mnt_child.next, &root->mnt_mounts))
+ return list_next_entry(curr, mnt_child);
+ return NULL;
+}
+
+static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
+ const struct path *root, unsigned int flags)
+{
+ struct mount *r, *m = real_mount(mnt);
+ struct path rootmnt = {
+ .mnt = root->mnt,
+ .dentry = root->mnt->mnt_root
+ };
+ long ctr = 0;
+ bool reachable_only = true;
+ bool recurse = flags & LISTMOUNT_RECURSIVE;
+ int err;
+
+ err = security_sb_statfs(mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (flags & LISTMOUNT_UNREACHABLE) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ reachable_only = false;
+ }
+
+ if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
+ return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
+
+ for (r = listmnt_first(m); r; r = listmnt_next(r, m, recurse)) {
+ if (reachable_only &&
+ !is_path_reachable(r, r->mnt.mnt_root, root))
+ continue;
+
+ if (ctr >= bufsize)
+ return -EOVERFLOW;
+ if (put_user(r->mnt_id_unique, buf + ctr))
+ return -EFAULT;
+ ctr++;
+ if (ctr < 0)
+ return -ERANGE;
+ }
+ return ctr;
+}
+
+SYSCALL_DEFINE4(listmount, const struct __mount_arg __user *, req,
+ u64 __user *, buf, size_t, bufsize, unsigned int, flags)
+{
+ struct __mount_arg kreq;
+ struct vfsmount *mnt;
+ struct path root;
+ u64 mnt_id;
+ long err;
+
+ if (flags & ~(LISTMOUNT_UNREACHABLE | LISTMOUNT_RECURSIVE))
+ return -EINVAL;
+
+ if (copy_from_user(&kreq, req, sizeof(kreq)))
+ return -EFAULT;
+ mnt_id = kreq.mnt_id;
+
+ down_read(&namespace_sem);
+ if (mnt_id == LSMT_ROOT)
+ mnt = &current->nsproxy->mnt_ns->root->mnt;
+ else
+ mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
+
+ err = -ENOENT;
+ if (mnt) {
+ get_fs_root(current->fs, &root);
+ /* Skip unreachable for LSMT_ROOT */
+ if (mnt_id == LSMT_ROOT && !(flags & LISTMOUNT_UNREACHABLE))
+ mnt = root.mnt;
+ err = do_listmount(mnt, buf, bufsize, &root, flags);
+ path_put(&root);
+ }
+ up_read(&namespace_sem);
+
+ return err;
+}
+
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index ba371024d902..38f3da7e04d1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -413,6 +413,9 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
asmlinkage long sys_statmount(const struct __mount_arg __user *req,
struct statmnt __user *buf, size_t bufsize,
unsigned int flags);
+asmlinkage long sys_listmount(const struct __mount_arg __user *req,
+ u64 __user *buf, size_t bufsize,
+ unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
#if BITS_PER_LONG == 32
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d2c988ab526b..704c408cc662 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -194,4 +194,13 @@ struct __mount_arg {
#define STMT_MNT_POINT 0x00000010U /* Want/got mnt_point */
#define STMT_FS_TYPE 0x00000020U /* Want/got fs_type */

+/* listmount(2) flags */
+#define LISTMOUNT_UNREACHABLE 0x01 /* List unreachable mounts too */
+#define LISTMOUNT_RECURSIVE 0x02 /* List a mount tree */
+
+/*
+ * Special @mnt_id values that can be passed to listmount
+ */
+#define LSMT_ROOT 0xffffffffffffffff /* root mount */
+
#endif /* _UAPI_LINUX_MOUNT_H */
--
2.41.0

2023-10-25 14:40:22

[permalink] [raw]

Subject: [PATCH v4 4/6] add statmount(2) syscall

Add a way to query attributes of a single mount instead of having to parse
the complete /proc/$PID/mountinfo, which might be huge.

Lookup the mount the new 64bit mount ID. If a mount needs to be queried
based on path, then statx(2) can be used to first query the mount ID
belonging to the path.

Design is based on a suggestion by Linus:

"So I'd suggest something that is very much like "statfsat()", which gets
a buffer and a length, and returns an extended "struct statfs" *AND*
just a string description at the end."

The interface closely mimics that of statx.

Handle ASCII attributes by appending after the end of the structure (as per
above suggestion). Pointers to strings are stored in u64 members to make
the structure the same regardless of pointer size. Strings are nul
terminated.

Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/namespace.c | 277 +++++++++++++++++++++++++++++++++++++
include/linux/syscalls.h | 5 +
include/uapi/linux/mount.h | 56 ++++++++
3 files changed, 338 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index 7a33ea391a02..a980c250a3a6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4681,6 +4681,283 @@ int show_path(struct seq_file *m, struct dentry *root)
return 0;
}

+static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
+{
+ struct mount *mnt = mnt_find_id_at(ns, id);
+
+ if (!mnt || mnt->mnt_id_unique != id)
+ return NULL;
+
+ return &mnt->mnt;
+}
+
+struct stmt_state {
+ struct statmnt __user *const buf;
+ size_t const bufsize;
+ struct vfsmount *const mnt;
+ u64 const mask;
+ struct seq_file seq;
+ struct path root;
+ struct statmnt sm;
+ size_t pos;
+ int err;
+};
+
+typedef int (*stmt_func_t)(struct stmt_state *);
+
+static int stmt_string_seq(struct stmt_state *s, stmt_func_t func)
+{
+ size_t rem = s->bufsize - s->pos - sizeof(s->sm);
+ struct seq_file *seq = &s->seq;
+ int ret;
+
+ seq->count = 0;
+ seq->size = min(seq->size, rem);
+ seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
+ if (!seq->buf)
+ return -ENOMEM;
+
+ ret = func(s);
+ if (ret)
+ return ret;
+
+ if (seq_has_overflowed(seq)) {
+ if (seq->size == rem)
+ return -EOVERFLOW;
+ seq->size *= 2;
+ if (seq->size > MAX_RW_COUNT)
+ return -ENOMEM;
+ kvfree(seq->buf);
+ return 0;
+ }
+
+ /* Done */
+ return 1;
+}
+
+static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
+ u32 *str)
+{
+ int ret = s->pos + sizeof(s->sm) >= s->bufsize ? -EOVERFLOW : 0;
+ struct statmnt *sm = &s->sm;
+ struct seq_file *seq = &s->seq;
+
+ if (s->err || !(s->mask & mask))
+ return;
+
+ seq->size = PAGE_SIZE;
+ while (!ret)
+ ret = stmt_string_seq(s, func);
+
+ if (ret < 0) {
+ s->err = ret;
+ } else {
+ seq->buf[seq->count++] = '\0';
+ if (copy_to_user(s->buf->str + s->pos, seq->buf, seq->count)) {
+ s->err = -EFAULT;
+ } else {
+ *str = s->pos;
+ s->pos += seq->count;
+ }
+ }
+ kvfree(seq->buf);
+ sm->mask |= mask;
+}
+
+static void stmt_numeric(struct stmt_state *s, u64 mask, stmt_func_t func)
+{
+ if (s->err || !(s->mask & mask))
+ return;
+
+ s->err = func(s);
+ s->sm.mask |= mask;
+}
+
+static u64 mnt_to_attr_flags(struct vfsmount *mnt)
+{
+ unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
+ u64 attr_flags = 0;
+
+ if (mnt_flags & MNT_READONLY)
+ attr_flags |= MOUNT_ATTR_RDONLY;
+ if (mnt_flags & MNT_NOSUID)
+ attr_flags |= MOUNT_ATTR_NOSUID;
+ if (mnt_flags & MNT_NODEV)
+ attr_flags |= MOUNT_ATTR_NODEV;
+ if (mnt_flags & MNT_NOEXEC)
+ attr_flags |= MOUNT_ATTR_NOEXEC;
+ if (mnt_flags & MNT_NODIRATIME)
+ attr_flags |= MOUNT_ATTR_NODIRATIME;
+ if (mnt_flags & MNT_NOSYMFOLLOW)
+ attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
+
+ if (mnt_flags & MNT_NOATIME)
+ attr_flags |= MOUNT_ATTR_NOATIME;
+ else if (mnt_flags & MNT_RELATIME)
+ attr_flags |= MOUNT_ATTR_RELATIME;
+ else
+ attr_flags |= MOUNT_ATTR_STRICTATIME;
+
+ if (is_idmapped_mnt(mnt))
+ attr_flags |= MOUNT_ATTR_IDMAP;
+
+ return attr_flags;
+}
+
+static u64 mnt_to_propagation_flags(struct mount *m)
+{
+ u64 propagation = 0;
+
+ if (IS_MNT_SHARED(m))
+ propagation |= MS_SHARED;
+ if (IS_MNT_SLAVE(m))
+ propagation |= MS_SLAVE;
+ if (IS_MNT_UNBINDABLE(m))
+ propagation |= MS_UNBINDABLE;
+ if (!propagation)
+ propagation |= MS_PRIVATE;
+
+ return propagation;
+}
+
+static int stmt_sb_basic(struct stmt_state *s)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ s->sm.sb_dev_major = MAJOR(sb->s_dev);
+ s->sm.sb_dev_minor = MINOR(sb->s_dev);
+ s->sm.sb_magic = sb->s_magic;
+ s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
+
+ return 0;
+}
+
+static int stmt_mnt_basic(struct stmt_state *s)
+{
+ struct mount *m = real_mount(s->mnt);
+
+ s->sm.mnt_id = m->mnt_id_unique;
+ s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
+ s->sm.mnt_id_old = m->mnt_id;
+ s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
+ s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
+ s->sm.mnt_propagation = mnt_to_propagation_flags(m);
+ s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+ s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
+
+ return 0;
+}
+
+static int stmt_propagate_from(struct stmt_state *s)
+{
+ struct mount *m = real_mount(s->mnt);
+
+ if (!IS_MNT_SLAVE(m))
+ return 0;
+
+ s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
+
+ return 0;
+}
+
+static int stmt_mnt_root(struct stmt_state *s)
+{
+ struct seq_file *seq = &s->seq;
+ int err = show_path(seq, s->mnt->mnt_root);
+
+ if (!err && !seq_has_overflowed(seq)) {
+ seq->buf[seq->count] = '\0';
+ seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
+ }
+ return err;
+}
+
+static int stmt_mnt_point(struct stmt_state *s)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+ int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
+
+ return err == SEQ_SKIP ? 0 : err;
+}
+
+static int stmt_fs_type(struct stmt_state *s)
+{
+ struct seq_file *seq = &s->seq;
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ seq_puts(seq, sb->s_type->name);
+ return 0;
+}
+
+static int do_statmount(struct stmt_state *s)
+{
+ struct statmnt *sm = &s->sm;
+ struct mount *m = real_mount(s->mnt);
+ size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
+ int err;
+
+ err = security_sb_statfs(s->mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (!capable(CAP_SYS_ADMIN) &&
+ !is_path_reachable(m, m->mnt.mnt_root, &s->root))
+ return -EPERM;
+
+ stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
+ stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
+ stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
+ stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
+ stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
+ stmt_string(s, STMT_MNT_POINT, stmt_mnt_point, &sm->mnt_point);
+
+ if (s->err)
+ return s->err;
+
+ /* Return the number of bytes copied to the buffer */
+ sm->size = copysize + s->pos;
+
+ if (copy_to_user(s->buf, sm, copysize))
+ return -EFAULT;
+
+ return 0;
+}
+
+SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
+ struct statmnt __user *, buf, size_t, bufsize,
+ unsigned int, flags)
+{
+ struct vfsmount *mnt;
+ struct __mount_arg kreq;
+ int ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (copy_from_user(&kreq, req, sizeof(kreq)))
+ return -EFAULT;
+
+ down_read(&namespace_sem);
+ mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
+ ret = -ENOENT;
+ if (mnt) {
+ struct stmt_state s = {
+ .mask = kreq.request_mask,
+ .buf = buf,
+ .bufsize = bufsize,
+ .mnt = mnt,
+ };
+
+ get_fs_root(current->fs, &s.root);
+ ret = do_statmount(&s);
+ path_put(&s.root);
+ }
+ up_read(&namespace_sem);
+
+ return ret;
+}
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..ba371024d902 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -74,6 +74,8 @@ struct landlock_ruleset_attr;
enum landlock_rule_type;
struct cachestat_range;
struct cachestat;
+struct statmnt;
+struct __mount_arg;

#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -408,6 +410,9 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
struct statfs64 __user *buf);
+asmlinkage long sys_statmount(const struct __mount_arg __user *req,
+ struct statmnt __user *buf, size_t bufsize,
+ unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
#if BITS_PER_LONG == 32
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index bb242fdcfe6b..d2c988ab526b 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -138,4 +138,60 @@ struct mount_attr {
/* List of all mount_attr versions. */
#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */

+
+/*
+ * Structure for getting mount/superblock/filesystem info with statmount(2).
+ *
+ * The interface is similar to statx(2): individual fields or groups can be
+ * selected with the @mask argument of statmount(). Kernel will set the @mask
+ * field according to the supported fields.
+ *
+ * If string fields are selected, then the caller needs to pass a buffer that
+ * has space after the fixed part of the structure. Nul terminated strings are
+ * copied there and offsets relative to @str are stored in the relevant fields.
+ * If the buffer is too small, then EOVERFLOW is returned. The actually used
+ * size is returned in @size.
+ */
+struct statmnt {
+ __u32 size; /* Total size, including strings */
+ __u32 __spare1;
+ __u64 mask; /* What results were written */
+ __u32 sb_dev_major; /* Device ID */
+ __u32 sb_dev_minor;
+ __u64 sb_magic; /* ..._SUPER_MAGIC */
+ __u32 sb_flags; /* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
+ __u32 fs_type; /* [str] Filesystem type */
+ __u64 mnt_id; /* Unique ID of mount */
+ __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */
+ __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */
+ __u32 mnt_parent_id_old;
+ __u64 mnt_attr; /* MOUNT_ATTR_... */
+ __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
+ __u64 mnt_peer_group; /* ID of shared peer group */
+ __u64 mnt_master; /* Mount receives propagation from this ID */
+ __u64 propagate_from; /* Propagation from in current namespace */
+ __u32 mnt_root; /* [str] Root of mount relative to root of fs */
+ __u32 mnt_point; /* [str] Mountpoint relative to current root */
+ __u64 __spare2[50];
+ char str[]; /* Variable size part containing strings */
+};
+
+/*
+ * To be used on the kernel ABI only for passing 64bit arguments to statmount(2)
+ */
+struct __mount_arg {
+ __u64 mnt_id;
+ __u64 request_mask;
+};
+
+/*
+ * @mask bits for statmount(2)
+ */
+#define STMT_SB_BASIC 0x00000001U /* Want/got sb_... */
+#define STMT_MNT_BASIC 0x00000002U /* Want/got mnt_... */
+#define STMT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */
+#define STMT_MNT_ROOT 0x00000008U /* Want/got mnt_root */
+#define STMT_MNT_POINT 0x00000010U /* Want/got mnt_point */
+#define STMT_FS_TYPE 0x00000020U /* Want/got fs_type */
+
#endif /* _UAPI_LINUX_MOUNT_H */
--
2.41.0

2023-10-27 03:12:40

[permalink] [raw]

Subject: Re: [PATCH v4 2/6] mounts: keep list of mounts in an rbtree

On 25/10/23 22:02, Miklos Szeredi wrote:
> When adding a mount to a namespace insert it into an rbtree rooted in the
> mnt_namespace instead of a linear list.
>
> The mnt.mnt_list is still used to set up the mount tree and for
> propagation, but not after the mount has been added to a namespace. Hence
> mnt_list can live in union with rb_node. Use MNT_ONRB mount flag to
> validate that the mount is on the correct list.

Is that accurate, propagation occurs at mount and also at umount.

IDG how the change to umount_one() works, it looks like umount_list()

uses mnt_list. It looks like propagate_umount() is also using mnt_list.

Am I missing something obvious?

Ian

>
> This allows removing the cursor used for reading /proc/$PID/mountinfo. The
> mnt_id_unique of the next mount can be used as an index into the seq file.
>
> Tested by inserting 100k bind mounts, unsharing the mount namespace, and
> unmounting. No performance regressions have been observed.
>
> For the last mount in the 100k list the statmount() call was more than 100x
> faster due to the mount ID lookup not having to do a linear search. This
> patch makes the overhead of mount ID lookup non-observable in this range.
>
> Signed-off-by: Miklos Szeredi <[email protected]>
> ---
> fs/mount.h | 24 +++---
> fs/namespace.c | 190 ++++++++++++++++++++----------------------
> fs/pnode.c | 2 +-
> fs/proc_namespace.c | 3 -
> include/linux/mount.h | 5 +-
> 5 files changed, 106 insertions(+), 118 deletions(-)
>
> diff --git a/fs/mount.h b/fs/mount.h
> index a14f762b3f29..4a42fc68f4cc 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -8,19 +8,13 @@
> struct mnt_namespace {
> struct ns_common ns;
> struct mount * root;
> - /*
> - * Traversal and modification of .list is protected by either
> - * - taking namespace_sem for write, OR
> - * - taking namespace_sem for read AND taking .ns_lock.
> - */
> - struct list_head list;
> - spinlock_t ns_lock;
> + struct rb_root mounts; /* Protected by namespace_sem */
> struct user_namespace *user_ns;
> struct ucounts *ucounts;
> u64 seq; /* Sequence number to prevent loops */
> wait_queue_head_t poll;
> u64 event;
> - unsigned int mounts; /* # of mounts in the namespace */
> + unsigned int nr_mounts; /* # of mounts in the namespace */
> unsigned int pending_mounts;
> } __randomize_layout;
>
> @@ -55,7 +49,10 @@ struct mount {
> struct list_head mnt_child; /* and going through their mnt_child */
> struct list_head mnt_instance; /* mount instance on sb->s_mounts */
> const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
> - struct list_head mnt_list;
> + union {
> + struct rb_node mnt_node; /* Under ns->mounts */
> + struct list_head mnt_list;
> + };
> struct list_head mnt_expire; /* link in fs-specific expiry list */
> struct list_head mnt_share; /* circular list of shared mounts */
> struct list_head mnt_slave_list;/* list of slave mounts */
> @@ -128,7 +125,6 @@ struct proc_mounts {
> struct mnt_namespace *ns;
> struct path root;
> int (*show)(struct seq_file *, struct vfsmount *);
> - struct mount cursor;
> };
>
> extern const struct seq_operations mounts_op;
> @@ -147,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
> return ns->seq == 0;
> }
>
> +static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
> +{
> + WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
> + mnt->mnt.mnt_flags &= ~MNT_ONRB;
> + rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
> + list_add_tail(&mnt->mnt_list, dt_list);
> +}
> +
> extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e02bc5f41c7b..0eab47ffc76c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -732,21 +732,6 @@ struct vfsmount *lookup_mnt(const struct path *path)
> return m;
> }
>
> -static inline void lock_ns_list(struct mnt_namespace *ns)
> -{
> - spin_lock(&ns->ns_lock);
> -}
> -
> -static inline void unlock_ns_list(struct mnt_namespace *ns)
> -{
> - spin_unlock(&ns->ns_lock);
> -}
> -
> -static inline bool mnt_is_cursor(struct mount *mnt)
> -{
> - return mnt->mnt.mnt_flags & MNT_CURSOR;
> -}
> -
> /*
> * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
> * current mount namespace.
> @@ -765,19 +750,15 @@ static inline bool mnt_is_cursor(struct mount *mnt)
> bool __is_local_mountpoint(struct dentry *dentry)
> {
> struct mnt_namespace *ns = current->nsproxy->mnt_ns;
> - struct mount *mnt;
> + struct mount *mnt, *n;
> bool is_covered = false;
>
> down_read(&namespace_sem);
> - lock_ns_list(ns);
> - list_for_each_entry(mnt, &ns->list, mnt_list) {
> - if (mnt_is_cursor(mnt))
> - continue;
> + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
> is_covered = (mnt->mnt_mountpoint == dentry);
> if (is_covered)
> break;
> }
> - unlock_ns_list(ns);
> up_read(&namespace_sem);
>
> return is_covered;
> @@ -1024,6 +1005,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
> mnt_add_count(old_parent, -1);
> }
>
> +static inline struct mount *node_to_mount(struct rb_node *node)
> +{
> + return rb_entry(node, struct mount, mnt_node);
> +}
> +
> +static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
> +{
> + struct rb_node **link = &ns->mounts.rb_node;
> + struct rb_node *parent = NULL;
> +
> + WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
> + mnt->mnt_ns = ns;
> + while (*link) {
> + parent = *link;
> + if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
> + link = &parent->rb_left;
> + else
> + link = &parent->rb_right;
> + }
> + rb_link_node(&mnt->mnt_node, parent, link);
> + rb_insert_color(&mnt->mnt_node, &ns->mounts);
> + mnt->mnt.mnt_flags |= MNT_ONRB;
> +}
> +
> /*
> * vfsmount lock must be held for write
> */
> @@ -1037,12 +1042,13 @@ static void commit_tree(struct mount *mnt)
> BUG_ON(parent == mnt);
>
> list_add_tail(&head, &mnt->mnt_list);
> - list_for_each_entry(m, &head, mnt_list)
> - m->mnt_ns = n;
> + while (!list_empty(&head)) {
> + m = list_first_entry(&head, typeof(*m), mnt_list);
> + list_del(&m->mnt_list);
>
> - list_splice(&head, n->list.prev);
> -
> - n->mounts += n->pending_mounts;
> + mnt_add_to_ns(n, m);
> + }
> + n->nr_mounts += n->pending_mounts;
> n->pending_mounts = 0;
>
> __attach_mnt(mnt, parent);
> @@ -1190,7 +1196,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
> }
>
> mnt->mnt.mnt_flags = old->mnt.mnt_flags;
> - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
> + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
>
> atomic_inc(&sb->s_active);
> mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
> @@ -1415,65 +1421,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
> return &p->mnt;
> }
>
> -#ifdef CONFIG_PROC_FS
> -static struct mount *mnt_list_next(struct mnt_namespace *ns,
> - struct list_head *p)
> +/*
> + * Returns the mount which either has the specified mnt_id, or has the next
> + * smallest id afer the specified one.
> + */
> +static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
> {
> - struct mount *mnt, *ret = NULL;
> + struct rb_node *node = ns->mounts.rb_node;
> + struct mount *ret = NULL;
>
> - lock_ns_list(ns);
> - list_for_each_continue(p, &ns->list) {
> - mnt = list_entry(p, typeof(*mnt), mnt_list);
> - if (!mnt_is_cursor(mnt)) {
> - ret = mnt;
> - break;
> + while (node) {
> + struct mount *m = node_to_mount(node);
> +
> + if (mnt_id <= m->mnt_id_unique) {
> + ret = node_to_mount(node);
> + if (mnt_id == m->mnt_id_unique)
> + break;
> + node = node->rb_left;
> + } else {
> + node = node->rb_right;
> }
> }
> - unlock_ns_list(ns);
> -
> return ret;
> }
>
> +#ifdef CONFIG_PROC_FS
> +
> /* iterator; we want it to have access to namespace_sem, thus here... */
> static void *m_start(struct seq_file *m, loff_t *pos)
> {
> struct proc_mounts *p = m->private;
> - struct list_head *prev;
>
> down_read(&namespace_sem);
> - if (!*pos) {
> - prev = &p->ns->list;
> - } else {
> - prev = &p->cursor.mnt_list;
>
> - /* Read after we'd reached the end? */
> - if (list_empty(prev))
> - return NULL;
> - }
> -
> - return mnt_list_next(p->ns, prev);
> + return mnt_find_id_at(p->ns, *pos);
> }
>
> static void *m_next(struct seq_file *m, void *v, loff_t *pos)
> {
> - struct proc_mounts *p = m->private;
> - struct mount *mnt = v;
> + struct mount *next = NULL, *mnt = v;
> + struct rb_node *node = rb_next(&mnt->mnt_node);
>
> ++*pos;
> - return mnt_list_next(p->ns, &mnt->mnt_list);
> + if (node) {
> + next = node_to_mount(node);
> + *pos = next->mnt_id_unique;
> + }
> + return next;
> }
>
> static void m_stop(struct seq_file *m, void *v)
> {
> - struct proc_mounts *p = m->private;
> - struct mount *mnt = v;
> -
> - lock_ns_list(p->ns);
> - if (mnt)
> - list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
> - else
> - list_del_init(&p->cursor.mnt_list);
> - unlock_ns_list(p->ns);
> up_read(&namespace_sem);
> }
>
> @@ -1491,14 +1489,6 @@ const struct seq_operations mounts_op = {
> .show = m_show,
> };
>
> -void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
> -{
> - down_read(&namespace_sem);
> - lock_ns_list(ns);
> - list_del(&cursor->mnt_list);
> - unlock_ns_list(ns);
> - up_read(&namespace_sem);
> -}
> #endif /* CONFIG_PROC_FS */
>
> /**
> @@ -1640,7 +1630,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> /* Gather the mounts to umount */
> for (p = mnt; p; p = next_mnt(p, mnt)) {
> p->mnt.mnt_flags |= MNT_UMOUNT;
> - list_move(&p->mnt_list, &tmp_list);
> + if (p->mnt.mnt_flags & MNT_ONRB)
> + move_from_ns(p, &tmp_list);
> + else
> + list_move(&p->mnt_list, &tmp_list);
> }
>
> /* Hide the mounts from mnt_mounts */
> @@ -1660,7 +1653,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> list_del_init(&p->mnt_list);
> ns = p->mnt_ns;
> if (ns) {
> - ns->mounts--;
> + ns->nr_mounts--;
> __touch_mnt_namespace(ns);
> }
> p->mnt_ns = NULL;
> @@ -1786,14 +1779,16 @@ static int do_umount(struct mount *mnt, int flags)
>
> event++;
> if (flags & MNT_DETACH) {
> - if (!list_empty(&mnt->mnt_list))
> + if (mnt->mnt.mnt_flags & MNT_ONRB ||
> + !list_empty(&mnt->mnt_list))
> umount_tree(mnt, UMOUNT_PROPAGATE);
> retval = 0;
> } else {
> shrink_submounts(mnt);
> retval = -EBUSY;
> if (!propagate_mount_busy(mnt, 2)) {
> - if (!list_empty(&mnt->mnt_list))
> + if (mnt->mnt.mnt_flags & MNT_ONRB ||
> + !list_empty(&mnt->mnt_list))
> umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
> retval = 0;
> }
> @@ -2211,9 +2206,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
> unsigned int mounts = 0;
> struct mount *p;
>
> - if (ns->mounts >= max)
> + if (ns->nr_mounts >= max)
> return -ENOSPC;
> - max -= ns->mounts;
> + max -= ns->nr_mounts;
> if (ns->pending_mounts >= max)
> return -ENOSPC;
> max -= ns->pending_mounts;
> @@ -2357,8 +2352,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
> touch_mnt_namespace(source_mnt->mnt_ns);
> } else {
> if (source_mnt->mnt_ns) {
> + LIST_HEAD(head);
> +
> /* move from anon - the caller will destroy */
> - list_del_init(&source_mnt->mnt_ns->list);
> + for (p = source_mnt; p; p = next_mnt(p, source_mnt))
> + move_from_ns(p, &head);
> + list_del_init(&head);
> }
> if (beneath)
> mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
> @@ -2669,11 +2668,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
>
> lock_mount_hash();
> for (p = mnt; p; p = next_mnt(p, mnt)) {
> - p->mnt_ns = ns;
> - ns->mounts++;
> + mnt_add_to_ns(ns, p);
> + ns->nr_mounts++;
> }
> ns->root = mnt;
> - list_add_tail(&ns->list, &mnt->mnt_list);
> mntget(&mnt->mnt);
> unlock_mount_hash();
> namespace_unlock();
> @@ -3736,9 +3734,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
> if (!anon)
> new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
> refcount_set(&new_ns->ns.count, 1);
> - INIT_LIST_HEAD(&new_ns->list);
> + new_ns->mounts = RB_ROOT;
> init_waitqueue_head(&new_ns->poll);
> - spin_lock_init(&new_ns->ns_lock);
> new_ns->user_ns = get_user_ns(user_ns);
> new_ns->ucounts = ucounts;
> return new_ns;
> @@ -3785,7 +3782,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
> unlock_mount_hash();
> }
> new_ns->root = new;
> - list_add_tail(&new_ns->list, &new->mnt_list);
>
> /*
> * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
> @@ -3795,8 +3791,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
> p = old;
> q = new;
> while (p) {
> - q->mnt_ns = new_ns;
> - new_ns->mounts++;
> + mnt_add_to_ns(new_ns, q);
> + new_ns->nr_mounts++;
> if (new_fs) {
> if (&p->mnt == new_fs->root.mnt) {
> new_fs->root.mnt = mntget(&q->mnt);
> @@ -3838,10 +3834,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
> mntput(m);
> return ERR_CAST(ns);
> }
> - mnt->mnt_ns = ns;
> ns->root = mnt;
> - ns->mounts++;
> - list_add(&mnt->mnt_list, &ns->list);
> + ns->nr_mounts++;
> + mnt_add_to_ns(ns, mnt);
>
> err = vfs_path_lookup(m->mnt_root, m,
> name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
> @@ -4019,10 +4014,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
> goto err_path;
> }
> mnt = real_mount(newmount.mnt);
> - mnt->mnt_ns = ns;
> ns->root = mnt;
> - ns->mounts = 1;
> - list_add(&mnt->mnt_list, &ns->list);
> + ns->nr_mounts = 1;
> + mnt_add_to_ns(ns, mnt);
> mntget(newmount.mnt);
>
> /* Attach to an apparent O_PATH fd with a note that we need to unmount
> @@ -4693,10 +4687,9 @@ static void __init init_mount_tree(void)
> if (IS_ERR(ns))
> panic("Can't allocate initial namespace");
> m = real_mount(mnt);
> - m->mnt_ns = ns;
> ns->root = m;
> - ns->mounts = 1;
> - list_add(&m->mnt_list, &ns->list);
> + ns->nr_mounts = 1;
> + mnt_add_to_ns(ns, m);
> init_task.nsproxy->mnt_ns = ns;
> get_mnt_ns(ns);
>
> @@ -4823,18 +4816,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
> int *new_mnt_flags)
> {
> int new_flags = *new_mnt_flags;
> - struct mount *mnt;
> + struct mount *mnt, *n;
> bool visible = false;
>
> down_read(&namespace_sem);
> - lock_ns_list(ns);
> - list_for_each_entry(mnt, &ns->list, mnt_list) {
> + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
> struct mount *child;
> int mnt_flags;
>
> - if (mnt_is_cursor(mnt))
> - continue;
> -
> if (mnt->mnt.mnt_sb->s_type != sb->s_type)
> continue;
>
> @@ -4882,7 +4871,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
> next: ;
> }
> found:
> - unlock_ns_list(ns);
> up_read(&namespace_sem);
> return visible;
> }
> diff --git a/fs/pnode.c b/fs/pnode.c
> index e4d0340393d5..a799e0315cc9 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount)
> mnt->mnt.mnt_flags |= MNT_UMOUNT;
> list_del_init(&mnt->mnt_child);
> list_del_init(&mnt->mnt_umounting);
> - list_move_tail(&mnt->mnt_list, to_umount);
> + move_from_ns(mnt, to_umount);
> }
>
> /*
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 250eb5bf7b52..73d2274d5f59 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -283,8 +283,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
> p->ns = ns;
> p->root = root;
> p->show = show;
> - INIT_LIST_HEAD(&p->cursor.mnt_list);
> - p->cursor.mnt.mnt_flags = MNT_CURSOR;
>
> return 0;
>
> @@ -301,7 +299,6 @@ static int mounts_release(struct inode *inode, struct file *file)
> struct seq_file *m = file->private_data;
> struct proc_mounts *p = m->private;
> path_put(&p->root);
> - mnt_cursor_del(p->ns, &p->cursor);
> put_mnt_ns(p->ns);
> return seq_release_private(inode, file);
> }
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 4f40b40306d0..7952eddc835c 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -50,8 +50,7 @@ struct path;
> #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
>
> #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
> - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
> - MNT_CURSOR)
> + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB)
>
> #define MNT_INTERNAL 0x4000
>
> @@ -65,7 +64,7 @@ struct path;
> #define MNT_SYNC_UMOUNT 0x2000000
> #define MNT_MARKED 0x4000000
> #define MNT_UMOUNT 0x8000000
> -#define MNT_CURSOR 0x10000000
> +#define MNT_ONRB 0x10000000
>
> struct vfsmount {
> struct dentry *mnt_root; /* root of the mounted tree */

2023-10-27 08:18:41

[permalink] [raw]

Subject: Re: [PATCH v4 2/6] mounts: keep list of mounts in an rbtree

On Fri, Oct 27, 2023 at 5:12 AM Ian Kent <[email protected]> wrote:
>
> On 25/10/23 22:02, Miklos Szeredi wrote:

> > The mnt.mnt_list is still used to set up the mount tree and for
> > propagation, but not after the mount has been added to a namespace. Hence
> > mnt_list can live in union with rb_node. Use MNT_ONRB mount flag to
> > validate that the mount is on the correct list.
>
> Is that accurate, propagation occurs at mount and also at umount.

When propagating a mount, the new mount's mnt_list is used as a head
for the new propagated mounts. These are then moved to the rb tree by
commit_tree().

When umounting there's a "to umount" list called tmp_list in
umount_tree(), this list is used to collect direct umounts and then
propagated umounts. The direct umounts are added in umount_tree(),
the propagated ones umount_one().

Note: umount_tree() can be called on a not yet finished mount, in that
case the mounts are still on mnt_list, so umount_tree() needs to deal
with both.

> IDG how the change to umount_one() works, it looks like umount_list()
>
> uses mnt_list. It looks like propagate_umount() is also using mnt_list.
>
>
> Am I missing something obvious?

So when a mount is part of a namespace (either anonymous or not) it is
on the rb tree, when not then it can temporarily be on mnt_list.
MNT_ONRB flag is used to validate that the mount is on the list that
we expect it to be on, but also to detect the case of the mount setup
being aborted.

We could handle the second case differently, since we should be able
to tell when we are removing the mount from a namespace and when we
are aborting a mount, but this was the least invasive way to do this.

Thanks,
Miklos

2023-10-28 01:37:55

[permalink] [raw]

Subject: Re: [PATCH v4 2/6] mounts: keep list of mounts in an rbtree

On 27/10/23 16:17, Miklos Szeredi wrote:
> On Fri, Oct 27, 2023 at 5:12 AM Ian Kent <[email protected]> wrote:
>> On 25/10/23 22:02, Miklos Szeredi wrote:
>>> The mnt.mnt_list is still used to set up the mount tree and for
>>> propagation, but not after the mount has been added to a namespace. Hence
>>> mnt_list can live in union with rb_node. Use MNT_ONRB mount flag to
>>> validate that the mount is on the correct list.
>> Is that accurate, propagation occurs at mount and also at umount.
> When propagating a mount, the new mount's mnt_list is used as a head
> for the new propagated mounts. These are then moved to the rb tree by
> commit_tree().
>
> When umounting there's a "to umount" list called tmp_list in
> umount_tree(), this list is used to collect direct umounts and then
> propagated umounts. The direct umounts are added in umount_tree(),
> the propagated ones umount_one().
>
> Note: umount_tree() can be called on a not yet finished mount, in that
> case the mounts are still on mnt_list, so umount_tree() needs to deal
> with both.
>
>> IDG how the change to umount_one() works, it looks like umount_list()
>>
>> uses mnt_list. It looks like propagate_umount() is also using mnt_list.
>>
>>
>> Am I missing something obvious?
> So when a mount is part of a namespace (either anonymous or not) it is
> on the rb tree, when not then it can temporarily be on mnt_list.
> MNT_ONRB flag is used to validate that the mount is on the list that
> we expect it to be on, but also to detect the case of the mount setup
> being aborted.
>
> We could handle the second case differently, since we should be able
> to tell when we are removing the mount from a namespace and when we
> are aborting a mount, but this was the least invasive way to do this.

Thanks for the explanation, what you've said is essentially what I

understood reading the series.

But I still haven't quite got this so I'll need to spend more time

on this part of the patch series.

That's not a problem, ;).

Ian

>
> Thanks,
> Miklos
>

2023-10-30 05:38:05

[permalink] [raw]

Subject: Re: [PATCH v4 2/6] mounts: keep list of mounts in an rbtree

On 28/10/23 09:36, Ian Kent wrote:
> On 27/10/23 16:17, Miklos Szeredi wrote:
>> On Fri, Oct 27, 2023 at 5:12 AM Ian Kent <[email protected]> wrote:
>>> On 25/10/23 22:02, Miklos Szeredi wrote:
>>>> The mnt.mnt_list is still used to set up the mount tree and for
>>>> propagation, but not after the mount has been added to a
>>>> namespace. Hence
>>>> mnt_list can live in union with rb_node. Use MNT_ONRB mount flag to
>>>> validate that the mount is on the correct list.
>>> Is that accurate, propagation occurs at mount and also at umount.
>> When propagating a mount, the new mount's mnt_list is used as a head
>> for the new propagated mounts. These are then moved to the rb tree by
>> commit_tree().
>>
>> When umounting there's a "to umount" list called tmp_list in
>> umount_tree(), this list is used to collect direct umounts and then
>> propagated umounts. The direct umounts are added in umount_tree(),
>> the propagated ones umount_one().
>>
>> Note: umount_tree() can be called on a not yet finished mount, in that
>> case the mounts are still on mnt_list, so umount_tree() needs to deal
>> with both.
>>
>>> IDG how the change to umount_one() works, it looks like umount_list()
>>>
>>> uses mnt_list. It looks like propagate_umount() is also using mnt_list.
>>>
>>>
>>> Am I missing something obvious?
>> So when a mount is part of a namespace (either anonymous or not) it is
>> on the rb tree, when not then it can temporarily be on mnt_list.
>> MNT_ONRB flag is used to validate that the mount is on the list that
>> we expect it to be on, but also to detect the case of the mount setup
>> being aborted.
>>
>> We could handle the second case differently, since we should be able
>> to tell when we are removing the mount from a namespace and when we
>> are aborting a mount, but this was the least invasive way to do this.
>
> Thanks for the explanation, what you've said is essentially what I
>
> understood reading the series.
>
>
> But I still haven't quite got this so I'll need to spend more time
>
> on this part of the patch series.
>
>
> That's not a problem, ;).

After cloning your git tree and looking in there I don't see what

I was concerned about so I think I was confused by obscurity by

diff rather than seeing a real problem, ;)

Still that union worries me a little bit so I'll keep looking at

the code for a while.

>
>
> Ian
>
>>
>> Thanks,
>> Miklos
>>
>

2023-10-30 05:46:30

[permalink] [raw]

Subject: Re: [PATCH v4 2/6] mounts: keep list of mounts in an rbtree

On 30/10/23 13:37, Ian Kent wrote:
> On 28/10/23 09:36, Ian Kent wrote:
>> On 27/10/23 16:17, Miklos Szeredi wrote:
>>> On Fri, Oct 27, 2023 at 5:12 AM Ian Kent <[email protected]> wrote:
>>>> On 25/10/23 22:02, Miklos Szeredi wrote:
>>>>> The mnt.mnt_list is still used to set up the mount tree and for
>>>>> propagation, but not after the mount has been added to a
>>>>> namespace. Hence
>>>>> mnt_list can live in union with rb_node. Use MNT_ONRB mount flag to
>>>>> validate that the mount is on the correct list.
>>>> Is that accurate, propagation occurs at mount and also at umount.
>>> When propagating a mount, the new mount's mnt_list is used as a head
>>> for the new propagated mounts. These are then moved to the rb tree by
>>> commit_tree().
>>>
>>> When umounting there's a "to umount" list called tmp_list in
>>> umount_tree(), this list is used to collect direct umounts and then
>>> propagated umounts. The direct umounts are added in umount_tree(),
>>> the propagated ones umount_one().
>>>
>>> Note: umount_tree() can be called on a not yet finished mount, in that
>>> case the mounts are still on mnt_list, so umount_tree() needs to deal
>>> with both.
>>>
>>>> IDG how the change to umount_one() works, it looks like umount_list()
>>>>
>>>> uses mnt_list. It looks like propagate_umount() is also using
>>>> mnt_list.
>>>>
>>>>
>>>> Am I missing something obvious?
>>> So when a mount is part of a namespace (either anonymous or not) it is
>>> on the rb tree, when not then it can temporarily be on mnt_list.
>>> MNT_ONRB flag is used to validate that the mount is on the list that
>>> we expect it to be on, but also to detect the case of the mount setup
>>> being aborted.
>>>
>>> We could handle the second case differently, since we should be able
>>> to tell when we are removing the mount from a namespace and when we
>>> are aborting a mount, but this was the least invasive way to do this.
>>
>> Thanks for the explanation, what you've said is essentially what I
>>
>> understood reading the series.
>>
>>
>> But I still haven't quite got this so I'll need to spend more time
>>
>> on this part of the patch series.
>>
>>
>> That's not a problem, ;).
>
> After cloning your git tree and looking in there I don't see what
>
> I was concerned about so I think I was confused by obscurity by
>
> diff rather than seeing a real problem, ;)
>
>
> Still that union worries me a little bit so I'll keep looking at
>
> the code for a while.

Is fs/namespace.c:iterate_mounts() a problem?

It's called from:

1) ./kernel/audit_tree.c:709: if (iterate_mounts(compare_root,
2) ./kernel/audit_tree.c:839: err = iterate_mounts(tag_mount, tree, mnt);
3) ./kernel/audit_tree.c:917: failed = iterate_mounts(tag_mount,
tree, tagged);

From functions 1) audit_trim_trees(), 2) audit_add_tree_rule() and

3) audit_tag_tree().

>
>
>>
>>
>> Ian
>>
>>>
>>> Thanks,
>>> Miklos
>>>
>>
>

2023-10-30 09:07:20

[permalink] [raw]

Subject: Re: [PATCH v4 2/6] mounts: keep list of mounts in an rbtree

On Mon, Oct 30, 2023 at 6:45 AM Ian Kent <[email protected]> wrote:

> Is fs/namespace.c:iterate_mounts() a problem?
>
> It's called from:
>
> 1) ./kernel/audit_tree.c:709: if (iterate_mounts(compare_root,
> 2) ./kernel/audit_tree.c:839: err = iterate_mounts(tag_mount, tree, mnt);
> 3) ./kernel/audit_tree.c:917: failed = iterate_mounts(tag_mount,
> tree, tagged);
>
>
> From functions 1) audit_trim_trees(), 2) audit_add_tree_rule() and
>
> 3) audit_tag_tree().

So that interface works like this:

- collect_mounts() creates a temporary copy of a mount tree, mounts
are chained on mnt_list.

- iterate_mounts() is used to do some work on the temporary tree

- drop_collected_mounts() frees the temporary tree

These mounts are never installed in a namespace. My guess is that a
private copy is used instead of the original mount tree to prevent
races.

Thanks,
Miklos

2023-10-31 01:24:23

[permalink] [raw]

Subject: Re: [PATCH v4 2/6] mounts: keep list of mounts in an rbtree

On 30/10/23 17:06, Miklos Szeredi wrote:
> On Mon, Oct 30, 2023 at 6:45 AM Ian Kent <[email protected]> wrote:
>
>> Is fs/namespace.c:iterate_mounts() a problem?
>>
>> It's called from:
>>
>> 1) ./kernel/audit_tree.c:709: if (iterate_mounts(compare_root,
>> 2) ./kernel/audit_tree.c:839: err = iterate_mounts(tag_mount, tree, mnt);
>> 3) ./kernel/audit_tree.c:917: failed = iterate_mounts(tag_mount,
>> tree, tagged);
>>
>>
>> From functions 1) audit_trim_trees(), 2) audit_add_tree_rule() and
>>
>> 3) audit_tag_tree().
> So that interface works like this:
>
> - collect_mounts() creates a temporary copy of a mount tree, mounts
> are chained on mnt_list.

Right, sorry for the noise, I didn't look far enough.

Ian

>
> - iterate_mounts() is used to do some work on the temporary tree
>
> - drop_collected_mounts() frees the temporary tree
>
> These mounts are never installed in a namespace. My guess is that a
> private copy is used instead of the original mount tree to prevent
> races.
>
> Thanks,
> Miklos
>

2023-11-01 11:13:42

by Christian Brauner

[permalink] [raw]

Subject: Re: [PATCH v4 0/6] querying mount attributes

On Wed, 25 Oct 2023 16:01:58 +0200, Miklos Szeredi wrote:
> Implement mount querying syscalls agreed on at LSF/MM 2023.
>
> Features:
>
> - statx-like want/got mask
> - allows returning ascii strings (fs type, root, mount point)
> - returned buffer is relocatable (no pointers)
>
> [...]

I think we should start showing clear signs of commitment to this. In
absence of strong objections I don't see a reason to let this rot on
list until we forget about it. Maybe this will entice people to provide
more reviews as well.

It's all pretty close to what we discussed at LSFMM23 and we stated that
we aim to merge something by the end of the year. Let's see if that can
actually happen.

I don't have huge quarrels with this. Yes, there's stuff I'd like to see
done differently but nothing I consider blockers. So let's get this
into -next once rc1 is out so it can get a full cycle of exposure.

I've renamed struct statmnt to struct statmount to align with statx()
and struct statx. I also renamed struct stmt_state to struct kstatmount
as that's how we usually do this. And I renamed struct __mount_arg to
struct mnt_id_req and dropped the comment. Libraries can expose this in
whatever form they want but we'll also have direct consumers. I'd rather
have this struct be underscore free and officially sanctioned.

---

Applied to the vfs.mount branch of the vfs/vfs.git tree.
Patches in the vfs.mount branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs.master

[1/6] add unique mount ID
https://git.kernel.org/vfs/vfs/c/ec873c3baa0c
[2/6] mounts: keep list of mounts in an rbtree
https://git.kernel.org/vfs/vfs/c/f15247ad234c
[3/6] namespace: extract show_path() helper
https://git.kernel.org/vfs/vfs/c/6e5f64ac5382
[4/6] add statmount(2) syscall
https://git.kernel.org/vfs/vfs/c/edf3b2ac1bd5
[5/6] add listmount(2) syscall
https://git.kernel.org/vfs/vfs/c/4412ca803757
[6/6] wire up syscalls for statmount/listmount
https://git.kernel.org/vfs/vfs/c/d0a56e829d2c

2023-11-01 11:53:37

[permalink] [raw]

Subject: Re: [PATCH v4 0/6] querying mount attributes

On 25/10/23 22:01, Miklos Szeredi wrote:
> Implement mount querying syscalls agreed on at LSF/MM 2023.
>
> Features:
>
> - statx-like want/got mask
> - allows returning ascii strings (fs type, root, mount point)
> - returned buffer is relocatable (no pointers)
>
> Still missing:
> - man pages
> - kselftest
>
> Please find the test utility at the end of this mail.
>
> Usage: statmnt [-l|-r] [-u] (mnt_id|path)
>
> Git tree:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git#statmount-v4
>
>
> Changes v3..v4:
>
> - incorporate patch moving list of mounts to an rbtree
> - wire up syscalls for all archs
> - add LISTMOUNT_RECURSIVE (depth first iteration of mount tree)
> - add LSMT_ROOT (list root instead of a specific mount ID)
> - list_for_each_entry_del() moved to a separate patchset
>
> Changes v1..v3:
>
> - rename statmnt(2) -> statmount(2)
> - rename listmnt(2) -> listmount(2)
> - make ABI 32bit compatible by passing 64bit args in a struct (tested on
> i386 and x32)
> - only accept new 64bit mount IDs
> - fix compile on !CONFIG_PROC_FS
> - call security_sb_statfs() in both syscalls
> - make lookup_mnt_in_ns() static
> - add LISTMOUNT_UNREACHABLE flag to listmnt() to explicitly ask for
> listing unreachable mounts
> - remove .sb_opts
> - remove subtype from .fs_type
> - return the number of bytes used (including strings) in .size
> - rename .mountpoint -> .mnt_point
> - point strings by an offset against char[] VLA at the end of the struct.
> E.g. printf("fs_type: %s\n", st->str + st->fs_type);
> - don't save string lengths
> - extend spare space in struct statmnt (complete size is now 512 bytes)
>
>
> Miklos Szeredi (6):
> add unique mount ID
> mounts: keep list of mounts in an rbtree
> namespace: extract show_path() helper
> add statmount(2) syscall
> add listmount(2) syscall
> wire up syscalls for statmount/listmount
>
> arch/alpha/kernel/syscalls/syscall.tbl | 3 +
> arch/arm/tools/syscall.tbl | 3 +
> arch/arm64/include/asm/unistd32.h | 4 +
> arch/ia64/kernel/syscalls/syscall.tbl | 3 +
> arch/m68k/kernel/syscalls/syscall.tbl | 3 +
> arch/microblaze/kernel/syscalls/syscall.tbl | 3 +
> arch/mips/kernel/syscalls/syscall_n32.tbl | 3 +
> arch/mips/kernel/syscalls/syscall_n64.tbl | 3 +
> arch/mips/kernel/syscalls/syscall_o32.tbl | 3 +
> arch/parisc/kernel/syscalls/syscall.tbl | 3 +
> arch/powerpc/kernel/syscalls/syscall.tbl | 3 +
> arch/s390/kernel/syscalls/syscall.tbl | 3 +
> arch/sh/kernel/syscalls/syscall.tbl | 3 +
> arch/sparc/kernel/syscalls/syscall.tbl | 3 +
> arch/x86/entry/syscalls/syscall_32.tbl | 3 +
> arch/x86/entry/syscalls/syscall_64.tbl | 2 +
> arch/xtensa/kernel/syscalls/syscall.tbl | 3 +
> fs/internal.h | 2 +
> fs/mount.h | 27 +-
> fs/namespace.c | 573 ++++++++++++++++----
> fs/pnode.c | 2 +-
> fs/proc_namespace.c | 13 +-
> fs/stat.c | 9 +-
> include/linux/mount.h | 5 +-
> include/linux/syscalls.h | 8 +
> include/uapi/asm-generic/unistd.h | 8 +-
> include/uapi/linux/mount.h | 65 +++
> include/uapi/linux/stat.h | 1 +
> 28 files changed, 635 insertions(+), 129 deletions(-)

Looks ok to me,covers the primary cases I needed when I worked

on using fsinfo() in systemd.

Karel, is there anything missing you would need for adding

libmount support?

Reviewed-by: Ian Kent <[email protected]>

>

2023-11-01 13:19:36

[permalink] [raw]

Subject: Re: [PATCH v4 0/6] querying mount attributes

On Wed, Nov 1, 2023 at 12:13 PM Christian Brauner <[email protected]> wrote:

> I've renamed struct statmnt to struct statmount to align with statx()
> and struct statx. I also renamed struct stmt_state to struct kstatmount
> as that's how we usually do this. And I renamed struct __mount_arg to
> struct mnt_id_req and dropped the comment. Libraries can expose this in
> whatever form they want but we'll also have direct consumers. I'd rather
> have this struct be underscore free and officially sanctioned.

Thanks.

arch/arm64/include/asm/unistd.h needs this fixup:

-#define __NR_compat_syscalls 457
+#define __NR_compat_syscalls 459

Can you fix inline, or should I send a proper patch?

Thanks,
Miklos

2023-11-01 15:55:14

by Christian Brauner

[permalink] [raw]

Subject: Re: [PATCH v4 0/6] querying mount attributes

On Wed, Nov 01, 2023 at 02:18:30PM +0100, Miklos Szeredi wrote:
> On Wed, Nov 1, 2023 at 12:13 PM Christian Brauner <[email protected]> wrote:
>
> > I've renamed struct statmnt to struct statmount to align with statx()
> > and struct statx. I also renamed struct stmt_state to struct kstatmount
> > as that's how we usually do this. And I renamed struct __mount_arg to
> > struct mnt_id_req and dropped the comment. Libraries can expose this in
> > whatever form they want but we'll also have direct consumers. I'd rather
> > have this struct be underscore free and officially sanctioned.
>
> Thanks.
>
> arch/arm64/include/asm/unistd.h needs this fixup:
>
> -#define __NR_compat_syscalls 457
> +#define __NR_compat_syscalls 459

Everytime with that file. It's like a tradition that I forget to update
it at least once.

>
> Can you fix inline, or should I send a proper patch?

No need to send. I'll just fix it it here.

2023-11-06 12:12:06

by Karel Zak

[permalink] [raw]

Subject: Re: [PATCH v4 0/6] querying mount attributes

On Wed, Nov 01, 2023 at 07:52:45PM +0800, Ian Kent wrote:
> On 25/10/23 22:01, Miklos Szeredi wrote:
> Looks ok to me,covers the primary cases I needed when I worked
> on using fsinfo() in systemd.

Our work on systemd was about two areas: get mount info (stat/listmount()
now) from the kernel, and get the mount ID from notification.

There was watch_queue.h with WATCH_TYPE_MOUNT_NOTIFY and struct
mount_notification->auxiliary_mount (aka mount ID) and event subtype
to get the change status (new mount, umount, etc.)

For example David's:
https://patchwork.kernel.org/project/linux-security-module/patch/155991711016.15579.4449417925184028666.stgit@warthog.procyon.org.uk/

Do we have any replacement for this?

> Karel, is there anything missing you would need for adding
> libmount support?

Miklos's statmount() and listmount() API is excellent from my point of
view. It looks pretty straightforward to use, and with the unique
mount ID, it's safe too. It will be ideal for things like umount(8)
(and recursive umount, etc.).

For complex scenarios (systemd), we need to get from the kernel the
unique ID's after any change in the mount table to save resources and
call statmount() only for the affected mount node. Parse mountinfo
sucks, call for(listmount(-1)) { statmount() } sucks too :-)

Karel

--
Karel Zak <[email protected]>
http://karelzak.blogspot.com

2023-11-06 13:34:27

by Amir Goldstein

[permalink] [raw]

Subject: Re: [PATCH v4 0/6] querying mount attributes

On Mon, Nov 6, 2023 at 2:11 PM Karel Zak <[email protected]> wrote:
>
> On Wed, Nov 01, 2023 at 07:52:45PM +0800, Ian Kent wrote:
> > On 25/10/23 22:01, Miklos Szeredi wrote:
> > Looks ok to me,covers the primary cases I needed when I worked
> > on using fsinfo() in systemd.
>
> Our work on systemd was about two areas: get mount info (stat/listmount()
> now) from the kernel, and get the mount ID from notification.
>
> There was watch_queue.h with WATCH_TYPE_MOUNT_NOTIFY and struct
> mount_notification->auxiliary_mount (aka mount ID) and event subtype
> to get the change status (new mount, umount, etc.)
>
> For example David's:
> https://patchwork.kernel.org/project/linux-security-module/patch/155991711016.15579.4449417925184028666.stgit@warthog.procyon.org.uk/
>
> Do we have any replacement for this?
>

The plan is to extend fanotify for mount namespace change notifications.

Here is a simple POC for FAN_UNMOUNT notification:

https://lore.kernel.org/linux-fsdevel/[email protected]/

I was waiting for Miklos' patches to land, so that we can report
mnt_id_unique (of mount and its parent mount) in the events.

The plan is to start with setting a mark on a vfsmount to get
FAN_MOUNT/FAN_UNMOUNT notifications for changes to direct
children of that mount.

This part, I was planning to do myself. I cannot say for sure when
I will be able to get to it, but it should be a rather simple patch.

If anybody else would like to volunteer for the task, I will be
happy to assist.

Not sure if we are going to need special notifications for mount
move and mount beneath?

Not sure if we are going to need notifications on mount attribute
changes?

We may later also implement a mark on a mount namespace
to get events on all mount namespace changes.

If you have any feedback about this rough plan, or more items
to the wish list, please feel free to share them.

Thanks,
Amir.

2023-11-06 23:54:51

[permalink] [raw]

Subject: Re: [PATCH v4 0/6] querying mount attributes

On 6/11/23 20:10, Karel Zak wrote:
> On Wed, Nov 01, 2023 at 07:52:45PM +0800, Ian Kent wrote:
>> On 25/10/23 22:01, Miklos Szeredi wrote:
>> Looks ok to me,covers the primary cases I needed when I worked
>> on using fsinfo() in systemd.
> Our work on systemd was about two areas: get mount info (stat/listmount()
> now) from the kernel, and get the mount ID from notification.
>
> There was watch_queue.h with WATCH_TYPE_MOUNT_NOTIFY and struct
> mount_notification->auxiliary_mount (aka mount ID) and event subtype
> to get the change status (new mount, umount, etc.)
>
> For example David's:
> https://patchwork.kernel.org/project/linux-security-module/patch/155991711016.15579.4449417925184028666.stgit@warthog.procyon.org.uk/
>
> Do we have any replacement for this?

Not yet.

I tried to mention it early on but I don't think my description

conveyed what's actually needed.

>
>> Karel, is there anything missing you would need for adding
>> libmount support?
> Miklos's statmount() and listmount() API is excellent from my point of
> view. It looks pretty straightforward to use, and with the unique
> mount ID, it's safe too. It will be ideal for things like umount(8)
> (and recursive umount, etc.).

Thanks Karel, that's what I was hoping.

>
> For complex scenarios (systemd), we need to get from the kernel the
> unique ID's after any change in the mount table to save resources and
> call statmount() only for the affected mount node. Parse mountinfo
> sucks, call for(listmount(-1)) { statmount() } sucks too :-)

I have been looking at the notifications side of things.

I too need that functionality for the systemd work I was doing on

this. There was a need for event rate management too to get the

most out of the mount query improvements which I really only

realized about the time the work stopped. So for me there's

some new work needed as well.

I'm not sure yet which way to go as the watch queue implementation

that was merged is just the framework and is a bit different from

what we were using so I'm not sure if I can port specific extensions

of David's notifications work to it. I'm only just now getting to a

point where I can spend enough time on it to work this out.

Ian

2023-11-07 00:48:03

[permalink] [raw]

Subject: Re: [PATCH v4 0/6] querying mount attributes

On 6/11/23 21:33, Amir Goldstein wrote:
> On Mon, Nov 6, 2023 at 2:11 PM Karel Zak <[email protected]> wrote:
>> On Wed, Nov 01, 2023 at 07:52:45PM +0800, Ian Kent wrote:
>>> On 25/10/23 22:01, Miklos Szeredi wrote:
>>> Looks ok to me,covers the primary cases I needed when I worked
>>> on using fsinfo() in systemd.
>> Our work on systemd was about two areas: get mount info (stat/listmount()
>> now) from the kernel, and get the mount ID from notification.
>>
>> There was watch_queue.h with WATCH_TYPE_MOUNT_NOTIFY and struct
>> mount_notification->auxiliary_mount (aka mount ID) and event subtype
>> to get the change status (new mount, umount, etc.)
>>
>> For example David's:
>> https://patchwork.kernel.org/project/linux-security-module/patch/155991711016.15579.4449417925184028666.stgit@warthog.procyon.org.uk/
>>
>> Do we have any replacement for this?
>>
> The plan is to extend fanotify for mount namespace change notifications.
>
> Here is a simple POC for FAN_UNMOUNT notification:
>
> https://lore.kernel.org/linux-fsdevel/[email protected]/
>
> I was waiting for Miklos' patches to land, so that we can report
> mnt_id_unique (of mount and its parent mount) in the events.
>
> The plan is to start with setting a mark on a vfsmount to get
> FAN_MOUNT/FAN_UNMOUNT notifications for changes to direct
> children of that mount.

I'll have a look at what I needed when I was working to implement

this in systemd. Without looking at the code I can say I was

handling mount, umount and I think remount events so that's probably

a minimum.

As I mentioned earlier I found I also need event rate management

which was a new requirement at the time.

>
> This part, I was planning to do myself. I cannot say for sure when
> I will be able to get to it, but it should be a rather simple patch.
>
> If anybody else would like to volunteer for the task, I will be
> happy to assist.

I would like to help with this but I'm not familiar with fanotify

so I'll need to spend a bit of time on that. I am just about in

a position to do that now.

I'll also be looking at the watch queue framework that did get merged

back then, I'm not sure how that will turn out.

>
> Not sure if we are going to need special notifications for mount
> move and mount beneath?

Yes that will be an interesting question, I have noticed Christians'

work on mount beneath.

We need to provide the ability to monitor mount tables as is done by

using the proc mount lists to start with and I'm pretty sure that

includes at least mount, umount and moves perhaps more but I'll check

what I was using.

>
> Not sure if we are going to need notifications on mount attribute
> changes?

Also an interesting question, we will see in time I guess.

You would think that the mount/umount/move events would get what's

needed because (assuming mount move maps to remount) mount, umount

and remount should cover cases were mounted mount attributes change.

>
> We may later also implement a mark on a mount namespace
> to get events on all mount namespace changes.

Monitoring the proc mount tables essentially provides lists of mounts

that are present in a mount namespace (as seen by the given process)

so this is going to be needed sooner rather than later if we hope to

realize improvements from our new system calls.

Ian

2024-01-10 22:23:46

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

Hi,

On Wed, Oct 25, 2023 at 04:02:03PM +0200, Miklos Szeredi wrote:
> Add way to query the children of a particular mount. This is a more
> flexible way to iterate the mount tree than having to parse the complete
> /proc/self/mountinfo.
>
> Allow listing either
>
> - immediate child mounts only, or
>
> - recursively all descendant mounts (depth first).
>
> Lookup the mount by the new 64bit mount ID. If a mount needs to be queried
> based on path, then statx(2) can be used to first query the mount ID
> belonging to the path.
>
> Return an array of new (64bit) mount ID's. Without privileges only mounts
> are listed which are reachable from the task's root.
>
> Signed-off-by: Miklos Szeredi <[email protected]>

with this patch in the tree, all sh4 builds fail with ICE.

during RTL pass: final
In file included from fs/namespace.c:11:
fs/namespace.c: In function '__se_sys_listmount':
include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275

I tested with gcc 8.2, 11.3, 11.4, and 12.3. The compiler version
does not make a difference. Has anyone else seen the same problem ?
If so, any idea what to do about it ?

Thanks,
Guenter

2024-01-11 00:33:32

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

On Wed, 10 Jan 2024 at 14:23, Guenter Roeck <[email protected]> wrote:
>
> with this patch in the tree, all sh4 builds fail with ICE.
>
> during RTL pass: final
> In file included from fs/namespace.c:11:
> fs/namespace.c: In function '__se_sys_listmount':
> include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275

We do have those very ugly SYSCALL_DEFINEx() macros, but I'm not
seeing _anything_ that would be odd about the listmount case.

And the "__se_sys" thing in particular is just a fairly trivial wrapper.

It does use that asmlinkage_protect() thing, and it is unquestionably
horrendously ugly (staring too long at <linux/syscalls.h> has been
known to cause madness and despair), but we do that for *every* single
system call and I don't see why the new listmount entry would be
different.

Linus

2024-01-11 05:12:32

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

2024-01-11 18:57:55

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

On 1/10/24 16:32, Linus Torvalds wrote:
> On Wed, 10 Jan 2024 at 14:23, Guenter Roeck <[email protected]> wrote:
>>
>> with this patch in the tree, all sh4 builds fail with ICE.
>>
>> during RTL pass: final
>> In file included from fs/namespace.c:11:
>> fs/namespace.c: In function '__se_sys_listmount':
>> include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275
>
> We do have those very ugly SYSCALL_DEFINEx() macros, but I'm not
> seeing _anything_ that would be odd about the listmount case.
>
> And the "__se_sys" thing in particular is just a fairly trivial wrapper.
>
> It does use that asmlinkage_protect() thing, and it is unquestionably
> horrendously ugly (staring too long at <linux/syscalls.h> has been
> known to cause madness and despair), but we do that for *every* single
> system call and I don't see why the new listmount entry would be
> different.
>

It isn't the syscall. The following hack avoids the problem.

diff --git a/fs/namespace.c b/fs/namespace.c
index ef1fd6829814..28fe2a55bd94 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5070,8 +5070,10 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
ctr = array_index_nospec(ctr, bufsize);
if (put_user(r->mnt_id_unique, buf + ctr))
return -EFAULT;
+#if 0
if (check_add_overflow(ctr, 1, &ctr))
return -ERANGE;
+#endif

But it isn't check_add_overflow() either. This "helps" as well.

diff --git a/fs/namespace.c b/fs/namespace.c
index ef1fd6829814..b53cb2f13530 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5068,8 +5068,10 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
if (!is_path_reachable(r, r->mnt.mnt_root, orig))
continue;
ctr = array_index_nospec(ctr, bufsize);
+#if 0
if (put_user(r->mnt_id_unique, buf + ctr))
return -EFAULT;
+#endif
if (check_add_overflow(ctr, 1, &ctr))
return -ERANGE;

Any variance of put_user() with &buf[ctr] or buf + ctr fails
if ctr is a variable and permitted to be != 0. For example,
commenting out the call to check_add_overflow() and starting
the loop with ctr = 1 also triggers the problem, as does replacing
the call to check_add_overflow() with ctr++;. Using a temporary
variable such as in
u64 __user *pbuf;
...
pbuf = buf + ctr;
if (put_user(r->mnt_id_unique, pbuf))
return -EFAULT;

doesn't help either. But this does:

- if (put_user(r->mnt_id_unique, buf + ctr))
+ if (put_user(r->mnt_id_unique, (u32 *)(buf + ctr)))

and "buf + 17" as well as "&buf[17]" work as well. Essentially,
every combination of "buf + ctr" or "&buf[ctr]" fails if buf
is u64* and ctr is a variable.

The following works. Would this be acceptable ?

diff --git a/fs/namespace.c b/fs/namespace.c
index ef1fd6829814..dc0f844205d9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5068,10 +5068,11 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
if (!is_path_reachable(r, r->mnt.mnt_root, orig))
continue;
ctr = array_index_nospec(ctr, bufsize);
- if (put_user(r->mnt_id_unique, buf + ctr))
+ if (put_user(r->mnt_id_unique, buf))
return -EFAULT;
if (check_add_overflow(ctr, 1, &ctr))
return -ERANGE;
+ buf++;
}
return ctr;
}

Guenter

2024-01-11 20:23:45

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

On Thu, 11 Jan 2024 at 10:57, Guenter Roeck <[email protected]> wrote:
>
> Any variance of put_user() with &buf[ctr] or buf + ctr fails
> if ctr is a variable and permitted to be != 0.

Crazy. But the 64-bit put_user() is a bit special and tends to require
more registers (the 64-bit value is passed in two registers), so that
probably then results in the ICE.

Side note: looking at the SH version of __put_user_u64(), I think it's
buggy and is missing the exception handler for the second 32-bit move.
I dunno, I don't read sh asm, but it looks suspicious.

> The following works. Would this be acceptable ?

It might be very easy to trigger this once again if somebody goes "that's silly"

That said, I also absolutely detest the "error handling" in that
function. It's horrible.

Noticing the user access error in the middle is just sad, and if that
was just handled better and at least the range was checked first, the
overflow error couldn't happen and checking for it is thus pointless.

And looking at it all, it really looks like the whole interface is
broken. The "bufsize" argument isn't the size of the buffer at all.
It's the number of entries.

Extra confusingly, in the *other* system call, bufsize is in fact the
size of the buffer.

And the 'ctr' overflow checking is doubly garbage, because the only
reason *that* can happen is that we didn't check the incoming
arguments properly.

Same goes for the whole array_index_nospec() - it's pointless, because
the user controls what that code checks against anyway, so there's no
point to trying to manage some range checking.

The only range checking there that matters would be the one that
put_user() has to do against the address space size, but that's done
by put_user().

End result: that thing needs a rewrite.

The SH put_user64() needs to be looked at too, but in the meantime,
maybe something like this fixes the problems with listmount?

NOTE! ENTIRELY untested, but that naming and lack of argument sanity
checking really is horrendous. We should have caught this earlier.

Linus

Attachments:

patch.diff (2.21 kB)

2024-01-11 23:58:28

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

On 1/11/24 12:14, Linus Torvalds wrote:
> On Thu, 11 Jan 2024 at 10:57, Guenter Roeck <[email protected]> wrote:
>>
>> Any variance of put_user() with &buf[ctr] or buf + ctr fails
>> if ctr is a variable and permitted to be != 0.
>
> Crazy. But the 64-bit put_user() is a bit special and tends to require
> more registers (the 64-bit value is passed in two registers), so that
> probably then results in the ICE.
>
> Side note: looking at the SH version of __put_user_u64(), I think it's
> buggy and is missing the exception handler for the second 32-bit move.
> I dunno, I don't read sh asm, but it looks suspicious.
>

I wonder if something may be wrong with the definition and use of __m
for u64 accesses. The code below also fixes the build problem.

But then I really don't know what

struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))

is supposed to be doing in the first place, and I still don't understand
why the problem only shows up with CONFIG_MMU=n.

Guenter

---
diff --git a/arch/sh/include/asm/uaccess_32.h b/arch/sh/include/asm/uaccess_32.h
index 5d7ddc092afd..f0451a37b6ff 100644
--- a/arch/sh/include/asm/uaccess_32.h
+++ b/arch/sh/include/asm/uaccess_32.h
@@ -196,7 +196,7 @@ __asm__ __volatile__( \
".long 1b, 3b\n\t" \
".previous" \
: "=r" (retval) \
- : "r" (val), "m" (__m(addr)), "i" (-EFAULT), "0" (retval) \
+ : "r" (val), "m" (*(u64 *)(addr)), "i" (-EFAULT), "0" (retval) \
: "memory"); })
#else
#define __put_user_u64(val,addr,retval) \
@@ -218,7 +218,7 @@ __asm__ __volatile__( \
".long 1b, 3b\n\t" \
".previous" \
: "=r" (retval) \
- : "r" (val), "m" (__m(addr)), "i" (-EFAULT), "0" (retval) \
+ : "r" (val), "m" (*(u64 *)(addr)), "i" (-EFAULT), "0" (retval) \
: "memory"); })
#endif

2024-01-12 03:41:18

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

On Thu, 11 Jan 2024 at 15:57, Guenter Roeck <[email protected]> wrote:
>
> I wonder if something may be wrong with the definition and use of __m
> for u64 accesses. The code below also fixes the build problem.

Ok, that looks like the right workaround.

> But then I really don't know what
>
> struct __large_struct { unsigned long buf[100]; };
> #define __m(x) (*(struct __large_struct __user *)(x))

This is a historical pattern we've used because the gcc docs weren't
100% clear on what "m" does, and whether it might for example end up
loading the value from memory into a register, spilling it to the
stack, and then using the stack address...

Using the whole "tell the compiler it accesses a big structure" turns
the memory access into "BLKmode" (in gcc terms) and makes sure that
never happens.

NOTE! I'm not sure it ever did happen with gcc, but we have literally
seen clang do that "load from memory, spill to stack, and then use the
stack address for the asm". Crazy, I know. See

https://lore.kernel.org/all/CAHk-=wgobnShg4c2yyMbk2p=U-wmnOmX_0=b3ZY_479Jjey2xw@mail.gmail.com/

where I point to clang doing basically exactly that with the "rm"
constraint for another case entirely. I consider it a clang bug, but
happily I've never seen the "load only to spill" in a case where the
"stupid code generation" turned into "actively buggy code generation".

If it ever does, we may need to turn the "m" into a "p" and a memory
clobber, which will generate horrendous code. Or we may just need to
tell clang developers that enough is enough, and that they actually
need to take the asm constraints more seriously.

Linus

2024-01-12 05:24:42

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

On Thu, Jan 11, 2024 at 07:40:32PM -0800, Linus Torvalds wrote:
> On Thu, 11 Jan 2024 at 15:57, Guenter Roeck <[email protected]> wrote:
> >
> > I wonder if something may be wrong with the definition and use of __m
> > for u64 accesses. The code below also fixes the build problem.
>
> Ok, that looks like the right workaround.
>

I think I'll go with the code below. It doesn't touch the MMU code
(which for some reason doesn't result in a build failure), and
it generates

! 5071 "fs/namespace.c" 1
mov.l r6,@r1
mov.l r7,@(4,r1)

which seems to be correct. It also matches the pattern used
for __put_user_asm(). Does that make sense ?

Thanks,
Guenter

---
diff --git a/arch/sh/include/asm/uaccess_32.h b/arch/sh/include/asm/uaccess_32.h
index 5d7ddc092afd..5ce46b2a95b6 100644
--- a/arch/sh/include/asm/uaccess_32.h
+++ b/arch/sh/include/asm/uaccess_32.h
@@ -176,6 +176,7 @@ do { \
} while (0)
#endif /* CONFIG_MMU */

+#ifdef CONFIG_MMU
#if defined(CONFIG_CPU_LITTLE_ENDIAN)
#define __put_user_u64(val,addr,retval) \
({ \
@@ -221,6 +222,26 @@ __asm__ __volatile__( \
: "r" (val), "m" (__m(addr)), "i" (-EFAULT), "0" (retval) \
: "memory"); })
#endif
+#else
+#if defined(CONFIG_CPU_LITTLE_ENDIAN)
+#define __put_user_u64(val,addr,retval) \
+({ \
+__asm__ __volatile__( \
+ "mov.l %R0,%1\n\t" \
+ "mov.l %S0,%T1\n\t" \
+ : /* no outputs */ \
+ : "r" (val), "m" (*(u64 *)(addr)) \
+ : "memory"); })
+#else
+({ \
+__asm__ __volatile__( \
+ "mov.l %S0,%1\n\t" \
+ "mov.l %R0,%T1\n\t" \
+ : /* no outputs */ \
+ : "r" (val), "m" (*(u64 *)(addr)) \
+ : "memory"); })
+#endif
+#endif /* CONFIG_MMU */

extern void __put_user_unknown(void);

2024-01-12 09:00:47

by Christian Brauner

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

> NOTE! ENTIRELY untested, but that naming and lack of argument sanity

Thanks for catching that. I've slightly tweaked the attached patch and
put it into vfs.fixes at [1]. There's a fsnotify performance regression
fix for io_uring in there as well. I plan to get both to you Saturday
CET. Let us know if there's any additional issues.

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs.fixes

2024-01-12 10:50:00

by Arnd Bergmann

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

On Thu, Jan 11, 2024, at 21:14, Linus Torvalds wrote:

> The SH put_user64() needs to be looked at too, but in the meantime,
> maybe something like this fixes the problems with listmount?

I tried changing it to use the generic memcpy() based uaccess
that m68k-nommu and riscv-nommu use, which also avoids the
build failure. I still run into other unrelated build issues
on arch/sh, so I'm not sure if this is a sufficient fix.

Arnd

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7500521b2b98..2cc3a541e231 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -73,6 +73,7 @@ config SUPERH
select PERF_USE_VMALLOC
select RTC_LIB
select SPARSE_IRQ
+ select UACCESS_MEMCPY if !MMU
select TRACE_IRQFLAGS_SUPPORT
help
The SuperH is a RISC processor targeted for use in embedded systems
diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h
index a79609eb14be..b42764d55901 100644
--- a/arch/sh/include/asm/uaccess.h
+++ b/arch/sh/include/asm/uaccess.h
@@ -2,6 +2,7 @@
#ifndef __ASM_SH_UACCESS_H
#define __ASM_SH_UACCESS_H

+#ifdef CONFIG_MMU
#include <asm/extable.h>
#include <asm-generic/access_ok.h>

@@ -130,4 +131,8 @@ struct mem_access {
int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs,
struct mem_access *ma, int, unsigned long address);

+#else
+#include <asm-generic/uaccess.h>
+#endif
+
#endif /* __ASM_SH_UACCESS_H */
diff --git a/arch/sh/include/asm/uaccess_32.h b/arch/sh/include/asm/uaccess_32.h
index 5d7ddc092afd..e053f2fd245c 100644
--- a/arch/sh/include/asm/uaccess_32.h
+++ b/arch/sh/include/asm/uaccess_32.h
@@ -35,7 +35,6 @@ do { \
} \
} while (0)

-#ifdef CONFIG_MMU
#define __get_user_asm(x, addr, err, insn) \
({ \
__asm__ __volatile__( \
@@ -56,16 +55,6 @@ __asm__ __volatile__( \
".previous" \
:"=&r" (err), "=&r" (x) \
:"m" (__m(addr)), "i" (-EFAULT), "0" (err)); })
-#else
-#define __get_user_asm(x, addr, err, insn) \
-do { \
- __asm__ __volatile__ ( \
- "mov." insn " %1, %0\n\t" \
- : "=&r" (x) \
- : "m" (__m(addr)) \
- ); \
-} while (0)
-#endif /* CONFIG_MMU */

extern void __get_user_unknown(void);

@@ -140,7 +129,6 @@ do { \
} \
} while (0)

-#ifdef CONFIG_MMU
#define __put_user_asm(x, addr, err, insn) \
do { \
__asm__ __volatile__ ( \
@@ -164,17 +152,6 @@ do { \
: "memory" \
); \
} while (0)
-#else
-#define __put_user_asm(x, addr, err, insn) \
-do { \
- __asm__ __volatile__ ( \
- "mov." insn " %0, %1\n\t" \
- : /* no outputs */ \
- : "r" (x), "m" (__m(addr)) \
- : "memory" \
- ); \
-} while (0)
-#endif /* CONFIG_MMU */

#if defined(CONFIG_CPU_LITTLE_ENDIAN)
#define __put_user_u64(val,addr,retval) \

2024-01-23 14:17:42

by John Paul Adrian Glaubitz

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

Hi Guenter,

> with this patch in the tree, all sh4 builds fail with ICE.
>
> during RTL pass: final
> In file included from fs/namespace.c:11:
> fs/namespace.c: In function '__se_sys_listmount':
> include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275
>
> I tested with gcc 8.2, 11.3, 11.4, and 12.3. The compiler version
> does not make a difference. Has anyone else seen the same problem ?
> If so, any idea what to do about it ?

I'm not seeing any problems building the SH kernel except some -Werror=missing-prototypes warnings.

I'm using gcc 11.1 from here [1].

Adrian

PS: Please always CC linux-sh and the SH maintainers when reporting issues.

> [1] https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/x86_64/11.1.0/

--
.''`. John Paul Adrian Glaubitz
: :' : Debian Developer
`. `' Physicist
`- GPG: 62FF 8A75 84E0 2956 9546 0006 7426 3B37 F5B5 F913

2024-01-23 14:17:43

by John Paul Adrian Glaubitz

[permalink] [raw]

Subject: Re: [PATCH v4 5/6] add listmount(2) syscall

2024-01-23 15:32:00