2022-04-21 10:12:36

by Yang Xu (Fujitsu)

[permalink] [raw]
Subject: [PATCH v4 1/8] fs: move sgid strip operation from inode_init_owner into inode_sgid_strip

This has no functional change. Just create and export inode_sgid_strip api for
the subsequent patch. This function is used to strip S_ISGID mode when init
a new inode.

Acked-by: Christian Brauner (Microsoft) <[email protected]>
Signed-off-by: Yang Xu <[email protected]>
---
fs/inode.c | 22 ++++++++++++++++++----
include/linux/fs.h | 3 ++-
2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 9d9b422504d1..3215e61a0021 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2246,10 +2246,8 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
/* Directories are special, and always inherit S_ISGID */
if (S_ISDIR(mode))
mode |= S_ISGID;
- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
- !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
- !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
- mode &= ~S_ISGID;
+ else
+ inode_sgid_strip(mnt_userns, dir, &mode);
} else
inode_fsgid_set(inode, mnt_userns);
inode->i_mode = mode;
@@ -2405,3 +2403,19 @@ struct timespec64 current_time(struct inode *inode)
return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);
+
+void inode_sgid_strip(struct user_namespace *mnt_userns,
+ const struct inode *dir, umode_t *mode)
+{
+ if (S_ISDIR(*mode) || !dir || !(dir->i_mode & S_ISGID))
+ return;
+ if ((*mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
+ return;
+ if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
+ return;
+ if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+ return;
+
+ *mode &= ~S_ISGID;
+}
+EXPORT_SYMBOL(inode_sgid_strip);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbde95387a23..4a617aaab6f6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1897,7 +1897,8 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
const struct inode *dir, umode_t mode);
extern bool may_open_dev(const struct path *path);
-
+void inode_sgid_strip(struct user_namespace *mnt_userns,
+ const struct inode *dir, umode_t *mode);
/*
* This is the "filldir" function type, used by readdir() to let
* the kernel specify what kind of dirent layout it wants to have.
--
2.27.0


2022-04-22 02:56:13

by Yang Xu (Fujitsu)

[permalink] [raw]
Subject: [PATCH v4 3/8] xfs: only call posix_acl_create under CONFIG_XFS_POSIX_ACL

Since xfs_generic_create only calls xfs_set_acl when enable this kconfig, we
don't need to call posix_acl_create for the !CONFIG_XFS_POSIX_ACL case.

The previous patch has added missing umask strip for tmpfile, so all creation
paths handle umask in the vfs directly if the filesystem doesn't support or
enable POSIX ACLs.

So just put this function under CONFIG_XFS_POSIX_ACL and umask strip still works
well.

Also use unified rule for CONFIG_XFS_POSIX_ACL in this file, so use IS_ENABLED in
xfs_generic_create.

Signed-off-by: Yang Xu <[email protected]>
---
fs/xfs/xfs_iops.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b34e8e4344a8..6b8df9ab215a 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -150,6 +150,7 @@ xfs_create_need_xattr(
return true;
if (default_acl)
return true;
+
#if IS_ENABLED(CONFIG_SECURITY)
if (dir->i_sb->s_security)
return true;
@@ -169,7 +170,7 @@ xfs_generic_create(
{
struct inode *inode;
struct xfs_inode *ip = NULL;
- struct posix_acl *default_acl, *acl;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
struct xfs_name name;
int error;

@@ -184,9 +185,11 @@ xfs_generic_create(
rdev = 0;
}

+#if IS_ENABLED(CONFIG_XFS_POSIX_ACL)
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error)
return error;
+#endif

/* Verify mode is valid also for tmpfile case */
error = xfs_dentry_mode_to_name(&name, dentry, mode);
@@ -209,7 +212,7 @@ xfs_generic_create(
if (unlikely(error))
goto out_cleanup_inode;

-#ifdef CONFIG_XFS_POSIX_ACL
+#if IS_ENABLED(CONFIG_XFS_POSIX_ACL)
if (default_acl) {
error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
--
2.27.0

2022-04-22 19:49:38

by Yang Xu (Fujitsu)

[permalink] [raw]
Subject: [PATCH v4 7/8] fs: strip file's S_ISGID mode on vfs instead of on underlying filesystem

Currently, vfs only passes mode argument to filesystem, then use inode_init_owner()
to strip S_ISGID. Some filesystem(ie ext4/btrfs) will call inode_init_owner
firstly, then posxi acl setup, but xfs uses the contrary order. It will affect
S_ISGID clear especially we filter S_IXGRP by umask or acl.

Regardless of which filesystem is in use, failure to strip the SGID correctly is
considered a security failure that needs to be fixed. The current VFS infrastructure
requires the filesystem to do everything right and not step on any landmines to
strip the SGID bit, when in fact it can easily be done at the VFS and the filesystems
then don't even need to be aware that the SGID needs to be (or has been stripped) by
the operation the user asked to be done.

Vfs has all the info it needs - it doesn't need the filesystems to do everything
correctly with the mode and ensuring that they order things like posix acl setup
functions correctly with inode_init_owner() to strip the SGID bit.

Just strip the SGID bit at the VFS, and then the filesystems can't get it wrong.

Also, the inode_sgid_strip() api should be used before IS_POSIXACL() because
this api may change mode.

Only the following places use inode_init_owner
"
arch/powerpc/platforms/cell/spufs/inode.c: inode_init_owner(&init_user_ns, inode, dir, mode | S_IFDIR);
arch/powerpc/platforms/cell/spufs/inode.c: inode_init_owner(&init_user_ns, inode, dir, mode | S_IFDIR);
fs/9p/vfs_inode.c: inode_init_owner(&init_user_ns, inode, NULL, mode);
fs/bfs/dir.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/btrfs/inode.c: inode_init_owner(mnt_userns, inode, dir, mode);
fs/btrfs/tests/btrfs-tests.c: inode_init_owner(&init_user_ns, inode, NULL, S_IFREG);
fs/ext2/ialloc.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/ext4/ialloc.c: inode_init_owner(mnt_userns, inode, dir, mode);
fs/f2fs/namei.c: inode_init_owner(mnt_userns, inode, dir, mode);
fs/hfsplus/inode.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/hugetlbfs/inode.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/jfs/jfs_inode.c: inode_init_owner(&init_user_ns, inode, parent, mode);
fs/minix/bitmap.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/nilfs2/inode.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/ntfs3/inode.c: inode_init_owner(mnt_userns, inode, dir, mode);
fs/ocfs2/dlmfs/dlmfs.c: inode_init_owner(&init_user_ns, inode, NULL, mode);
fs/ocfs2/dlmfs/dlmfs.c: inode_init_owner(&init_user_ns, inode, parent, mode);
fs/ocfs2/namei.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/omfs/inode.c: inode_init_owner(&init_user_ns, inode, NULL, mode);
fs/overlayfs/dir.c: inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode);
fs/ramfs/inode.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/reiserfs/namei.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/sysv/ialloc.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/ubifs/dir.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/udf/ialloc.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/ufs/ialloc.c: inode_init_owner(&init_user_ns, inode, dir, mode);
fs/xfs/xfs_inode.c: inode_init_owner(mnt_userns, inode, dir, mode);
fs/zonefs/super.c: inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
kernel/bpf/inode.c: inode_init_owner(&init_user_ns, inode, dir, mode);
mm/shmem.c: inode_init_owner(&init_user_ns, inode, dir, mode);
"

They are used in filesystem init new inode function and these init inode functions are used
by following operations:
mkdir
symlink
mknod
create
tmpfile
rename

We don't care about mkdir because we don't strip SGID bit for directory except fs.xfs.irix_sgid_inherit.
But we even call it in do_mkdirat() since inode_sgid_strip() will skip directories anyway. This will
enforce the same ordering for all relevant operations and it will make the code more uniform and
easier to understand by using new helper prepare_mode().

symlink and rename only use valid mode that doesn't have SGID bit.

We have added inode_sgid_strip api for the remaining operations.

In addition to the above six operations, four filesystems has a little difference
1) btrfs has btrfs_create_subvol_root to create new inode but used non SGID bit mode and can ignore
2) ocfs2 reflink function should add inode_sgid_strip api manually because we don't add it in vfs
3) spufs which doesn't really go hrough the regular VFS callpath because it has separate system call
spu_create, but it t only allows the creation of directories and only allows bits in 0777 and can ignore
4)bpf use vfs_mkobj in bpf_obj_do_pin with "S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()) mode and
use bpf_mkobj_ops in bpf_iter_link_pin_kernel with S_IFREG | S_IRUSR; , so bpf is also not affected

This patch also changed grpid behaviour for ext4/xfs because the mode passed to them may been
changed by inode_sgid_strip.

Also as Christian Brauner said"
The patch itself is useful as it would move a security sensitive operation that is currently burried in
individual filesystems into the vfs layer. But it has a decent regression potential since it might strip
filesystems that have so far relied on getting the S_ISGID bit with a mode argument. So this needs a lot
of testing and long exposure in -next for at least one full kernel cycle."

Suggested-by: Dave Chinner <[email protected]>
Signed-off-by: Yang Xu <[email protected]>
---
fs/inode.c | 2 --
fs/namei.c | 22 +++++++++-------------
fs/ocfs2/namei.c | 1 +
include/linux/fs.h | 9 +++++++++
4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 3215e61a0021..0eb1dab99893 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2246,8 +2246,6 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
/* Directories are special, and always inherit S_ISGID */
if (S_ISDIR(mode))
mode |= S_ISGID;
- else
- inode_sgid_strip(mnt_userns, dir, &mode);
} else
inode_fsgid_set(inode, mnt_userns);
inode->i_mode = mode;
diff --git a/fs/namei.c b/fs/namei.c
index 73646e28fae0..f86614ab841f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3287,8 +3287,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
if (open_flag & O_CREAT) {
if (open_flag & O_EXCL)
open_flag &= ~O_TRUNC;
- if (!IS_POSIXACL(dir->d_inode))
- mode &= ~current_umask();
+ prepare_mode(mnt_userns, dir->d_inode, &mode);
if (likely(got_write))
create_error = may_o_create(mnt_userns, &nd->path,
dentry, mode);
@@ -3521,8 +3520,7 @@ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
child = d_alloc(dentry, &slash_name);
if (unlikely(!child))
goto out_err;
- if (!IS_POSIXACL(dir))
- mode &= ~current_umask();
+ prepare_mode(mnt_userns, dir, &mode);
error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
if (error)
goto out_err;
@@ -3850,13 +3848,12 @@ static int do_mknodat(int dfd, struct filename *name, umode_t mode,
if (IS_ERR(dentry))
goto out1;

- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mnt_userns = mnt_user_ns(path.mnt);
+ prepare_mode(mnt_userns, path.dentry->d_inode, &mode);
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
goto out2;

- mnt_userns = mnt_user_ns(path.mnt);
switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(mnt_userns, path.dentry->d_inode,
@@ -3943,6 +3940,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode)
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_DIRECTORY;
+ struct user_namespace *mnt_userns;

retry:
dentry = filename_create(dfd, name, &path, lookup_flags);
@@ -3950,15 +3948,13 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode)
if (IS_ERR(dentry))
goto out_putname;

- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mnt_userns = mnt_user_ns(path.mnt);
+ prepare_mode(mnt_userns, path.dentry->d_inode, &mode);
error = security_path_mkdir(&path, dentry, mode);
- if (!error) {
- struct user_namespace *mnt_userns;
- mnt_userns = mnt_user_ns(path.mnt);
+ if (!error)
error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
mode);
- }
+
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c75fd54b9185..c81b8e0847aa 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -198,6 +198,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
if (S_ISDIR(mode))
set_nlink(inode, 2);
inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_sgid_strip(&init_user_ns, dir, &mode);
status = dquot_initialize(inode);
if (status)
return ERR_PTR(status);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a617aaab6f6..8c2f4cde974b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3458,6 +3458,15 @@ static inline bool dir_relax_shared(struct inode *inode)
return !IS_DEADDIR(inode);
}

+static inline void prepare_mode(struct user_namespace *mnt_userns,
+ const struct inode *dir, umode_t *mode)
+{
+ inode_sgid_strip(mnt_userns, dir, mode);
+
+ if (!IS_POSIXACL(dir))
+ *mode &= ~current_umask();
+}
+
extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode);

--
2.27.0