LinuxLists.cc - [PATCH v8 01/19] securityfs: Extend securityfs with namespacing support

2022-01-04 17:05:12

Subject: [PATCH v8 01/19] securityfs: Extend securityfs with namespacing support

From: Stefan Berger <[email protected]>

To prepare for virtualization of SecurityFS, use simple_pin_fs and
simpe_release_fs only when init_user_ns is active.

Extend 'securityfs' for support of IMA namespacing so that each
IMA (user) namespace can have its own front-end for showing the currently
active policy, the measurement list, number of violations and so on.

Enable multiple instances of securityfs by keying each instance with a
pointer to the user namespace it belongs to.

Drop the additional dentry reference to enable simple cleanup of dentries
upon umount. Now the dentries do not need to be explicitly freed anymore
but we can just rely on d_genocide() and the dcache shrinker to do all
the required work.

Prevent mounting of an instance of securityfs in another user namespace
than it belongs to. Also, prevent accesses to files and directories by
a user namespace that it not either the user namespace it belongs to
or an ancestor of the user namespace that the instance of securityfs
belongs to. Do not prevent access if securityfs was bind-mounted and the
init_user_ns is the owning user namespace.

Signed-off-by: Stefan Berger <[email protected]>
Signed-off-by: James Bottomley <[email protected]>
---
security/inode.c | 75 ++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 63 insertions(+), 12 deletions(-)

diff --git a/security/inode.c b/security/inode.c
index 6c326939750d..e525ba960063 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -21,9 +21,37 @@
#include <linux/security.h>
#include <linux/lsm_hooks.h>
#include <linux/magic.h>
+#include <linux/user_namespace.h>

-static struct vfsmount *mount;
-static int mount_count;
+static struct vfsmount *init_securityfs_mount;
+static int init_securityfs_mount_count;
+
+static int securityfs_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ int err;
+
+ err = generic_permission(&init_user_ns, inode, mask);
+ if (!err) {
+ /* Unless bind-mounted, deny access if current_user_ns() is not
+ * ancestor.
+ */
+ if (inode->i_sb->s_user_ns != &init_user_ns &&
+ !in_userns(current_user_ns(), inode->i_sb->s_user_ns))
+ err = -EACCES;
+ }
+
+ return err;
+}
+
+static const struct inode_operations securityfs_dir_inode_operations = {
+ .permission = securityfs_permission,
+ .lookup = simple_lookup,
+};
+
+static const struct inode_operations securityfs_file_inode_operations = {
+ .permission = securityfs_permission,
+};

static void securityfs_free_inode(struct inode *inode)
{
@@ -40,20 +68,25 @@ static const struct super_operations securityfs_super_operations = {
static int securityfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr files[] = {{""}};
+ struct user_namespace *ns = fc->user_ns;
int error;

+ if (WARN_ON(ns != current_user_ns()))
+ return -EINVAL;
+
error = simple_fill_super(sb, SECURITYFS_MAGIC, files);
if (error)
return error;

sb->s_op = &securityfs_super_operations;
+ sb->s_root->d_inode->i_op = &securityfs_dir_inode_operations;

return 0;
}

static int securityfs_get_tree(struct fs_context *fc)
{
- return get_tree_single(fc, securityfs_fill_super);
+ return get_tree_keyed(fc, securityfs_fill_super, fc->user_ns);
}

static const struct fs_context_operations securityfs_context_ops = {
@@ -71,6 +104,7 @@ static struct file_system_type fs_type = {
.name = "securityfs",
.init_fs_context = securityfs_init_fs_context,
.kill_sb = kill_litter_super,
+ .fs_flags = FS_USERNS_MOUNT,
};

/**
@@ -109,6 +143,7 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
const struct file_operations *fops,
const struct inode_operations *iops)
{
+ struct user_namespace *ns = current_user_ns();
struct dentry *dentry;
struct inode *dir, *inode;
int error;
@@ -118,12 +153,19 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,

pr_debug("securityfs: creating file '%s'\n",name);

- error = simple_pin_fs(&fs_type, &mount, &mount_count);
- if (error)
- return ERR_PTR(error);
+ if (ns == &init_user_ns) {
+ error = simple_pin_fs(&fs_type, &init_securityfs_mount,
+ &init_securityfs_mount_count);
+ if (error)
+ return ERR_PTR(error);
+ }

- if (!parent)
- parent = mount->mnt_root;
+ if (!parent) {
+ if (ns == &init_user_ns)
+ parent = init_securityfs_mount->mnt_root;
+ else
+ return ERR_PTR(-EINVAL);
+ }

dir = d_inode(parent);

@@ -148,7 +190,7 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_private = data;
if (S_ISDIR(mode)) {
- inode->i_op = &simple_dir_inode_operations;
+ inode->i_op = &securityfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
inc_nlink(inode);
inc_nlink(dir);
@@ -156,10 +198,10 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
inode->i_op = iops ? iops : &simple_symlink_inode_operations;
inode->i_link = data;
} else {
+ inode->i_op = &securityfs_file_inode_operations;
inode->i_fop = fops;
}
d_instantiate(dentry, inode);
- dget(dentry);
inode_unlock(dir);
return dentry;

@@ -168,7 +210,9 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
dentry = ERR_PTR(error);
out:
inode_unlock(dir);
- simple_release_fs(&mount, &mount_count);
+ if (ns == &init_user_ns)
+ simple_release_fs(&init_securityfs_mount,
+ &init_securityfs_mount_count);
return dentry;
}

@@ -294,22 +338,29 @@ EXPORT_SYMBOL_GPL(securityfs_create_symlink);
*/
void securityfs_remove(struct dentry *dentry)
{
+ struct user_namespace *ns;
struct inode *dir;

if (!dentry || IS_ERR(dentry))
return;

+ ns = dentry->d_sb->s_user_ns;
+
dir = d_inode(dentry->d_parent);
inode_lock(dir);
if (simple_positive(dentry)) {
+ dget(dentry);
if (d_is_dir(dentry))
simple_rmdir(dir, dentry);
else
simple_unlink(dir, dentry);
+ d_delete(dentry);
dput(dentry);
}
inode_unlock(dir);
- simple_release_fs(&mount, &mount_count);
+ if (ns == &init_user_ns)
+ simple_release_fs(&init_securityfs_mount,
+ &init_securityfs_mount_count);
}
EXPORT_SYMBOL_GPL(securityfs_remove);

--
2.31.1

2022-01-05 03:58:21

by Al Viro

[permalink] [raw]

Subject: Re: [PATCH v8 01/19] securityfs: Extend securityfs with namespacing support

On Tue, Jan 04, 2022 at 12:03:58PM -0500, Stefan Berger wrote:
> From: Stefan Berger <[email protected]>
>
> To prepare for virtualization of SecurityFS, use simple_pin_fs and
> simpe_release_fs only when init_user_ns is active.
>
> Extend 'securityfs' for support of IMA namespacing so that each
> IMA (user) namespace can have its own front-end for showing the currently
> active policy, the measurement list, number of violations and so on.
>
> Enable multiple instances of securityfs by keying each instance with a
> pointer to the user namespace it belongs to.
>
> Drop the additional dentry reference to enable simple cleanup of dentries
> upon umount. Now the dentries do not need to be explicitly freed anymore
> but we can just rely on d_genocide() and the dcache shrinker to do all
> the required work.

Looks brittle... What are the new rules for securityfs_remove()? Is it
still paired with securityfs_create_...()? When is removal done? On
securityfs instance shutdown? What about the underlying data structures, BTW?
When can they be freed?

That kind of commit message is asking for trouble down the road; please,
document the rules properly.

Incidentally, what happens if you open a file, pass it to somebody in a
different userns and try to shut the opener's userns down?

2022-01-05 10:18:34

by Christian Brauner

[permalink] [raw]

Subject: Re: [PATCH v8 01/19] securityfs: Extend securityfs with namespacing support

On Wed, Jan 05, 2022 at 03:58:11AM +0000, Al Viro wrote:
> On Tue, Jan 04, 2022 at 12:03:58PM -0500, Stefan Berger wrote:
> > From: Stefan Berger <[email protected]>
> >
> > To prepare for virtualization of SecurityFS, use simple_pin_fs and
> > simpe_release_fs only when init_user_ns is active.
> >
> > Extend 'securityfs' for support of IMA namespacing so that each
> > IMA (user) namespace can have its own front-end for showing the currently
> > active policy, the measurement list, number of violations and so on.
> >
> > Enable multiple instances of securityfs by keying each instance with a
> > pointer to the user namespace it belongs to.
> >
> > Drop the additional dentry reference to enable simple cleanup of dentries
> > upon umount. Now the dentries do not need to be explicitly freed anymore
> > but we can just rely on d_genocide() and the dcache shrinker to do all
> > the required work.
>
> Looks brittle... What are the new rules for securityfs_remove()? Is it
> still paired with securityfs_create_...()? When is removal done? On
> securityfs instance shutdown? What about the underlying data structures, BTW?
> When can they be freed?
>
> That kind of commit message is asking for trouble down the road; please,
> document the rules properly.

Yeah, it's not explaining it in detail. I've asked for that as well. My
explanations below are what I expressed it should look like in prior
reviews. I haven't reviewed this version yet so this as I would expect
it to go.

For the initial securityfs, i.e. the one mounted in the host userns
mount nothing changes.
The rules for securityfs_remove() are as before and it is still paired
with securityfs_create(). Specifically, a file created via
securityfs_create_dentry() in the initial securityfs mount still needs
to be removed by a call to securityfs_remove().
Creating a new dentry in the initial securityfs mount still pins the
filesytem like it always did. Consequently, the initial securityfs
mount is not destroyed on umount/shutdown as long as at least one user
of it still has dentries that it hasn't removed with a call to
securityfs_remove().

This specific part of the commit message you responded to is not
giving enough details, I think:

> > Drop the additional dentry reference to enable simple cleanup of dentries
> > upon umount. Now the dentries do not need to be explicitly freed anymore
> > but we can just rely on d_genocide() and the dcache shrinker to do all
> > the required work.

The "additional dentry reference" mentioned only relates to an afaict
unnecessary dget() in securityfs_create_dentry() which I pointed out
as part of earlier reviews. But the phrasing implies that there's a
behavioral change for the initial securityfs instance based on the
removal of this additional dget() when there really isn't.

After securityfs_create_dentry() has created a new dentry via
lookup_one_len() and eventually called d_instantiate() it currently
takes an additional reference on the newly created dentry via dget().
This additional reference is then paired with an additional dput() in
securityfs_remove(). I have not yet seen a reason why this is
necessary maybe you can help there.

For example, contrast this with debugfs which has the same underlying
logic as securityfs, i.e. any created dentry pins the whole filesystem
via simple_pin_fs() until the dentry is released and simple_unpin_fs()
is called. It uses a similar pairing as securityfs: where securityfs
has the securityfs_create_dentry() and securityfs_remove() pairing,
debugfs has the __debugfs_create_file() and debugfs_remove() pairing.
But debugfs doesn't take an additional reference on the just created
dentry in __debugfs_create_file() which would need to be put in
debugfs_remove().

So if we contrast the creation routines of securityfs and debugfs directly
condensed to just the dentry references:

securityfs | debugfs
---------------- | ------------------
|
lookup_one_len() | lookup_one_len()
d_instantiate() | d_instantiate()
dget() |

And I have not understood why securityfs would need that additional
dget(). Not just intrinsically but also when contrasted with debugfs. So
that additional dget() is removed as part of this patch.

But the explanation in the commit message isn't ideal as it implies
the removal of the additional dget() would have any impact on the
pinning logic for securityfs when it does not.

But the pinning logic doesn't make sense outside of the initial
namespace which can never go away and there are security modules that
have files or settings for the whole system that never go away and will
always keep the filesystem around.

But for unprivileged/userns containers that mount their own securityfs
instance we want the securityfs instance cleaned up when it is
unmounted. There is no need to duplicate the pinning logic or make the
global securityfs instance display different information based on the
userns. Both options would be really messy and hacky.

Instead we can simply give each userns it's own securityfs instance
similar to how each ipc ns has its own mqueue instance and all entries
in there are cleaned up on umount or when the whole container is
shutdown. After the container is shutdown all of the security module
settings for the container go away with it anyway. So for that we don't
want any filesystem pinning done in securityfs_create_dentry(). And we
also really don't want the additional dget() that is currently taken in
securityfs_create_dentry() as it would pointlessly require us to dput()
during superblock shutdown afaict. None of this however should cause any
behavioral changes for the initial securityfs instance.

>
> Incidentally, what happens if you open a file, pass it to somebody in a
> different userns and try to shut the opener's userns down?

I'm not exactly sure what you mean by "shutting down" and whether that's
a generic question or specific to this patch. I assume that you just
mean what happens when the last task for a userns exits and the userns
isn't pinned by e.g. a bind-mount of it to somewhere.

If you're just asking about the generic case then the opener's creds are
pinned by ->f_cred including the caller's userns at open-time. So it
will be released once that file has been closed.

If you're asking about what happens to the userns the
securityfs/tmpfs/mqueue/devpts etc. instance was mounted in it will be
pinned by the superblock which in turn is pinned by the open file.

Does that answer your question or did you have something else in mind?

2022-01-11 12:17:08

by Mimi Zohar

[permalink] [raw]

Subject: Re: [PATCH v8 01/19] securityfs: Extend securityfs with namespacing support

On Wed, 2022-01-05 at 11:18 +0100, Christian Brauner wrote:
> On Wed, Jan 05, 2022 at 03:58:11AM +0000, Al Viro wrote:
> > On Tue, Jan 04, 2022 at 12:03:58PM -0500, Stefan Berger wrote:
> > > From: Stefan Berger <[email protected]>

> > > Drop the additional dentry reference to enable simple cleanup of dentries
> > > upon umount. Now the dentries do not need to be explicitly freed anymore
> > > but we can just rely on d_genocide() and the dcache shrinker to do all
> > > the required work.
>
> The "additional dentry reference" mentioned only relates to an afaict
> unnecessary dget() in securityfs_create_dentry() which I pointed out
> as part of earlier reviews. But the phrasing implies that there's a
> behavioral change for the initial securityfs instance based on the
> removal of this additional dget() when there really isn't.
>
> After securityfs_create_dentry() has created a new dentry via
> lookup_one_len() and eventually called d_instantiate() it currently
> takes an additional reference on the newly created dentry via dget().
> This additional reference is then paired with an additional dput() in
> securityfs_remove(). I have not yet seen a reason why this is
> necessary maybe you can help there.
>
> For example, contrast this with debugfs which has the same underlying
> logic as securityfs, i.e. any created dentry pins the whole filesystem
> via simple_pin_fs() until the dentry is released and simple_unpin_fs()
> is called. It uses a similar pairing as securityfs: where securityfs
> has the securityfs_create_dentry() and securityfs_remove() pairing,
> debugfs has the __debugfs_create_file() and debugfs_remove() pairing.
> But debugfs doesn't take an additional reference on the just created
> dentry in __debugfs_create_file() which would need to be put in
> debugfs_remove().
>
> So if we contrast the creation routines of securityfs and debugfs directly
> condensed to just the dentry references:
>
> securityfs | debugfs
> ---------------- | ------------------
> |
> lookup_one_len() | lookup_one_len()
> d_instantiate() | d_instantiate()
> dget() |
>
> And I have not understood why securityfs would need that additional
> dget(). Not just intrinsically but also when contrasted with debugfs. So
> that additional dget() is removed as part of this patch.

Assuming it isn't needed, could removing it be a separate patch and
upstreamed independently of either the securityfs or IMA namespacing
changes?

thanks,

Mimi

>
> But the explanation in the commit message isn't ideal as it implies
> the removal of the additional dget() would have any impact on the
> pinning logic for securityfs when it does not.
>
> But the pinning logic doesn't make sense outside of the initial
> namespace which can never go away and there are security modules that
> have files or settings for the whole system that never go away and will
> always keep the filesystem around.
>
> But for unprivileged/userns containers that mount their own securityfs
> instance we want the securityfs instance cleaned up when it is
> unmounted. There is no need to duplicate the pinning logic or make the
> global securityfs instance display different information based on the
> userns. Both options would be really messy and hacky.
>
> Instead we can simply give each userns it's own securityfs instance
> similar to how each ipc ns has its own mqueue instance and all entries
> in there are cleaned up on umount or when the whole container is
> shutdown. After the container is shutdown all of the security module
> settings for the container go away with it anyway. So for that we don't
> want any filesystem pinning done in securityfs_create_dentry(). And we
> also really don't want the additional dget() that is currently taken in
> securityfs_create_dentry() as it would pointlessly require us to dput()
> during superblock shutdown afaict. None of this however should cause any
> behavioral changes for the initial securityfs instance.

2022-01-11 14:12:40

by Christian Brauner

[permalink] [raw]

Subject: Re: [PATCH v8 01/19] securityfs: Extend securityfs with namespacing support

On Tue, Jan 11, 2022 at 07:16:26AM -0500, Mimi Zohar wrote:
> On Wed, 2022-01-05 at 11:18 +0100, Christian Brauner wrote:
> > On Wed, Jan 05, 2022 at 03:58:11AM +0000, Al Viro wrote:
> > > On Tue, Jan 04, 2022 at 12:03:58PM -0500, Stefan Berger wrote:
> > > > From: Stefan Berger <[email protected]>
>
> > > > Drop the additional dentry reference to enable simple cleanup of dentries
> > > > upon umount. Now the dentries do not need to be explicitly freed anymore
> > > > but we can just rely on d_genocide() and the dcache shrinker to do all
> > > > the required work.
> >
> > The "additional dentry reference" mentioned only relates to an afaict
> > unnecessary dget() in securityfs_create_dentry() which I pointed out
> > as part of earlier reviews. But the phrasing implies that there's a
> > behavioral change for the initial securityfs instance based on the
> > removal of this additional dget() when there really isn't.
> >
> > After securityfs_create_dentry() has created a new dentry via
> > lookup_one_len() and eventually called d_instantiate() it currently
> > takes an additional reference on the newly created dentry via dget().
> > This additional reference is then paired with an additional dput() in
> > securityfs_remove(). I have not yet seen a reason why this is
> > necessary maybe you can help there.
> >
> > For example, contrast this with debugfs which has the same underlying
> > logic as securityfs, i.e. any created dentry pins the whole filesystem
> > via simple_pin_fs() until the dentry is released and simple_unpin_fs()
> > is called. It uses a similar pairing as securityfs: where securityfs
> > has the securityfs_create_dentry() and securityfs_remove() pairing,
> > debugfs has the __debugfs_create_file() and debugfs_remove() pairing.
> > But debugfs doesn't take an additional reference on the just created
> > dentry in __debugfs_create_file() which would need to be put in
> > debugfs_remove().
> >
> > So if we contrast the creation routines of securityfs and debugfs directly
> > condensed to just the dentry references:
> >
> > securityfs | debugfs
> > ---------------- | ------------------
> > |
> > lookup_one_len() | lookup_one_len()
> > d_instantiate() | d_instantiate()
> > dget() |
> >
> > And I have not understood why securityfs would need that additional
> > dget(). Not just intrinsically but also when contrasted with debugfs. So
> > that additional dget() is removed as part of this patch.
>
> Assuming it isn't needed, could removing it be a separate patch and
> upstreamed independently of either the securityfs or IMA namespacing
> changes?

Yeah, if the security tree wants to take it. So sm like:

From 478e96d1da24960e50897e6752f410b3d0833570 Mon Sep 17 00:00:00 2001
From: Christian Brauner <[email protected]>
Date: Tue, 11 Jan 2022 14:04:11 +0100
Subject: [PATCH] securityfs: rework dentry creation

When securityfs creates a new file or directory via
securityfs_create_dentry() it will take an additional reference on the
newly created dentry after it has attached the new inode to the new
dentry and added it to the hashqueues.
If we contrast this with debugfs which has the same underlying logic as
securityfs. It uses a similar pairing as securityfs. Where securityfs
has the securityfs_create_dentry() and securityfs_remove() pairing,
debugfs has the __debugfs_create_file() and debugfs_remove() pairing.

In contrast to securityfs, debugfs doesn't take an additional reference
on the newly created dentry in __debugfs_create_file() which would need
to be put in debugfs_remove().

The additional dget() isn't a problem per se. In the current
implementation of securityfs each created dentry pins the filesystem via
until it is removed. Since it is virtually guaranteed that there is at
least one user of securityfs that has created dentries the initial
securityfs mount cannot go away until all dentries have been removed.

Since most of the users of the initial securityfs mount don't go away
until the system is shutdown the initial securityfs won't go away when
unmounted. Instead a mount will usually surface the same superblock as
before. The additional dget() doesn't matter in this scenario since it
is required that all dentries have been cleaned up by the respective
users before the superblock can be destroyed, i.e. superblock shutdown
is tied to the lifetime of the associated dentries.

However, in order to support ima namespaces we need to extend securityfs
to support being mounted outside of the initial user namespace. For
namespaced users the pinning logic doesn't make sense. Whereas in the
initial namespace the securityfs instance and the associated data
structures of its users can't go away for reason explained earlier users
of non-initial securityfs instances do go away when the last users of
the namespace are gone.

So for those users we neither want to duplicate the pinning logic nor
make the global securityfs instance display different information based
on the namespace. Both options would be really messy and hacky.

Instead we will simply give each namespace its own securityfs instance
similar to how each ipc namespace has its own mqueue instance and all
entries in there are cleaned up on umount or when the last user of the
associated namespace is gone.

This means that the superblock's lifetime isn't tied to the dentries.
Instead the last umount, without any fds kept open, will trigger a clean
shutdown. But now the additional dget() gets in the way. Instead of
being able to rely on the generic superblock shutdown logic we would
need to drop the additional dentry reference during superblock shutdown
for all associated users. That would force the use of a generic
coordination mechanism for current and future users of securityfs which
is unnecessary. Simply remove the additional dget() in
securityfs_dentry_create().

In securityfs_remove() we will call dget() to take an additional
reference on the dentry about to be removed. After simple_unlink() or
simple_rmdir() have dropped the dentry refcount we can call d_delete()
which will either turn the dentry into negative dentry if our earlier
dget() is the only reference to the dentry, i.e. it has no other users,
or remove it from the hashqueues in case there are additional users.

All of these changes should not have any effect on the userspace
semantics of the initial securityfs mount.

Signed-off-by: Christian Brauner <[email protected]>
---
security/inode.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/security/inode.c b/security/inode.c
index 6c326939750d..13e6780c4444 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -159,7 +159,6 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
inode->i_fop = fops;
}
d_instantiate(dentry, inode);
- dget(dentry);
inode_unlock(dir);
return dentry;

@@ -302,10 +301,12 @@ void securityfs_remove(struct dentry *dentry)
dir = d_inode(dentry->d_parent);
inode_lock(dir);
if (simple_positive(dentry)) {
+ dget(dentry);
if (d_is_dir(dentry))
simple_rmdir(dir, dentry);
else
simple_unlink(dir, dentry);
+ d_delete(dentry);
dput(dentry);
}
inode_unlock(dir);
--
2.32.0