2020-04-22 14:57:28

by Christian Brauner

[permalink] [raw]
Subject: [PATCH v2 5/7] loop: preserve sysfs backwards compatibility

For sysfs the initial namespace is special. All devices currently
propagate into all non-initial namespaces. For example, sysfs is usually
mounted in a privileged or unprivileged container and all devices are
visible to the container but are owned by global root. Even though none
of the propagated files can be used there are still a lot of read-only
values that are accessed or read by tools running in non-initial
namespaces. Some devices though, which can be moved or created in
another namespace, will only show up in the corresponding namespace.
This currently includes network and loop devices but no other ones.
Since all current workloads depend on devices from the inital namespace
being visible this behavior cannot be simply changed. This patch just
makes sure to keep propagating devices that share the same device class
with loop devices from the initial namespaces into all non-initial
namespaces as before. In short, nothing changes only loopfs loop devices
will be shown in their correct namespace.

Cc: Jens Axboe <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Signed-off-by: Christian Brauner <[email protected]>
---
/* v2 */
- Christian Brauner <[email protected]>:
- Protect init_net with a CONFIG_NET ifdef in case it is set to "n".
- As Tejun pointed out there is argument to be made that a new mount
option for sysfs could be added that would change how devices are
propagated. This patch does not prevent this but it is an orthogonal
problem.
---
block/genhd.c | 79 +++++++++++++++++++++++++++++++++++++
fs/kernfs/dir.c | 34 +++++++++++++---
fs/kernfs/kernfs-internal.h | 24 +++++++++++
fs/sysfs/mount.c | 4 ++
include/linux/genhd.h | 3 ++
include/linux/kernfs.h | 22 +++++++++++
include/linux/kobject_ns.h | 4 ++
lib/kobject.c | 2 +
8 files changed, 167 insertions(+), 5 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 06b642b23a07..b5b2601c4311 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1198,11 +1198,81 @@ static struct kobject *base_probe(dev_t devt, int *partno, void *data)
return NULL;
}

+#ifdef CONFIG_BLK_DEV_LOOPFS
+static void *user_grab_current_ns(void)
+{
+ struct user_namespace *ns = current_user_ns();
+ return get_user_ns(ns);
+}
+
+static const void *user_initial_ns(void)
+{
+ return &init_user_ns;
+}
+
+static void user_put_ns(void *p)
+{
+ struct user_namespace *ns = p;
+ put_user_ns(ns);
+}
+
+static bool user_current_may_mount(void)
+{
+ return ns_capable(current_user_ns(), CAP_SYS_ADMIN);
+}
+
+const struct kobj_ns_type_operations user_ns_type_operations = {
+ .type = KOBJ_NS_TYPE_USER,
+ .current_may_mount = user_current_may_mount,
+ .grab_current_ns = user_grab_current_ns,
+ .initial_ns = user_initial_ns,
+ .drop_ns = user_put_ns,
+};
+
+static const void *block_class_user_namespace(struct device *dev)
+{
+ struct gendisk *disk;
+
+ if (dev->type == &part_type)
+ disk = part_to_disk(dev_to_part(dev));
+ else
+ disk = dev_to_disk(dev);
+
+ return disk->user_ns;
+}
+
+static void block_class_get_ownership(struct device *dev, kuid_t *uid, kgid_t *gid)
+{
+ struct gendisk *disk;
+ struct user_namespace *ns;
+
+ if (dev->type == &part_type)
+ disk = part_to_disk(dev_to_part(dev));
+ else
+ disk = dev_to_disk(dev);
+
+ ns = disk->user_ns;
+ if (ns && ns != &init_user_ns) {
+ kuid_t ns_root_uid = make_kuid(ns, 0);
+ kgid_t ns_root_gid = make_kgid(ns, 0);
+
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+ }
+}
+#endif /* CONFIG_BLK_DEV_LOOPFS */
+
static int __init genhd_device_init(void)
{
int error;

block_class.dev_kobj = sysfs_dev_block_kobj;
+#ifdef CONFIG_BLK_DEV_LOOPFS
+ kobj_ns_type_register(&user_ns_type_operations);
+#endif
error = class_register(&block_class);
if (unlikely(error))
return error;
@@ -1524,8 +1594,14 @@ static void disk_release(struct device *dev)
blk_put_queue(disk->queue);
kfree(disk);
}
+
struct class block_class = {
.name = "block",
+#ifdef CONFIG_BLK_DEV_LOOPFS
+ .ns_type = &user_ns_type_operations,
+ .namespace = block_class_user_namespace,
+ .get_ownership = block_class_get_ownership,
+#endif
};

static char *block_devnode(struct device *dev, umode_t *mode,
@@ -1715,6 +1791,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk));
+#ifdef CONFIG_BLK_DEV_LOOPFS
+ disk->user_ns = &init_user_ns;
+#endif
}
return disk;
}
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 1f2d894ae454..02796ba6521a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -575,10 +575,15 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
goto out_bad;

/* The kernfs node has been moved to a different namespace */
- if (kn->parent && kernfs_ns_enabled(kn->parent) &&
- kernfs_info(dentry->d_sb)->ns[kn->ns_type] != kn->ns)
- goto out_bad;
+ if (kn->parent && kernfs_ns_enabled(kn->parent)) {
+ if (kernfs_init_ns_propagates(kn->parent) &&
+ kn->ns == kernfs_init_ns(kn->parent->ns_type))
+ goto out_good;
+ if (kernfs_info(dentry->d_sb)->ns[kn->parent->ns_type] != kn->ns)
+ goto out_bad;
+ }

+out_good:
mutex_unlock(&kernfs_mutex);
return 1;
out_bad:
@@ -1090,6 +1095,10 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
ns = kernfs_info(dir->i_sb)->ns[parent->ns_type];

kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
+ if (!kn && kernfs_init_ns_propagates(parent)) {
+ ns = kernfs_init_ns(parent->ns_type);
+ kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
+ }

/* no such entry */
if (!kn || !kernfs_active(kn)) {
@@ -1614,6 +1623,8 @@ static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
static struct kernfs_node *kernfs_dir_pos(const void *ns,
struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
+ const void *init_ns;
+
if (pos) {
int valid = kernfs_active(pos) &&
pos->parent == parent && hash == pos->hash;
@@ -1621,6 +1632,12 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
if (!valid)
pos = NULL;
}
+
+ if (kernfs_init_ns_propagates(parent))
+ init_ns = kernfs_init_ns(parent->ns_type);
+ else
+ init_ns = NULL;
+
if (!pos && (hash > 1) && (hash < INT_MAX)) {
struct rb_node *node = parent->dir.children.rb_node;
while (node) {
@@ -1635,7 +1652,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
}
}
/* Skip over entries which are dying/dead or in the wrong namespace */
- while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
+ while (pos && (!kernfs_active(pos) || (pos->ns != ns && pos->ns != init_ns))) {
struct rb_node *node = rb_next(&pos->rb);
if (!node)
pos = NULL;
@@ -1650,13 +1667,20 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
{
pos = kernfs_dir_pos(ns, parent, ino, pos);
if (pos) {
+ const void *init_ns;
+ if (kernfs_init_ns_propagates(parent))
+ init_ns = kernfs_init_ns(parent->ns_type);
+ else
+ init_ns = NULL;
+
do {
struct rb_node *node = rb_next(&pos->rb);
if (!node)
pos = NULL;
else
pos = rb_to_kn(node);
- } while (pos && (!kernfs_active(pos) || pos->ns != ns));
+ } while (pos && (!kernfs_active(pos) ||
+ (pos->ns != ns && pos->ns != init_ns)));
}
return pos;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 7c972c00f84a..74eb6c447361 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -80,6 +80,30 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
return d_inode(dentry)->i_private;
}

+#ifdef CONFIG_NET
+extern struct net init_net;
+#endif
+
+extern struct user_namespace init_user_ns;
+
+static inline const void *kernfs_init_ns(enum kobj_ns_type ns_type)
+{
+ switch (ns_type) {
+ case KOBJ_NS_TYPE_NET:
+#ifdef CONFIG_NET
+ return &init_net;
+#else
+ break;
+#endif
+ case KOBJ_NS_TYPE_USER:
+ return &init_user_ns;
+ default:
+ pr_debug("Unsupported namespace type %d for kernfs\n", ns_type);
+ }
+
+ return NULL;
+}
+
extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;

diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 5e2ec88a709e..99b82a0ae7ea 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -43,6 +43,8 @@ static void sysfs_fs_context_free(struct fs_context *fc)

if (kfc->ns_tag[KOBJ_NS_TYPE_NET])
kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag[KOBJ_NS_TYPE_NET]);
+ if (kfc->ns_tag[KOBJ_NS_TYPE_USER])
+ kobj_ns_drop(KOBJ_NS_TYPE_USER, kfc->ns_tag[KOBJ_NS_TYPE_USER]);
kernfs_free_fs_context(fc);
kfree(kfc);
}
@@ -67,6 +69,7 @@ static int sysfs_init_fs_context(struct fs_context *fc)
return -ENOMEM;

kfc->ns_tag[KOBJ_NS_TYPE_NET] = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+ kfc->ns_tag[KOBJ_NS_TYPE_USER] = kobj_ns_grab_current(KOBJ_NS_TYPE_USER);
kfc->root = sysfs_root;
kfc->magic = SYSFS_MAGIC;
fc->fs_private = kfc;
@@ -85,6 +88,7 @@ static void sysfs_kill_sb(struct super_block *sb)

kernfs_kill_sb(sb);
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns[KOBJ_NS_TYPE_NET]);
+ kobj_ns_drop(KOBJ_NS_TYPE_USER, ns[KOBJ_NS_TYPE_USER]);
}

static struct file_system_type sysfs_fs_type = {
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 9b3fffdf4011..4edf8a82e815 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -220,6 +220,9 @@ struct gendisk {
int node_id;
struct badblocks *bb;
struct lockdep_map lockdep_map;
+#ifdef CONFIG_BLK_DEV_LOOPFS
+ struct user_namespace *user_ns;
+#endif
};

static inline struct gendisk *part_to_disk(struct hd_struct *part)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index d0544f2e0c99..7809584a7c8e 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -53,6 +53,7 @@ enum kernfs_node_flag {
KERNFS_SUICIDED = 0x0800,
KERNFS_EMPTY_DIR = 0x1000,
KERNFS_HAS_RELEASE = 0x2000,
+ KERNFS_NS_PROPAGATE = 0x4000,
};

/* @flags for kernfs_create_root() */
@@ -337,6 +338,27 @@ static inline void kernfs_enable_ns(struct kernfs_node *kn,
kn->ns_type = ns_type;
}

+static inline void kernfs_enable_init_ns_propagates(struct kernfs_node *kn)
+{
+ WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));
+ WARN_ON_ONCE(!(kn->flags & KERNFS_NS));
+ kn->flags |= KERNFS_NS_PROPAGATE;
+}
+
+/**
+ * kernfs_init_ns_propagates - test whether init ns propagates
+ * @kn: the node to test
+ *
+ * Test whether kernfs entries created in the init namespace propagate into
+ * other namespaces.
+ */
+static inline bool kernfs_init_ns_propagates(const struct kernfs_node *kn)
+{
+ return ((kn->flags & (KERNFS_NS | KERNFS_NS_PROPAGATE)) ==
+ (KERNFS_NS | KERNFS_NS_PROPAGATE));
+}
+
/**
* kernfs_ns_enabled - test whether namespace is enabled
* @kn: the node to test
diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h
index 991a9286bcea..a9c45bcce235 100644
--- a/include/linux/kobject_ns.h
+++ b/include/linux/kobject_ns.h
@@ -26,6 +26,7 @@ struct kobject;
enum kobj_ns_type {
KOBJ_NS_TYPE_NONE = 0,
KOBJ_NS_TYPE_NET,
+ KOBJ_NS_TYPE_USER,
KOBJ_NS_TYPES
};

@@ -34,6 +35,8 @@ enum kobj_ns_type {
* @grab_current_ns: return a new reference to calling task's namespace
* @initial_ns: return the initial namespace (i.e. init_net_ns)
* @drop_ns: drops a reference to namespace
+ * @initial_ns_propagates: whether devices in the initial namespace propagate
+ * to all other namespaces
*/
struct kobj_ns_type_operations {
enum kobj_ns_type type;
@@ -41,6 +44,7 @@ struct kobj_ns_type_operations {
void *(*grab_current_ns)(void);
const void *(*initial_ns)(void);
void (*drop_ns)(void *);
+ bool (*initial_ns_propagates)(void);
};

int kobj_ns_type_register(const struct kobj_ns_type_operations *ops);
diff --git a/lib/kobject.c b/lib/kobject.c
index c58c62d49a10..96bb8c732d1c 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -121,6 +121,8 @@ static int create_dir(struct kobject *kobj)
BUG_ON(!kobj_ns_type_registered(ops->type));

sysfs_enable_ns(kobj->sd, ops->type);
+ if (ops->initial_ns_propagates && ops->initial_ns_propagates())
+ kernfs_enable_init_ns_propagates(kobj->sd);
}

return 0;
--
2.26.1


2020-04-23 01:18:29

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH v2 5/7] loop: preserve sysfs backwards compatibility

On Wed, Apr 22, 2020 at 04:54:35PM +0200, Christian Brauner wrote:
> For sysfs the initial namespace is special. All devices currently
> propagate into all non-initial namespaces. For example, sysfs is usually
> mounted in a privileged or unprivileged container and all devices are
> visible to the container but are owned by global root. Even though none
> of the propagated files can be used there are still a lot of read-only
> values that are accessed or read by tools running in non-initial
> namespaces. Some devices though, which can be moved or created in
> another namespace, will only show up in the corresponding namespace.
> This currently includes network and loop devices but no other ones.
> Since all current workloads depend on devices from the inital namespace
> being visible this behavior cannot be simply changed. This patch just
> makes sure to keep propagating devices that share the same device class
> with loop devices from the initial namespaces into all non-initial
> namespaces as before. In short, nothing changes only loopfs loop devices
> will be shown in their correct namespace.
>
> Cc: Jens Axboe <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Signed-off-by: Christian Brauner <[email protected]>

Hi,

two comments below:

> ---
> /* v2 */
> - Christian Brauner <[email protected]>:
> - Protect init_net with a CONFIG_NET ifdef in case it is set to "n".
> - As Tejun pointed out there is argument to be made that a new mount
> option for sysfs could be added that would change how devices are
> propagated. This patch does not prevent this but it is an orthogonal
> problem.
> ---
> block/genhd.c | 79 +++++++++++++++++++++++++++++++++++++
> fs/kernfs/dir.c | 34 +++++++++++++---
> fs/kernfs/kernfs-internal.h | 24 +++++++++++
> fs/sysfs/mount.c | 4 ++
> include/linux/genhd.h | 3 ++
> include/linux/kernfs.h | 22 +++++++++++
> include/linux/kobject_ns.h | 4 ++
> lib/kobject.c | 2 +
> 8 files changed, 167 insertions(+), 5 deletions(-)
>
> diff --git a/block/genhd.c b/block/genhd.c
> index 06b642b23a07..b5b2601c4311 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1198,11 +1198,81 @@ static struct kobject *base_probe(dev_t devt, int *partno, void *data)
> return NULL;
> }
>
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> +static void *user_grab_current_ns(void)
> +{
> + struct user_namespace *ns = current_user_ns();
> + return get_user_ns(ns);
> +}
> +
> +static const void *user_initial_ns(void)
> +{
> + return &init_user_ns;
> +}
> +
> +static void user_put_ns(void *p)
> +{
> + struct user_namespace *ns = p;
> + put_user_ns(ns);
> +}
> +
> +static bool user_current_may_mount(void)
> +{
> + return ns_capable(current_user_ns(), CAP_SYS_ADMIN);
> +}
> +
> +const struct kobj_ns_type_operations user_ns_type_operations = {
> + .type = KOBJ_NS_TYPE_USER,
> + .current_may_mount = user_current_may_mount,
> + .grab_current_ns = user_grab_current_ns,
> + .initial_ns = user_initial_ns,
> + .drop_ns = user_put_ns,
> +};
> +
> +static const void *block_class_user_namespace(struct device *dev)
> +{
> + struct gendisk *disk;
> +
> + if (dev->type == &part_type)
> + disk = part_to_disk(dev_to_part(dev));
> + else
> + disk = dev_to_disk(dev);
> +
> + return disk->user_ns;
> +}
> +
> +static void block_class_get_ownership(struct device *dev, kuid_t *uid, kgid_t *gid)
> +{
> + struct gendisk *disk;
> + struct user_namespace *ns;
> +
> + if (dev->type == &part_type)
> + disk = part_to_disk(dev_to_part(dev));
> + else
> + disk = dev_to_disk(dev);
> +
> + ns = disk->user_ns;
> + if (ns && ns != &init_user_ns) {
> + kuid_t ns_root_uid = make_kuid(ns, 0);
> + kgid_t ns_root_gid = make_kgid(ns, 0);
> +
> + if (uid_valid(ns_root_uid))
> + *uid = ns_root_uid;
> +
> + if (gid_valid(ns_root_gid))
> + *gid = ns_root_gid;
> + }

You're not setting uid and gid in the else case?

> +}
> +#endif /* CONFIG_BLK_DEV_LOOPFS */
> +
> static int __init genhd_device_init(void)
> {
> int error;
>
> block_class.dev_kobj = sysfs_dev_block_kobj;
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> + kobj_ns_type_register(&user_ns_type_operations);
> +#endif
> error = class_register(&block_class);
> if (unlikely(error))
> return error;
> @@ -1524,8 +1594,14 @@ static void disk_release(struct device *dev)
> blk_put_queue(disk->queue);
> kfree(disk);
> }
> +
> struct class block_class = {
> .name = "block",
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> + .ns_type = &user_ns_type_operations,
> + .namespace = block_class_user_namespace,
> + .get_ownership = block_class_get_ownership,
> +#endif
> };
>
> static char *block_devnode(struct device *dev, umode_t *mode,
> @@ -1715,6 +1791,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
> disk_to_dev(disk)->class = &block_class;
> disk_to_dev(disk)->type = &disk_type;
> device_initialize(disk_to_dev(disk));
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> + disk->user_ns = &init_user_ns;
> +#endif
> }
> return disk;
> }
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 1f2d894ae454..02796ba6521a 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -575,10 +575,15 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
> goto out_bad;
>
> /* The kernfs node has been moved to a different namespace */
> - if (kn->parent && kernfs_ns_enabled(kn->parent) &&
> - kernfs_info(dentry->d_sb)->ns[kn->ns_type] != kn->ns)
> - goto out_bad;
> + if (kn->parent && kernfs_ns_enabled(kn->parent)) {
> + if (kernfs_init_ns_propagates(kn->parent) &&
> + kn->ns == kernfs_init_ns(kn->parent->ns_type))
> + goto out_good;
> + if (kernfs_info(dentry->d_sb)->ns[kn->parent->ns_type] != kn->ns)
> + goto out_bad;
> + }
>
> +out_good:
> mutex_unlock(&kernfs_mutex);
> return 1;
> out_bad:
> @@ -1090,6 +1095,10 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
> ns = kernfs_info(dir->i_sb)->ns[parent->ns_type];
>
> kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> + if (!kn && kernfs_init_ns_propagates(parent)) {
> + ns = kernfs_init_ns(parent->ns_type);
> + kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> + }
>
> /* no such entry */
> if (!kn || !kernfs_active(kn)) {
> @@ -1614,6 +1623,8 @@ static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
> static struct kernfs_node *kernfs_dir_pos(const void *ns,
> struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
> {
> + const void *init_ns;
> +
> if (pos) {
> int valid = kernfs_active(pos) &&
> pos->parent == parent && hash == pos->hash;
> @@ -1621,6 +1632,12 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
> if (!valid)
> pos = NULL;
> }
> +
> + if (kernfs_init_ns_propagates(parent))
> + init_ns = kernfs_init_ns(parent->ns_type);
> + else
> + init_ns = NULL;
> +
> if (!pos && (hash > 1) && (hash < INT_MAX)) {
> struct rb_node *node = parent->dir.children.rb_node;
> while (node) {
> @@ -1635,7 +1652,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
> }
> }
> /* Skip over entries which are dying/dead or in the wrong namespace */
> - while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
> + while (pos && (!kernfs_active(pos) || (pos->ns != ns && pos->ns != init_ns))) {
> struct rb_node *node = rb_next(&pos->rb);
> if (!node)
> pos = NULL;
> @@ -1650,13 +1667,20 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
> {
> pos = kernfs_dir_pos(ns, parent, ino, pos);
> if (pos) {
> + const void *init_ns;
> + if (kernfs_init_ns_propagates(parent))
> + init_ns = kernfs_init_ns(parent->ns_type);
> + else
> + init_ns = NULL;
> +
> do {
> struct rb_node *node = rb_next(&pos->rb);
> if (!node)
> pos = NULL;
> else
> pos = rb_to_kn(node);
> - } while (pos && (!kernfs_active(pos) || pos->ns != ns));
> + } while (pos && (!kernfs_active(pos) ||
> + (pos->ns != ns && pos->ns != init_ns)));
> }
> return pos;
> }
> diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
> index 7c972c00f84a..74eb6c447361 100644
> --- a/fs/kernfs/kernfs-internal.h
> +++ b/fs/kernfs/kernfs-internal.h
> @@ -80,6 +80,30 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
> return d_inode(dentry)->i_private;
> }
>
> +#ifdef CONFIG_NET
> +extern struct net init_net;
> +#endif
> +
> +extern struct user_namespace init_user_ns;
> +
> +static inline const void *kernfs_init_ns(enum kobj_ns_type ns_type)
> +{
> + switch (ns_type) {
> + case KOBJ_NS_TYPE_NET:
> +#ifdef CONFIG_NET
> + return &init_net;
> +#else
> + break;
> +#endif
> + case KOBJ_NS_TYPE_USER:
> + return &init_user_ns;
> + default:
> + pr_debug("Unsupported namespace type %d for kernfs\n", ns_type);
> + }
> +
> + return NULL;
> +}
> +
> extern const struct super_operations kernfs_sops;
> extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
>
> diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
> index 5e2ec88a709e..99b82a0ae7ea 100644
> --- a/fs/sysfs/mount.c
> +++ b/fs/sysfs/mount.c
> @@ -43,6 +43,8 @@ static void sysfs_fs_context_free(struct fs_context *fc)
>
> if (kfc->ns_tag[KOBJ_NS_TYPE_NET])
> kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag[KOBJ_NS_TYPE_NET]);
> + if (kfc->ns_tag[KOBJ_NS_TYPE_USER])
> + kobj_ns_drop(KOBJ_NS_TYPE_USER, kfc->ns_tag[KOBJ_NS_TYPE_USER]);
> kernfs_free_fs_context(fc);
> kfree(kfc);
> }
> @@ -67,6 +69,7 @@ static int sysfs_init_fs_context(struct fs_context *fc)
> return -ENOMEM;
>
> kfc->ns_tag[KOBJ_NS_TYPE_NET] = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
> + kfc->ns_tag[KOBJ_NS_TYPE_USER] = kobj_ns_grab_current(KOBJ_NS_TYPE_USER);

It's nice and tidy this way so maybe worth it, but getting
the kobj_ns_type_lock spinlock twice in a row here seems
unfortunate.

> kfc->root = sysfs_root;
> kfc->magic = SYSFS_MAGIC;
> fc->fs_private = kfc;
> @@ -85,6 +88,7 @@ static void sysfs_kill_sb(struct super_block *sb)
>
> kernfs_kill_sb(sb);
> kobj_ns_drop(KOBJ_NS_TYPE_NET, ns[KOBJ_NS_TYPE_NET]);
> + kobj_ns_drop(KOBJ_NS_TYPE_USER, ns[KOBJ_NS_TYPE_USER]);
> }
>
> static struct file_system_type sysfs_fs_type = {
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 9b3fffdf4011..4edf8a82e815 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -220,6 +220,9 @@ struct gendisk {
> int node_id;
> struct badblocks *bb;
> struct lockdep_map lockdep_map;
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> + struct user_namespace *user_ns;
> +#endif
> };
>
> static inline struct gendisk *part_to_disk(struct hd_struct *part)
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index d0544f2e0c99..7809584a7c8e 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -53,6 +53,7 @@ enum kernfs_node_flag {
> KERNFS_SUICIDED = 0x0800,
> KERNFS_EMPTY_DIR = 0x1000,
> KERNFS_HAS_RELEASE = 0x2000,
> + KERNFS_NS_PROPAGATE = 0x4000,
> };
>
> /* @flags for kernfs_create_root() */
> @@ -337,6 +338,27 @@ static inline void kernfs_enable_ns(struct kernfs_node *kn,
> kn->ns_type = ns_type;
> }
>
> +static inline void kernfs_enable_init_ns_propagates(struct kernfs_node *kn)
> +{
> + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
> + WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));
> + WARN_ON_ONCE(!(kn->flags & KERNFS_NS));
> + kn->flags |= KERNFS_NS_PROPAGATE;
> +}
> +
> +/**
> + * kernfs_init_ns_propagates - test whether init ns propagates
> + * @kn: the node to test
> + *
> + * Test whether kernfs entries created in the init namespace propagate into
> + * other namespaces.
> + */
> +static inline bool kernfs_init_ns_propagates(const struct kernfs_node *kn)
> +{
> + return ((kn->flags & (KERNFS_NS | KERNFS_NS_PROPAGATE)) ==
> + (KERNFS_NS | KERNFS_NS_PROPAGATE));
> +}
> +
> /**
> * kernfs_ns_enabled - test whether namespace is enabled
> * @kn: the node to test
> diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h
> index 991a9286bcea..a9c45bcce235 100644
> --- a/include/linux/kobject_ns.h
> +++ b/include/linux/kobject_ns.h
> @@ -26,6 +26,7 @@ struct kobject;
> enum kobj_ns_type {
> KOBJ_NS_TYPE_NONE = 0,
> KOBJ_NS_TYPE_NET,
> + KOBJ_NS_TYPE_USER,
> KOBJ_NS_TYPES
> };
>
> @@ -34,6 +35,8 @@ enum kobj_ns_type {
> * @grab_current_ns: return a new reference to calling task's namespace
> * @initial_ns: return the initial namespace (i.e. init_net_ns)
> * @drop_ns: drops a reference to namespace
> + * @initial_ns_propagates: whether devices in the initial namespace propagate
> + * to all other namespaces
> */
> struct kobj_ns_type_operations {
> enum kobj_ns_type type;
> @@ -41,6 +44,7 @@ struct kobj_ns_type_operations {
> void *(*grab_current_ns)(void);
> const void *(*initial_ns)(void);
> void (*drop_ns)(void *);
> + bool (*initial_ns_propagates)(void);
> };
>
> int kobj_ns_type_register(const struct kobj_ns_type_operations *ops);
> diff --git a/lib/kobject.c b/lib/kobject.c
> index c58c62d49a10..96bb8c732d1c 100644
> --- a/lib/kobject.c
> +++ b/lib/kobject.c
> @@ -121,6 +121,8 @@ static int create_dir(struct kobject *kobj)
> BUG_ON(!kobj_ns_type_registered(ops->type));
>
> sysfs_enable_ns(kobj->sd, ops->type);
> + if (ops->initial_ns_propagates && ops->initial_ns_propagates())
> + kernfs_enable_init_ns_propagates(kobj->sd);
> }
>
> return 0;
> --
> 2.26.1

2020-04-23 11:21:01

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH v2 5/7] loop: preserve sysfs backwards compatibility

On Wed, Apr 22, 2020 at 08:17:06PM -0500, Serge Hallyn wrote:
> On Wed, Apr 22, 2020 at 04:54:35PM +0200, Christian Brauner wrote:
> > For sysfs the initial namespace is special. All devices currently
> > propagate into all non-initial namespaces. For example, sysfs is usually
> > mounted in a privileged or unprivileged container and all devices are
> > visible to the container but are owned by global root. Even though none
> > of the propagated files can be used there are still a lot of read-only
> > values that are accessed or read by tools running in non-initial
> > namespaces. Some devices though, which can be moved or created in
> > another namespace, will only show up in the corresponding namespace.
> > This currently includes network and loop devices but no other ones.
> > Since all current workloads depend on devices from the inital namespace
> > being visible this behavior cannot be simply changed. This patch just
> > makes sure to keep propagating devices that share the same device class
> > with loop devices from the initial namespaces into all non-initial
> > namespaces as before. In short, nothing changes only loopfs loop devices
> > will be shown in their correct namespace.
> >
> > Cc: Jens Axboe <[email protected]>
> > Cc: Tejun Heo <[email protected]>
> > Cc: Greg Kroah-Hartman <[email protected]>
> > Signed-off-by: Christian Brauner <[email protected]>
>
> Hi,
>
> two comments below:
>
> > ---
> > /* v2 */
> > - Christian Brauner <[email protected]>:
> > - Protect init_net with a CONFIG_NET ifdef in case it is set to "n".
> > - As Tejun pointed out there is argument to be made that a new mount
> > option for sysfs could be added that would change how devices are
> > propagated. This patch does not prevent this but it is an orthogonal
> > problem.
> > ---
> > block/genhd.c | 79 +++++++++++++++++++++++++++++++++++++
> > fs/kernfs/dir.c | 34 +++++++++++++---
> > fs/kernfs/kernfs-internal.h | 24 +++++++++++
> > fs/sysfs/mount.c | 4 ++
> > include/linux/genhd.h | 3 ++
> > include/linux/kernfs.h | 22 +++++++++++
> > include/linux/kobject_ns.h | 4 ++
> > lib/kobject.c | 2 +
> > 8 files changed, 167 insertions(+), 5 deletions(-)
> >
> > diff --git a/block/genhd.c b/block/genhd.c
> > index 06b642b23a07..b5b2601c4311 100644
> > --- a/block/genhd.c
> > +++ b/block/genhd.c
> > @@ -1198,11 +1198,81 @@ static struct kobject *base_probe(dev_t devt, int *partno, void *data)
> > return NULL;
> > }
> >
> > +#ifdef CONFIG_BLK_DEV_LOOPFS
> > +static void *user_grab_current_ns(void)
> > +{
> > + struct user_namespace *ns = current_user_ns();
> > + return get_user_ns(ns);
> > +}
> > +
> > +static const void *user_initial_ns(void)
> > +{
> > + return &init_user_ns;
> > +}
> > +
> > +static void user_put_ns(void *p)
> > +{
> > + struct user_namespace *ns = p;
> > + put_user_ns(ns);
> > +}
> > +
> > +static bool user_current_may_mount(void)
> > +{
> > + return ns_capable(current_user_ns(), CAP_SYS_ADMIN);
> > +}
> > +
> > +const struct kobj_ns_type_operations user_ns_type_operations = {
> > + .type = KOBJ_NS_TYPE_USER,
> > + .current_may_mount = user_current_may_mount,
> > + .grab_current_ns = user_grab_current_ns,
> > + .initial_ns = user_initial_ns,
> > + .drop_ns = user_put_ns,
> > +};
> > +
> > +static const void *block_class_user_namespace(struct device *dev)
> > +{
> > + struct gendisk *disk;
> > +
> > + if (dev->type == &part_type)
> > + disk = part_to_disk(dev_to_part(dev));
> > + else
> > + disk = dev_to_disk(dev);
> > +
> > + return disk->user_ns;
> > +}
> > +
> > +static void block_class_get_ownership(struct device *dev, kuid_t *uid, kgid_t *gid)
> > +{
> > + struct gendisk *disk;
> > + struct user_namespace *ns;
> > +
> > + if (dev->type == &part_type)
> > + disk = part_to_disk(dev_to_part(dev));
> > + else
> > + disk = dev_to_disk(dev);
> > +
> > + ns = disk->user_ns;
> > + if (ns && ns != &init_user_ns) {
> > + kuid_t ns_root_uid = make_kuid(ns, 0);
> > + kgid_t ns_root_gid = make_kgid(ns, 0);
> > +
> > + if (uid_valid(ns_root_uid))
> > + *uid = ns_root_uid;
> > +
> > + if (gid_valid(ns_root_gid))
> > + *gid = ns_root_gid;
> > + }
>
> You're not setting uid and gid in the else case?

Right, the reason being that sysfs and the associated kobject
infrastructure will always set global root as the default. So the
callchain is:
kobject_get_ownership()
and this calls the ktype callbacks which hits
-> device_get_ownership()
which calls into the device class specific callbacks which in this is
case calls block_class_get_ownership().

And there's no direct callers of, say <device-class>->get_ownership()
that all needs to always go through the callback infrastructure.

>
> > +}
> > +#endif /* CONFIG_BLK_DEV_LOOPFS */
> > +
> > static int __init genhd_device_init(void)
> > {
> > int error;
> >
> > block_class.dev_kobj = sysfs_dev_block_kobj;
> > +#ifdef CONFIG_BLK_DEV_LOOPFS
> > + kobj_ns_type_register(&user_ns_type_operations);
> > +#endif
> > error = class_register(&block_class);
> > if (unlikely(error))
> > return error;
> > @@ -1524,8 +1594,14 @@ static void disk_release(struct device *dev)
> > blk_put_queue(disk->queue);
> > kfree(disk);
> > }
> > +
> > struct class block_class = {
> > .name = "block",
> > +#ifdef CONFIG_BLK_DEV_LOOPFS
> > + .ns_type = &user_ns_type_operations,
> > + .namespace = block_class_user_namespace,
> > + .get_ownership = block_class_get_ownership,
> > +#endif
> > };
> >
> > static char *block_devnode(struct device *dev, umode_t *mode,
> > @@ -1715,6 +1791,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
> > disk_to_dev(disk)->class = &block_class;
> > disk_to_dev(disk)->type = &disk_type;
> > device_initialize(disk_to_dev(disk));
> > +#ifdef CONFIG_BLK_DEV_LOOPFS
> > + disk->user_ns = &init_user_ns;
> > +#endif
> > }
> > return disk;
> > }
> > diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> > index 1f2d894ae454..02796ba6521a 100644
> > --- a/fs/kernfs/dir.c
> > +++ b/fs/kernfs/dir.c
> > @@ -575,10 +575,15 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
> > goto out_bad;
> >
> > /* The kernfs node has been moved to a different namespace */
> > - if (kn->parent && kernfs_ns_enabled(kn->parent) &&
> > - kernfs_info(dentry->d_sb)->ns[kn->ns_type] != kn->ns)
> > - goto out_bad;
> > + if (kn->parent && kernfs_ns_enabled(kn->parent)) {
> > + if (kernfs_init_ns_propagates(kn->parent) &&
> > + kn->ns == kernfs_init_ns(kn->parent->ns_type))
> > + goto out_good;
> > + if (kernfs_info(dentry->d_sb)->ns[kn->parent->ns_type] != kn->ns)
> > + goto out_bad;
> > + }
> >
> > +out_good:
> > mutex_unlock(&kernfs_mutex);
> > return 1;
> > out_bad:
> > @@ -1090,6 +1095,10 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
> > ns = kernfs_info(dir->i_sb)->ns[parent->ns_type];
> >
> > kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> > + if (!kn && kernfs_init_ns_propagates(parent)) {
> > + ns = kernfs_init_ns(parent->ns_type);
> > + kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> > + }
> >
> > /* no such entry */
> > if (!kn || !kernfs_active(kn)) {
> > @@ -1614,6 +1623,8 @@ static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
> > static struct kernfs_node *kernfs_dir_pos(const void *ns,
> > struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
> > {
> > + const void *init_ns;
> > +
> > if (pos) {
> > int valid = kernfs_active(pos) &&
> > pos->parent == parent && hash == pos->hash;
> > @@ -1621,6 +1632,12 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
> > if (!valid)
> > pos = NULL;
> > }
> > +
> > + if (kernfs_init_ns_propagates(parent))
> > + init_ns = kernfs_init_ns(parent->ns_type);
> > + else
> > + init_ns = NULL;
> > +
> > if (!pos && (hash > 1) && (hash < INT_MAX)) {
> > struct rb_node *node = parent->dir.children.rb_node;
> > while (node) {
> > @@ -1635,7 +1652,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
> > }
> > }
> > /* Skip over entries which are dying/dead or in the wrong namespace */
> > - while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
> > + while (pos && (!kernfs_active(pos) || (pos->ns != ns && pos->ns != init_ns))) {
> > struct rb_node *node = rb_next(&pos->rb);
> > if (!node)
> > pos = NULL;
> > @@ -1650,13 +1667,20 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
> > {
> > pos = kernfs_dir_pos(ns, parent, ino, pos);
> > if (pos) {
> > + const void *init_ns;
> > + if (kernfs_init_ns_propagates(parent))
> > + init_ns = kernfs_init_ns(parent->ns_type);
> > + else
> > + init_ns = NULL;
> > +
> > do {
> > struct rb_node *node = rb_next(&pos->rb);
> > if (!node)
> > pos = NULL;
> > else
> > pos = rb_to_kn(node);
> > - } while (pos && (!kernfs_active(pos) || pos->ns != ns));
> > + } while (pos && (!kernfs_active(pos) ||
> > + (pos->ns != ns && pos->ns != init_ns)));
> > }
> > return pos;
> > }
> > diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
> > index 7c972c00f84a..74eb6c447361 100644
> > --- a/fs/kernfs/kernfs-internal.h
> > +++ b/fs/kernfs/kernfs-internal.h
> > @@ -80,6 +80,30 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
> > return d_inode(dentry)->i_private;
> > }
> >
> > +#ifdef CONFIG_NET
> > +extern struct net init_net;
> > +#endif
> > +
> > +extern struct user_namespace init_user_ns;
> > +
> > +static inline const void *kernfs_init_ns(enum kobj_ns_type ns_type)
> > +{
> > + switch (ns_type) {
> > + case KOBJ_NS_TYPE_NET:
> > +#ifdef CONFIG_NET
> > + return &init_net;
> > +#else
> > + break;
> > +#endif
> > + case KOBJ_NS_TYPE_USER:
> > + return &init_user_ns;
> > + default:
> > + pr_debug("Unsupported namespace type %d for kernfs\n", ns_type);
> > + }
> > +
> > + return NULL;
> > +}
> > +
> > extern const struct super_operations kernfs_sops;
> > extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
> >
> > diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
> > index 5e2ec88a709e..99b82a0ae7ea 100644
> > --- a/fs/sysfs/mount.c
> > +++ b/fs/sysfs/mount.c
> > @@ -43,6 +43,8 @@ static void sysfs_fs_context_free(struct fs_context *fc)
> >
> > if (kfc->ns_tag[KOBJ_NS_TYPE_NET])
> > kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag[KOBJ_NS_TYPE_NET]);
> > + if (kfc->ns_tag[KOBJ_NS_TYPE_USER])
> > + kobj_ns_drop(KOBJ_NS_TYPE_USER, kfc->ns_tag[KOBJ_NS_TYPE_USER]);
> > kernfs_free_fs_context(fc);
> > kfree(kfc);
> > }
> > @@ -67,6 +69,7 @@ static int sysfs_init_fs_context(struct fs_context *fc)
> > return -ENOMEM;
> >
> > kfc->ns_tag[KOBJ_NS_TYPE_NET] = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
> > + kfc->ns_tag[KOBJ_NS_TYPE_USER] = kobj_ns_grab_current(KOBJ_NS_TYPE_USER);
>
> It's nice and tidy this way so maybe worth it, but getting
> the kobj_ns_type_lock spinlock twice in a row here seems
> unfortunate.

Let me see if I can do something non-ugly and moderately simple about
this. If not, it's probably fine as it is since it only happens on sysfs
mount.

2020-04-23 16:16:01

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH v2 5/7] loop: preserve sysfs backwards compatibility

On Thu, Apr 23, 2020 at 01:15:24PM +0200, Christian Brauner wrote:
> On Wed, Apr 22, 2020 at 08:17:06PM -0500, Serge Hallyn wrote:
> > On Wed, Apr 22, 2020 at 04:54:35PM +0200, Christian Brauner wrote:
> > > For sysfs the initial namespace is special. All devices currently
> > > propagate into all non-initial namespaces. For example, sysfs is usually
> > > mounted in a privileged or unprivileged container and all devices are
> > > visible to the container but are owned by global root. Even though none
> > > of the propagated files can be used there are still a lot of read-only
> > > values that are accessed or read by tools running in non-initial
> > > namespaces. Some devices though, which can be moved or created in
> > > another namespace, will only show up in the corresponding namespace.
> > > This currently includes network and loop devices but no other ones.
> > > Since all current workloads depend on devices from the inital namespace
> > > being visible this behavior cannot be simply changed. This patch just
> > > makes sure to keep propagating devices that share the same device class
> > > with loop devices from the initial namespaces into all non-initial
> > > namespaces as before. In short, nothing changes only loopfs loop devices
> > > will be shown in their correct namespace.
> > >
> > > Cc: Jens Axboe <[email protected]>
> > > Cc: Tejun Heo <[email protected]>
> > > Cc: Greg Kroah-Hartman <[email protected]>
> > > Signed-off-by: Christian Brauner <[email protected]>
> >
> > Hi,
> >
> > two comments below:
> >
> > > ---
> > > /* v2 */
> > > - Christian Brauner <[email protected]>:
> > > - Protect init_net with a CONFIG_NET ifdef in case it is set to "n".
> > > - As Tejun pointed out there is argument to be made that a new mount
> > > option for sysfs could be added that would change how devices are
> > > propagated. This patch does not prevent this but it is an orthogonal
> > > problem.
> > > ---
> > > block/genhd.c | 79 +++++++++++++++++++++++++++++++++++++
> > > fs/kernfs/dir.c | 34 +++++++++++++---
> > > fs/kernfs/kernfs-internal.h | 24 +++++++++++
> > > fs/sysfs/mount.c | 4 ++
> > > include/linux/genhd.h | 3 ++
> > > include/linux/kernfs.h | 22 +++++++++++
> > > include/linux/kobject_ns.h | 4 ++
> > > lib/kobject.c | 2 +
> > > 8 files changed, 167 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/block/genhd.c b/block/genhd.c
> > > index 06b642b23a07..b5b2601c4311 100644
> > > --- a/block/genhd.c
> > > +++ b/block/genhd.c
> > > @@ -1198,11 +1198,81 @@ static struct kobject *base_probe(dev_t devt, int *partno, void *data)
> > > return NULL;
> > > }
> > >
> > > +#ifdef CONFIG_BLK_DEV_LOOPFS
> > > +static void *user_grab_current_ns(void)
> > > +{
> > > + struct user_namespace *ns = current_user_ns();
> > > + return get_user_ns(ns);
> > > +}
> > > +
> > > +static const void *user_initial_ns(void)
> > > +{
> > > + return &init_user_ns;
> > > +}
> > > +
> > > +static void user_put_ns(void *p)
> > > +{
> > > + struct user_namespace *ns = p;
> > > + put_user_ns(ns);
> > > +}
> > > +
> > > +static bool user_current_may_mount(void)
> > > +{
> > > + return ns_capable(current_user_ns(), CAP_SYS_ADMIN);
> > > +}
> > > +
> > > +const struct kobj_ns_type_operations user_ns_type_operations = {
> > > + .type = KOBJ_NS_TYPE_USER,
> > > + .current_may_mount = user_current_may_mount,
> > > + .grab_current_ns = user_grab_current_ns,
> > > + .initial_ns = user_initial_ns,
> > > + .drop_ns = user_put_ns,
> > > +};
> > > +
> > > +static const void *block_class_user_namespace(struct device *dev)
> > > +{
> > > + struct gendisk *disk;
> > > +
> > > + if (dev->type == &part_type)
> > > + disk = part_to_disk(dev_to_part(dev));
> > > + else
> > > + disk = dev_to_disk(dev);
> > > +
> > > + return disk->user_ns;
> > > +}
> > > +
> > > +static void block_class_get_ownership(struct device *dev, kuid_t *uid, kgid_t *gid)
> > > +{
> > > + struct gendisk *disk;
> > > + struct user_namespace *ns;
> > > +
> > > + if (dev->type == &part_type)
> > > + disk = part_to_disk(dev_to_part(dev));
> > > + else
> > > + disk = dev_to_disk(dev);
> > > +
> > > + ns = disk->user_ns;
> > > + if (ns && ns != &init_user_ns) {
> > > + kuid_t ns_root_uid = make_kuid(ns, 0);
> > > + kgid_t ns_root_gid = make_kgid(ns, 0);
> > > +
> > > + if (uid_valid(ns_root_uid))
> > > + *uid = ns_root_uid;
> > > +
> > > + if (gid_valid(ns_root_gid))
> > > + *gid = ns_root_gid;
> > > + }
> >
> > You're not setting uid and gid in the else case?
>
> Right, the reason being that sysfs and the associated kobject
> infrastructure will always set global root as the default. So the

Oh, ok, I had thought that would be the case but failed to find
it yesterday :) thx

Reviewed-by: Serge Hallyn <[email protected]>

> callchain is:
> kobject_get_ownership()
> and this calls the ktype callbacks which hits
> -> device_get_ownership()
> which calls into the device class specific callbacks which in this is
> case calls block_class_get_ownership().
>
> And there's no direct callers of, say <device-class>->get_ownership()
> that all needs to always go through the callback infrastructure.
>
> >
> > > +}
> > > +#endif /* CONFIG_BLK_DEV_LOOPFS */
> > > +
> > > static int __init genhd_device_init(void)
> > > {
> > > int error;
> > >
> > > block_class.dev_kobj = sysfs_dev_block_kobj;
> > > +#ifdef CONFIG_BLK_DEV_LOOPFS
> > > + kobj_ns_type_register(&user_ns_type_operations);
> > > +#endif
> > > error = class_register(&block_class);
> > > if (unlikely(error))
> > > return error;
> > > @@ -1524,8 +1594,14 @@ static void disk_release(struct device *dev)
> > > blk_put_queue(disk->queue);
> > > kfree(disk);
> > > }
> > > +
> > > struct class block_class = {
> > > .name = "block",
> > > +#ifdef CONFIG_BLK_DEV_LOOPFS
> > > + .ns_type = &user_ns_type_operations,
> > > + .namespace = block_class_user_namespace,
> > > + .get_ownership = block_class_get_ownership,
> > > +#endif
> > > };
> > >
> > > static char *block_devnode(struct device *dev, umode_t *mode,
> > > @@ -1715,6 +1791,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
> > > disk_to_dev(disk)->class = &block_class;
> > > disk_to_dev(disk)->type = &disk_type;
> > > device_initialize(disk_to_dev(disk));
> > > +#ifdef CONFIG_BLK_DEV_LOOPFS
> > > + disk->user_ns = &init_user_ns;
> > > +#endif
> > > }
> > > return disk;
> > > }
> > > diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> > > index 1f2d894ae454..02796ba6521a 100644
> > > --- a/fs/kernfs/dir.c
> > > +++ b/fs/kernfs/dir.c
> > > @@ -575,10 +575,15 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
> > > goto out_bad;
> > >
> > > /* The kernfs node has been moved to a different namespace */
> > > - if (kn->parent && kernfs_ns_enabled(kn->parent) &&
> > > - kernfs_info(dentry->d_sb)->ns[kn->ns_type] != kn->ns)
> > > - goto out_bad;
> > > + if (kn->parent && kernfs_ns_enabled(kn->parent)) {
> > > + if (kernfs_init_ns_propagates(kn->parent) &&
> > > + kn->ns == kernfs_init_ns(kn->parent->ns_type))
> > > + goto out_good;
> > > + if (kernfs_info(dentry->d_sb)->ns[kn->parent->ns_type] != kn->ns)
> > > + goto out_bad;
> > > + }
> > >
> > > +out_good:
> > > mutex_unlock(&kernfs_mutex);
> > > return 1;
> > > out_bad:
> > > @@ -1090,6 +1095,10 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
> > > ns = kernfs_info(dir->i_sb)->ns[parent->ns_type];
> > >
> > > kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> > > + if (!kn && kernfs_init_ns_propagates(parent)) {
> > > + ns = kernfs_init_ns(parent->ns_type);
> > > + kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> > > + }
> > >
> > > /* no such entry */
> > > if (!kn || !kernfs_active(kn)) {
> > > @@ -1614,6 +1623,8 @@ static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
> > > static struct kernfs_node *kernfs_dir_pos(const void *ns,
> > > struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
> > > {
> > > + const void *init_ns;
> > > +
> > > if (pos) {
> > > int valid = kernfs_active(pos) &&
> > > pos->parent == parent && hash == pos->hash;
> > > @@ -1621,6 +1632,12 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
> > > if (!valid)
> > > pos = NULL;
> > > }
> > > +
> > > + if (kernfs_init_ns_propagates(parent))
> > > + init_ns = kernfs_init_ns(parent->ns_type);
> > > + else
> > > + init_ns = NULL;
> > > +
> > > if (!pos && (hash > 1) && (hash < INT_MAX)) {
> > > struct rb_node *node = parent->dir.children.rb_node;
> > > while (node) {
> > > @@ -1635,7 +1652,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
> > > }
> > > }
> > > /* Skip over entries which are dying/dead or in the wrong namespace */
> > > - while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
> > > + while (pos && (!kernfs_active(pos) || (pos->ns != ns && pos->ns != init_ns))) {
> > > struct rb_node *node = rb_next(&pos->rb);
> > > if (!node)
> > > pos = NULL;
> > > @@ -1650,13 +1667,20 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
> > > {
> > > pos = kernfs_dir_pos(ns, parent, ino, pos);
> > > if (pos) {
> > > + const void *init_ns;
> > > + if (kernfs_init_ns_propagates(parent))
> > > + init_ns = kernfs_init_ns(parent->ns_type);
> > > + else
> > > + init_ns = NULL;
> > > +
> > > do {
> > > struct rb_node *node = rb_next(&pos->rb);
> > > if (!node)
> > > pos = NULL;
> > > else
> > > pos = rb_to_kn(node);
> > > - } while (pos && (!kernfs_active(pos) || pos->ns != ns));
> > > + } while (pos && (!kernfs_active(pos) ||
> > > + (pos->ns != ns && pos->ns != init_ns)));
> > > }
> > > return pos;
> > > }
> > > diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
> > > index 7c972c00f84a..74eb6c447361 100644
> > > --- a/fs/kernfs/kernfs-internal.h
> > > +++ b/fs/kernfs/kernfs-internal.h
> > > @@ -80,6 +80,30 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
> > > return d_inode(dentry)->i_private;
> > > }
> > >
> > > +#ifdef CONFIG_NET
> > > +extern struct net init_net;
> > > +#endif
> > > +
> > > +extern struct user_namespace init_user_ns;
> > > +
> > > +static inline const void *kernfs_init_ns(enum kobj_ns_type ns_type)
> > > +{
> > > + switch (ns_type) {
> > > + case KOBJ_NS_TYPE_NET:
> > > +#ifdef CONFIG_NET
> > > + return &init_net;
> > > +#else
> > > + break;
> > > +#endif
> > > + case KOBJ_NS_TYPE_USER:
> > > + return &init_user_ns;
> > > + default:
> > > + pr_debug("Unsupported namespace type %d for kernfs\n", ns_type);
> > > + }
> > > +
> > > + return NULL;
> > > +}
> > > +
> > > extern const struct super_operations kernfs_sops;
> > > extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
> > >
> > > diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
> > > index 5e2ec88a709e..99b82a0ae7ea 100644
> > > --- a/fs/sysfs/mount.c
> > > +++ b/fs/sysfs/mount.c
> > > @@ -43,6 +43,8 @@ static void sysfs_fs_context_free(struct fs_context *fc)
> > >
> > > if (kfc->ns_tag[KOBJ_NS_TYPE_NET])
> > > kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag[KOBJ_NS_TYPE_NET]);
> > > + if (kfc->ns_tag[KOBJ_NS_TYPE_USER])
> > > + kobj_ns_drop(KOBJ_NS_TYPE_USER, kfc->ns_tag[KOBJ_NS_TYPE_USER]);
> > > kernfs_free_fs_context(fc);
> > > kfree(kfc);
> > > }
> > > @@ -67,6 +69,7 @@ static int sysfs_init_fs_context(struct fs_context *fc)
> > > return -ENOMEM;
> > >
> > > kfc->ns_tag[KOBJ_NS_TYPE_NET] = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
> > > + kfc->ns_tag[KOBJ_NS_TYPE_USER] = kobj_ns_grab_current(KOBJ_NS_TYPE_USER);
> >
> > It's nice and tidy this way so maybe worth it, but getting
> > the kobj_ns_type_lock spinlock twice in a row here seems
> > unfortunate.
>
> Let me see if I can do something non-ugly and moderately simple about
> this. If not, it's probably fine as it is since it only happens on sysfs
> mount.