Date: Mon, 2 Jun 2008 14:24:59 -0500
From: "Serge E. Hallyn" <serue@us.ibm.com>
To: Benjamin Thery <benjamin.thery@bull.net>
Cc: Andrew Morton <akpm@linux-foundation.org>,
       Greg Kroah-Hartman <gregkh@suse.de>,
       Eric Biederman <ebiederm@xmission.com>, Serge Hallyn <serue@us.ibm.com>,
       linux-kernel@vger.kernel.org, Tejun Heo <htejun@gmail.com>,
       Al Viro <viro@ftp.linux.org.uk>, Daniel Lezcano <dlezcano@fr.ibm.com>
Subject: Re: [PATCH 10/10] sysfs: user namespaces: fix bug with
	clone(CLONE_NEWUSER) with fairsched
Message-ID: <20080602192459.GA18509@us.ibm.com>
References: <20080602134438.224352910@theryb.frec.bull.fr> <20080602134439.953880460@theryb.frec.bull.fr>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20080602134439.953880460@theryb.frec.bull.fr>
User-Agent: Mutt/1.5.17+20080114 (2008-01-14)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 7257
Lines: 234

Quoting Benjamin Thery (benjamin.thery@bull.net):
> Mark the /sys/kernel/uids directory to be tagged so that processes in
> different user namespaces can remount /sys and see their own uid
> listings.
> 
> Without this patch, having CONFIG_FAIR_SCHED=y makes user namespaces
> unusable, because when you 
>   clone(CLONE_NEWUSER)
> it will auto-create the root userid and try to create
> /sys/kernel/uids/0.  Since that already exists from the parent user
> namespace, the create fails, and the clone misleadingly ends up
> returning -ENOMEM.
> 
> This patch fixes the issue by allowing each user namespace to remount
> /sys, and having /sys filter the /sys/kernel/uid/ entries by user
> namespace.
> 
> Signed-off-by: Serge Hallyn <serue@us.ibm.com>
> Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>

Thanks for picking this up, Benjamin.

Eric, please look this one over.  I think I removed everything that
shouldn't be here from the last version, and at this point only
do what I need to to access the user_ns where we need it, and tag
the appropriate /sys/ files.

thanks,
-serge

> ---
>  fs/sysfs/mount.c               |   24 ++++++++++++++++++++++++
>  include/linux/sched.h          |    1 +
>  include/linux/sysfs.h          |    9 +++++++++
>  include/linux/user_namespace.h |    1 +
>  kernel/user.c                  |   21 +++++++++++++++++++++
>  kernel/user_namespace.c        |    3 ++-
>  6 files changed, 58 insertions(+), 1 deletion(-)
> 
> Index: linux-mm/fs/sysfs/mount.c
> ===================================================================
> --- linux-mm.orig/fs/sysfs/mount.c
> +++ linux-mm/fs/sysfs/mount.c
> @@ -81,6 +81,7 @@ static int sysfs_fill_super(struct super
>  	sb->s_root = root;
>  	sb->s_fs_info = info;
>  	info->tag.net_ns = hold_net(current->nsproxy->net_ns);
> +	info->tag.user_ns = current->nsproxy->user_ns;
>  	return 0;
> 
>  out_err:
> @@ -100,6 +101,8 @@ static int sysfs_test_super(struct super
> 
>  	if (task->nsproxy->net_ns != info->tag.net_ns)
>  		found = 0;
> +	if (task->nsproxy->user_ns != info->tag.user_ns)
> +		found = 0;
> 
>  	return found;
>  }
> @@ -214,6 +217,27 @@ static struct pernet_operations sysfs_ne
>  };
>  #endif
> 
> +#ifdef CONFIG_USER_NS
> +void sysfs_userns_exit(struct user_namespace *user_ns)
> +{
> +	/* Allow the net namespace to go away while sysfs is still mounted. */
> +	struct super_block *sb;
> +	printk(KERN_NOTICE "sysfs: user namespace exiting\n");
> +	mutex_lock(&sysfs_rename_mutex);
> +	sysfs_grab_supers();
> +	mutex_lock(&sysfs_mutex);
> +	list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
> +		struct sysfs_super_info *info = sysfs_info(sb);
> +		if (info->tag.user_ns != user_ns)
> +			continue;
> +		info->tag.user_ns = NULL;
> +	}
> +	mutex_unlock(&sysfs_mutex);
> +	sysfs_release_supers();
> +	mutex_unlock(&sysfs_rename_mutex);
> +}
> +#endif
> +
>  int __init sysfs_init(void)
>  {
>  	int err = -ENOMEM;
> Index: linux-mm/include/linux/sched.h
> ===================================================================
> --- linux-mm.orig/include/linux/sched.h
> +++ linux-mm/include/linux/sched.h
> @@ -600,6 +600,7 @@ struct user_struct {
>  	/* Hash table maintenance information */
>  	struct hlist_node uidhash_node;
>  	uid_t uid;
> +	struct user_namespace *user_ns;
> 
>  #ifdef CONFIG_USER_SCHED
>  	struct task_group *tg;
> Index: linux-mm/include/linux/sysfs.h
> ===================================================================
> --- linux-mm.orig/include/linux/sysfs.h
> +++ linux-mm/include/linux/sysfs.h
> @@ -20,6 +20,7 @@
>  struct kobject;
>  struct module;
>  struct net;
> +struct user_namespace;
> 
>  /* FIXME
>   * The *owner field is no longer used, but leave around
> @@ -81,6 +82,7 @@ struct sysfs_ops {
> 
>  struct sysfs_tag_info {
>  	struct net *net_ns;
> +	struct user_namespace *user_ns;
>  };
> 
>  struct sysfs_tagged_dir_operations {
> @@ -138,6 +140,9 @@ int sysfs_enable_tagging(struct kobject 
> 
>  extern int __must_check sysfs_init(void);
> 
> +struct user_namespace;
> +void sysfs_userns_exit(struct user_namespace *user_ns);
> +
>  #else /* CONFIG_SYSFS */
> 
>  static inline int sysfs_schedule_callback(struct kobject *kobj,
> @@ -254,6 +259,10 @@ static inline int __must_check sysfs_ini
>  	return 0;
>  }
> 
> +static inline void sysfs_userns_exit(struct user_namespace *user_ns)
> +{
> +}
> +
>  static inline void sysfs_printk_last_file(void)
>  {
>  }
> Index: linux-mm/include/linux/user_namespace.h
> ===================================================================
> --- linux-mm.orig/include/linux/user_namespace.h
> +++ linux-mm/include/linux/user_namespace.h
> @@ -12,6 +12,7 @@
>  struct user_namespace {
>  	struct kref		kref;
>  	struct hlist_head	uidhash_table[UIDHASH_SZ];
> +	struct kset		*kset;
>  	struct user_struct	*root_user;
>  };
> 
> Index: linux-mm/kernel/user.c
> ===================================================================
> --- linux-mm.orig/kernel/user.c
> +++ linux-mm/kernel/user.c
> @@ -53,6 +53,7 @@ struct user_struct root_user = {
>  	.files		= ATOMIC_INIT(0),
>  	.sigpending	= ATOMIC_INIT(0),
>  	.locked_shm     = 0,
> +	.user_ns	= &init_user_ns,
>  #ifdef CONFIG_USER_SCHED
>  	.tg		= &init_task_group,
>  #endif
> @@ -236,6 +237,23 @@ static void uids_release(struct kobject 
>  	return;
>  }
> 
> +static const void *userns_sb_tag(struct sysfs_tag_info *info)
> +{
> +	return info->user_ns;
> +}
> +
> +static const void *userns_kobject_tag(struct kobject *kobj)
> +{
> +	struct user_struct *up;
> +	up = container_of(kobj, struct user_struct, kobj);
> +	return up->user_ns;
> +}
> +
> +static struct sysfs_tagged_dir_operations userns_tagged_dir_operations = {
> +	.sb_tag = userns_sb_tag,
> +	.kobject_tag = userns_kobject_tag,
> +};
> +
>  static struct kobj_type uids_ktype = {
>  	.sysfs_ops = &kobj_sysfs_ops,
>  	.default_attrs = uids_attributes,
> @@ -272,6 +290,8 @@ int __init uids_sysfs_init(void)
>  	if (!uids_kset)
>  		return -ENOMEM;
> 
> +	sysfs_enable_tagging(&uids_kset->kobj, &userns_tagged_dir_operations);
> +
>  	return uids_user_create(&root_user);
>  }
> 
> @@ -404,6 +424,7 @@ struct user_struct *alloc_uid(struct use
>  			goto out_unlock;
> 
>  		new->uid = uid;
> +		new->user_ns = ns;
>  		atomic_set(&new->__count, 1);
> 
>  		if (sched_create_user(new) < 0)
> Index: linux-mm/kernel/user_namespace.c
> ===================================================================
> --- linux-mm.orig/kernel/user_namespace.c
> +++ linux-mm/kernel/user_namespace.c
> @@ -22,7 +22,7 @@ static struct user_namespace *clone_user
>  	struct user_struct *new_user;
>  	int n;
> 
> -	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
> +	ns = kzalloc(sizeof(struct user_namespace), GFP_KERNEL);
>  	if (!ns)
>  		return ERR_PTR(-ENOMEM);
> 
> @@ -71,6 +71,7 @@ void free_user_ns(struct kref *kref)
>  	struct user_namespace *ns;
> 
>  	ns = container_of(kref, struct user_namespace, kref);
> +	sysfs_userns_exit(ns);
>  	release_uids(ns);
>  	kfree(ns);
>  }
> 
> -- 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/