Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753320AbZAZX3r (ORCPT ); Mon, 26 Jan 2009 18:29:47 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753131AbZAZX3L (ORCPT ); Mon, 26 Jan 2009 18:29:11 -0500 Received: from smtp1.linux-foundation.org ([140.211.169.13]:49877 "EHLO smtp1.linux-foundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753013AbZAZX3J (ORCPT ); Mon, 26 Jan 2009 18:29:09 -0500 Date: Mon, 26 Jan 2009 15:28:39 -0800 From: Andrew Morton To: "Serge E. Hallyn" Cc: linux-kernel@vger.kernel.org, containers@lists.osdl.org Subject: Re: [PATCH 2/3] ipc namespaces: implement support for posix msqueues Message-Id: <20090126152839.b35dbb45.akpm@linux-foundation.org> In-Reply-To: <20090117020332.GB8708@us.ibm.com> References: <20090117020248.GA8615@us.ibm.com> <20090117020332.GB8708@us.ibm.com> X-Mailer: Sylpheed version 2.2.4 (GTK+ 2.8.20; i486-pc-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 9627 Lines: 329 On Fri, 16 Jan 2009 20:03:32 -0600 "Serge E. Hallyn" wrote: > Implement multiple mounts of the mqueue file system, and > link it to usage of CLONE_NEWIPC. > > Each ipc ns has a corresponding mqueuefs superblock. When > a user does clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the > unshare will cause an internal mount of a new mqueuefs sb > linked to the new ipc ns. > > When a user does 'mount -t mqueue mqueue /dev/mqueue', he > mounts the mqueuefs superblock. > > Posix message queues can be worked with both through the > mq_* system calls (see mq_overview(7)), and through the VFS > through the mqueue mount. Any usage of mq_open() and friends > will work with the acting task's ipc namespace. Any actions > through the VFS will work with the mqueuefs in which the > file was created. So if a user doesn't remount mqueuefs > after unshare(CLONE_NEWIPC), mq_open("/ab") will not be > reflected in "ls /dev/mqueue". > > If task a mounts mqueue for ipc_ns:1, then clones task b with > a new ipcns, ipcns:2, and then task a is the last task in > ipc_ns:1 to exit, then (1) ipc_ns:1 will be freed, (2) it's > superblock will live on until task b umounts the corresponding > mqueuefs, and vfs actions will continue to succeed, but (3) > sb->s_fs_info will be NULL for the sb corresponding to the > deceased ipc_ns:1. I suppose that stuff like this should find its way into a manpage one day. That'll be fun for someone. > > ... > > --- a/include/linux/ipc_namespace.h > +++ b/include/linux/ipc_namespace.h > @@ -25,7 +25,7 @@ struct ipc_ids { > }; > > struct ipc_namespace { > - struct kref kref; > + atomic_t count; > struct ipc_ids ids[3]; > > int sem_ctls[4]; > @@ -56,6 +56,7 @@ struct ipc_namespace { > extern struct ipc_namespace init_ipc_ns; > extern atomic_t nr_ipc_ns; > > +extern spinlock_t mq_lock; > #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) > #define INIT_IPC_NS(ns) .ns = &init_ipc_ns, > #else > @@ -75,18 +76,18 @@ extern int ipcns_notify(unsigned long); > #endif /* CONFIG_SYSVIPC */ > > #ifdef CONFIG_POSIX_MQUEUE > -extern void mq_init_ns(struct ipc_namespace *ns); > +extern int mq_init_ns(struct ipc_namespace *ns); > /* default values */ > #define DFLT_QUEUESMAX 256 /* max number of message queues */ > #define DFLT_MSGMAX 10 /* max number of messages in each queue */ > #define HARD_MSGMAX (131072/sizeof(void *)) > #define DFLT_MSGSIZEMAX 8192 /* max message size */ > #else > -#define mq_init_ns(ns) ((void) 0) > +#define mq_init_ns(ns) (0) argh > #endif > > #if defined(CONFIG_IPC_NS) > -extern void free_ipc_ns(struct kref *kref); > +extern void free_ipc_ns(struct ipc_namespace *ns); > extern struct ipc_namespace *copy_ipcs(unsigned long flags, > struct ipc_namespace *ns); > extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, > > ... > > +/* > + * This routine should be called with the mq_lock held. > + */ > +static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode) > +{ > + return get_ipc_ns(inode->i_sb->s_fs_info); > } > > -void mq_exit_ns(struct ipc_namespace *ns) { > - /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */ > - mntput(ns->mq_mnt); > +static inline struct ipc_namespace *get_ns_from_inode(struct inode *inode) > +{ > + struct ipc_namespace *ns; > + > + spin_lock(&mq_lock); > + ns = __get_ns_from_inode(inode); > + spin_unlock(&mq_lock); > + return ns; > } No need for the explicit inlining here. > > ... > > +static int compare_sb_single_ns(struct super_block *sb, void *data) > +{ > + return sb->s_fs_info == data; > +} > + > +static int set_sb_single_ns(struct super_block *sb, void *data) > +{ > + sb->s_fs_info = data; > + return set_anon_super(sb, NULL); > +} > + > +static int get_sb_single_ns(struct file_system_type *fs_type, > + int flags, void *data, > + int (*fill_super)(struct super_block *, void *, int), > + struct vfsmount *mnt) > +{ > + struct super_block *s; > + int error; > + > + s = sget(fs_type, compare_sb_single_ns, set_sb_single_ns, data); > + if (IS_ERR(s)) > + return PTR_ERR(s); > + if (!s->s_root) { > + s->s_flags = flags; > + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); > + if (error) { > + up_write(&s->s_umount); > + deactivate_super(s); > + return error; > + } > + s->s_flags |= MS_ACTIVE; > + } > + do_remount_sb(s, flags, data, 0); > + return simple_set_mnt(mnt, s); > } The above doesn't seem specific to mqueue. Is it in the best place? > static int mqueue_get_sb(struct file_system_type *fs_type, > int flags, const char *dev_name, > void *data, struct vfsmount *mnt) > { > - return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt); > + struct ipc_namespace *ns = data; > + > + if (!(flags & MS_KERNMOUNT)) > + ns = current->nsproxy->ipc_ns; > + return get_sb_single_ns(fs_type, flags, ns, mqueue_fill_super, mnt); > } > > static void init_once(void *foo) > @@ -245,12 +295,13 @@ static void mqueue_delete_inode(struct inode *inode) > struct user_struct *user; > unsigned long mq_bytes; > int i; > - struct ipc_namespace *ipc_ns = &init_ipc_ns; > + struct ipc_namespace *ipc_ns; > > if (S_ISDIR(inode->i_mode)) { > clear_inode(inode); > return; > } > + ipc_ns = get_ns_from_inode(inode); > info = MQUEUE_I(inode); > spin_lock(&info->lock); > for (i = 0; i < info->attr.mq_curmsgs; i++) > @@ -266,10 +317,12 @@ static void mqueue_delete_inode(struct inode *inode) > if (user) { > spin_lock(&mq_lock); > user->mq_bytes -= mq_bytes; > - ipc_ns->mq_queues_count--; > + if (ipc_ns) The reader might be wondering why ipc_ns==NULL is an acceptable state, and what that state actually means. This reader is wondering that. > + ipc_ns->mq_queues_count--; > spin_unlock(&mq_lock); > free_uid(user); > } > + put_ipc_ns(ipc_ns); > } > > > ... > > +int mq_init_ns(struct ipc_namespace *ns) > +{ > + ns->mq_queues_count = 0; > + ns->mq_queues_max = DFLT_QUEUESMAX; > + ns->mq_msg_max = DFLT_MSGMAX; > + ns->mq_msgsize_max = DFLT_MSGSIZEMAX; > + > + ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns); > + if (IS_ERR(ns->mq_mnt)) > + return PTR_ERR(ns->mq_mnt); It seems dangerous to leave an IS_ERR() pointer sitting in ns->mq_mnt for someone else to trip over. > + return 0; > +} > + > +void mq_clear_sbinfo(struct ipc_namespace *ns) > +{ > + ns->mq_mnt->mnt_sb->s_fs_info = NULL; > +} > + > +void mq_put_mnt(struct ipc_namespace *ns) > +{ > + mntput(ns->mq_mnt); > +} > + > static int msg_max_limit_min = MIN_MSGMAX; > static int msg_max_limit_max = MAX_MSGMAX; > > @@ -1284,15 +1367,14 @@ static int __init init_mqueue_fs(void) > if (error) > goto out_sysctl; > > - init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type); > + spin_lock_init(&mq_lock); > + > + init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns); > if (IS_ERR(init_ipc_ns.mq_mnt)) { > error = PTR_ERR(init_ipc_ns.mq_mnt); > goto out_filesystem; > } > > - /* internal initialization - not common for vfs */ > - spin_lock_init(&mq_lock); > - > return 0; > > out_filesystem: > diff --git a/ipc/msgutil.c b/ipc/msgutil.c > index c197cd1..21475b0 100644 > --- a/ipc/msgutil.c > +++ b/ipc/msgutil.c > @@ -18,18 +18,16 @@ > > #include "util.h" > > +spinlock_t mq_lock; DEFINE_SPINLOCK(), please. For lockdep reasons. > > ... > > +/* > + * put_ipc_ns - drop a reference to an ipc namespace. > + * @ns: the namespace to put > + * > + * If this is the last task in the namespace exiting, and > + * it is dropping the refcount to 0, then it can race with > + * a task in another ipc namespace but in a mounts namespace > + * which has this ipcns's mqueuefs mounted, doing some action > + * with one of the mqueuefs files. That can raise the refcount. > + * So dropping the refcount, and raising the refcount when > + * accessing it through the VFS, are protected with mq_lock. > + * > + * (Clearly, a task raising the refcount on its own ipc_ns > + * needn't take mq_lock since it can't race with the last task > + * in the ipcns exiting). > + */ > +void put_ipc_ns(struct ipc_namespace *ns) > { > - struct ipc_namespace *ns; > + if (ns && atomic_dec_and_lock(&ns->count, &mq_lock)) { > + mq_clear_sbinfo(ns); > + spin_unlock(&mq_lock); > + mq_put_mnt(ns); > + free_ipc_ns(ns); > + } > +} Why are we supporting calls with a NULL arg here? > - ns = container_of(kref, struct ipc_namespace, kref); > +void free_ipc_ns(struct ipc_namespace *ns) > +{ > /* > * Unregistering the hotplug notifier at the beginning guarantees > * that the ipc namespace won't be freed while we are inside the > @@ -102,7 +132,6 @@ void free_ipc_ns(struct kref *kref) > sem_exit_ns(ns); > msg_exit_ns(ns); > shm_exit_ns(ns); > - mq_exit_ns(ns); > kfree(ns); > atomic_dec(&nr_ipc_ns); > > diff --git a/ipc/util.h b/ipc/util.h > index 9b6cc33..236e4ed 100644 > --- a/ipc/util.h > +++ b/ipc/util.h > @@ -21,9 +21,11 @@ void shm_init (void); > struct ipc_namespace; > > #ifdef CONFIG_POSIX_MQUEUE > -void mq_exit_ns(struct ipc_namespace *ns); > +extern void mq_clear_sbinfo(struct ipc_namespace *ns); > +extern void mq_put_mnt(struct ipc_namespace *ns); > #else > -#define mq_exit_ns(ns) ((void) 0) > +#define mq_clear_sbinfo(ns) ((void) 0) > +#define mq_put_mnt(ns) ((void) 0) argh. > #endif > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/