Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752629AbdLSUOX (ORCPT ); Tue, 19 Dec 2017 15:14:23 -0500 Received: from zeniv.linux.org.uk ([195.92.253.2]:43626 "EHLO ZenIV.linux.org.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752606AbdLSUOV (ORCPT ); Tue, 19 Dec 2017 15:14:21 -0500 Date: Tue, 19 Dec 2017 20:14:12 +0000 From: Al Viro To: Giuseppe Scrivano Cc: Andrew Morton , LKML , alexander.deucher@amd.com, broonie@kernel.org, chris@chris-wilson.co.uk, David Miller , deepa.kernel@gmail.com, Greg KH , luc.vanoostenryck@gmail.com, lucien xin , Ingo Molnar , Neil Horman , syzkaller-bugs@googlegroups.com, Vladislav Yasevich Subject: Re: [PATCH linux-next] mqueue: fix IPC namespace use-after-free Message-ID: <20171219201411.GT21978@ZenIV.linux.org.uk> References: <20171219101440.19736-1-gscrivan@redhat.com> <20171219114819.GQ21978@ZenIV.linux.org.uk> <20171219153225.GA14771@ZenIV.linux.org.uk> <874lomhcwb.fsf@redhat.com> <87vah2ftn8.fsf@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <87vah2ftn8.fsf@redhat.com> User-Agent: Mutt/1.9.0 (2017-09-02) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 5954 Lines: 181 On Tue, Dec 19, 2017 at 07:40:43PM +0100, Giuseppe Scrivano wrote: > Giuseppe Scrivano writes: > > > The only issue I've seen with my version is that if I do: > > > > # unshare -im /bin/sh > > # mount -t mqueue mqueue /dev/mqueue > > # touch /dev/mqueue/foo > > # umount /dev/mqueue > > # mount -t mqueue mqueue /dev/mqueue > > > > then /dev/mqueue/foo doesn't exist at this point. Your patch does not > > have this problem and /dev/mqueue/foo is again accessible after the > > second mount. > > although, how much is that of an issue? Is there any other way to delay You do realize that with your patch you would end up with worse than that - mount/touch/umount and now you've got ->mq_sb non-NULL and pointing to freed super_block. With really unpleasant effects when you quit that ipcns... > the cost of kern_mount_data()? Most containers have /dev/mqueue mounted > but it is not really going to be used. _What_ cost? At mount(2) time you are setting the superblock up anyway, so what would you be delaying? kmem_cache_alloc() for struct mount and assignments to its fields? That's noise; if anything, I would expect the main cost with a plenty of containers to be in sget() scanning the list of mqueue superblocks. And we can get rid of that, while we are at it - to hell with mount_ns(), with that approach we can just use mount_nodev() instead. The logics in mq_internal_mount() will deal with multiple instances - if somebody has already triggered creation of internal mount, all subsequent calls in that ipcns will end up avoiding kern_mount_data() entirely. And if you have two callers racing - sure, you will get two superblocks. Not for long, though - the first one to get to setting ->mq_mnt (serialized on mq_lock) wins, the second loses and prompty destroys his vfsmount and superblock. I seriously suspect that variant below would cut down on the cost a whole lot more - as it is, we have the total of O(N^2) spent in the loop inside of sget_userns() when we create N ipcns and mount in each of those; this patch should cut that to O(N)... diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 10b82338415b..e9870b0cda29 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -325,8 +325,9 @@ static struct inode *mqueue_get_inode(struct super_block *sb, static int mqueue_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct ipc_namespace *ns = sb->s_fs_info; + struct ipc_namespace *ns = data; + sb->s_fs_info = ns; sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -343,18 +344,44 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) return 0; } +static struct file_system_type mqueue_fs_type; +/* + * Return value is pinned only by reference in ->mq_mnt; it will + * live until ipcns dies. Caller does not need to drop it. + */ +static struct vfsmount *mq_internal_mount(void) +{ + struct ipc_namespace *ns = current->nsproxy->ipc_ns; + struct vfsmount *m = ns->mq_mnt; + if (m) + return m; + m = kern_mount_data(&mqueue_fs_type, ns); + spin_lock(&mq_lock); + if (unlikely(ns->mq_mnt)) { + spin_unlock(&mq_lock); + if (!IS_ERR(m)) + kern_unmount(m); + return ns->mq_mnt; + } + if (!IS_ERR(m)) + ns->mq_mnt = m; + spin_unlock(&mq_lock); + return m; +} + static struct dentry *mqueue_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct ipc_namespace *ns; - if (flags & MS_KERNMOUNT) { - ns = data; - data = NULL; - } else { - ns = current->nsproxy->ipc_ns; - } - return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super); + struct vfsmount *m; + if (flags & MS_KERNMOUNT) + return mount_nodev(fs_type, flags, data, mqueue_fill_super); + m = mq_internal_mount(); + if (IS_ERR(m)) + return ERR_CAST(m); + atomic_inc(&m->mnt_sb->s_active); + down_write(&m->mnt_sb->s_umount); + return dget(m->mnt_root); } static void init_once(void *foo) @@ -743,13 +770,16 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro, static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, struct mq_attr *attr) { - struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt; - struct dentry *root = mnt->mnt_root; + struct vfsmount *mnt = mq_internal_mount(); + struct dentry *root; struct filename *name; struct path path; int fd, error; int ro; + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + audit_mq_open(oflag, mode, attr); if (IS_ERR(name = getname(u_name))) @@ -760,6 +790,7 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, goto out_putname; ro = mnt_want_write(mnt); /* we'll drop it in any case */ + root = mnt->mnt_root; inode_lock(d_inode(root)); path.dentry = lookup_one_len(name->name, root, strlen(name->name)); if (IS_ERR(path.dentry)) { @@ -1534,28 +1565,26 @@ int mq_init_ns(struct ipc_namespace *ns) ns->mq_msgsize_max = DFLT_MSGSIZEMAX; ns->mq_msg_default = DFLT_MSG; ns->mq_msgsize_default = DFLT_MSGSIZE; + ns->mq_mnt = NULL; - ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns); - if (IS_ERR(ns->mq_mnt)) { - int err = PTR_ERR(ns->mq_mnt); - ns->mq_mnt = NULL; - return err; - } return 0; } void mq_clear_sbinfo(struct ipc_namespace *ns) { - ns->mq_mnt->mnt_sb->s_fs_info = NULL; + if (ns->mq_mnt) + ns->mq_mnt->mnt_sb->s_fs_info = NULL; } void mq_put_mnt(struct ipc_namespace *ns) { - kern_unmount(ns->mq_mnt); + if (ns->mq_mnt) + kern_unmount(ns->mq_mnt); } static int __init init_mqueue_fs(void) { + struct vfsmount *m; int error; mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache", @@ -1577,6 +1606,10 @@ static int __init init_mqueue_fs(void) if (error) goto out_filesystem; + m = kern_mount_data(&mqueue_fs_type, &init_ipc_ns); + if (IS_ERR(m)) + goto out_filesystem; + init_ipc_ns.mq_mnt = m; return 0; out_filesystem: