Date: Tue, 19 Dec 2017 20:14:12 +0000
From: Al Viro <viro@ZenIV.linux.org.uk>
To: Giuseppe Scrivano <gscrivan@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
        LKML <linux-kernel@vger.kernel.org>, alexander.deucher@amd.com,
        broonie@kernel.org, chris@chris-wilson.co.uk,
        David Miller <davem@davemloft.net>, deepa.kernel@gmail.com,
        Greg KH <gregkh@linuxfoundation.org>, luc.vanoostenryck@gmail.com,
        lucien xin <lucien.xin@gmail.com>, Ingo Molnar <mingo@kernel.org>,
        Neil Horman <nhorman@tuxdriver.com>, syzkaller-bugs@googlegroups.com,
        Vladislav Yasevich <vyasevich@gmail.com>
Subject: Re: [PATCH linux-next] mqueue: fix IPC namespace use-after-free
Message-ID: <20171219201411.GT21978@ZenIV.linux.org.uk>
References: <20171219101440.19736-1-gscrivan@redhat.com>
 <20171219114819.GQ21978@ZenIV.linux.org.uk>
 <20171219153225.GA14771@ZenIV.linux.org.uk>
 <874lomhcwb.fsf@redhat.com>
 <87vah2ftn8.fsf@redhat.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <87vah2ftn8.fsf@redhat.com>
User-Agent: Mutt/1.9.0 (2017-09-02)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 5954
Lines: 181

On Tue, Dec 19, 2017 at 07:40:43PM +0100, Giuseppe Scrivano wrote:
> Giuseppe Scrivano <gscrivan@redhat.com> writes:
> 
> > The only issue I've seen with my version is that if I do:
> >
> > # unshare -im /bin/sh
> > # mount -t mqueue mqueue /dev/mqueue
> > # touch /dev/mqueue/foo
> > # umount /dev/mqueue
> > # mount -t mqueue mqueue /dev/mqueue
> >
> > then /dev/mqueue/foo doesn't exist at this point.  Your patch does not
> > have this problem and /dev/mqueue/foo is again accessible after the
> > second mount.
> 
> although, how much is that of an issue?  Is there any other way to delay

You do realize that with your patch you would end up with worse than that -
mount/touch/umount and now you've got ->mq_sb non-NULL and pointing to freed
super_block.  With really unpleasant effects when you quit that ipcns...

> the cost of kern_mount_data()?  Most containers have /dev/mqueue mounted
> but it is not really going to be used.

_What_ cost?  At mount(2) time you are setting the superblock up anyway, so
what would you be delaying?  kmem_cache_alloc() for struct mount and assignments
to its fields?  That's noise; if anything, I would expect the main cost with
a plenty of containers to be in sget() scanning the list of mqueue superblocks.
And we can get rid of that, while we are at it - to hell with mount_ns(), with
that approach we can just use mount_nodev() instead.  The logics in
mq_internal_mount() will deal with multiple instances - if somebody has already
triggered creation of internal mount, all subsequent calls in that ipcns will
end up avoiding kern_mount_data() entirely.  And if you have two callers
racing - sure, you will get two superblocks.  Not for long, though - the first
one to get to setting ->mq_mnt (serialized on mq_lock) wins, the second loses
and prompty destroys his vfsmount and superblock.  I seriously suspect that
variant below would cut down on the cost a whole lot more - as it is, we have
the total of O(N^2) spent in the loop inside of sget_userns() when we create
N ipcns and mount in each of those; this patch should cut that to O(N)...

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 10b82338415b..e9870b0cda29 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -325,8 +325,9 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *inode;
-	struct ipc_namespace *ns = sb->s_fs_info;
+	struct ipc_namespace *ns = data;
 
+	sb->s_fs_info = ns;
 	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
@@ -343,18 +344,44 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
+static struct file_system_type mqueue_fs_type;
+/*
+ * Return value is pinned only by reference in ->mq_mnt; it will
+ * live until ipcns dies.  Caller does not need to drop it.
+ */
+static struct vfsmount *mq_internal_mount(void)
+{
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+	struct vfsmount *m = ns->mq_mnt;
+	if (m)
+		return m;
+	m = kern_mount_data(&mqueue_fs_type, ns);
+	spin_lock(&mq_lock);
+	if (unlikely(ns->mq_mnt)) {
+		spin_unlock(&mq_lock);
+		if (!IS_ERR(m))
+			kern_unmount(m);
+		return ns->mq_mnt;
+	}
+	if (!IS_ERR(m))
+		ns->mq_mnt = m;
+	spin_unlock(&mq_lock);
+	return m;
+}
+
 static struct dentry *mqueue_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name,
 			 void *data)
 {
-	struct ipc_namespace *ns;
-	if (flags & MS_KERNMOUNT) {
-		ns = data;
-		data = NULL;
-	} else {
-		ns = current->nsproxy->ipc_ns;
-	}
-	return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super);
+	struct vfsmount *m;
+	if (flags & MS_KERNMOUNT)
+		return mount_nodev(fs_type, flags, data, mqueue_fill_super);
+	m = mq_internal_mount();
+	if (IS_ERR(m))
+		return ERR_CAST(m);
+	atomic_inc(&m->mnt_sb->s_active);
+	down_write(&m->mnt_sb->s_umount);
+	return dget(m->mnt_root);
 }
 
 static void init_once(void *foo)
@@ -743,13 +770,16 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro,
 static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
 		      struct mq_attr *attr)
 {
-	struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
-	struct dentry *root = mnt->mnt_root;
+	struct vfsmount *mnt = mq_internal_mount();
+	struct dentry *root;
 	struct filename *name;
 	struct path path;
 	int fd, error;
 	int ro;
 
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
 	audit_mq_open(oflag, mode, attr);
 
 	if (IS_ERR(name = getname(u_name)))
@@ -760,6 +790,7 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
 		goto out_putname;
 
 	ro = mnt_want_write(mnt);	/* we'll drop it in any case */
+	root = mnt->mnt_root;
 	inode_lock(d_inode(root));
 	path.dentry = lookup_one_len(name->name, root, strlen(name->name));
 	if (IS_ERR(path.dentry)) {
@@ -1534,28 +1565,26 @@ int mq_init_ns(struct ipc_namespace *ns)
 	ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
 	ns->mq_msg_default   = DFLT_MSG;
 	ns->mq_msgsize_default  = DFLT_MSGSIZE;
+	ns->mq_mnt = NULL;
 
-	ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
-	if (IS_ERR(ns->mq_mnt)) {
-		int err = PTR_ERR(ns->mq_mnt);
-		ns->mq_mnt = NULL;
-		return err;
-	}
 	return 0;
 }
 
 void mq_clear_sbinfo(struct ipc_namespace *ns)
 {
-	ns->mq_mnt->mnt_sb->s_fs_info = NULL;
+	if (ns->mq_mnt)
+		ns->mq_mnt->mnt_sb->s_fs_info = NULL;
 }
 
 void mq_put_mnt(struct ipc_namespace *ns)
 {
-	kern_unmount(ns->mq_mnt);
+	if (ns->mq_mnt)
+		kern_unmount(ns->mq_mnt);
 }
 
 static int __init init_mqueue_fs(void)
 {
+	struct vfsmount *m;
 	int error;
 
 	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
@@ -1577,6 +1606,10 @@ static int __init init_mqueue_fs(void)
 	if (error)
 		goto out_filesystem;
 
+	m = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
+	if (IS_ERR(m))
+		goto out_filesystem;
+	init_ipc_ns.mq_mnt = m;
 	return 0;
 
 out_filesystem: