LinuxLists.cc - [Patch 0/3] posix mqueue namespace (v14)

2009-01-17 02:03:00

Subject: [Patch 0/3] posix mqueue namespace (v14)

IPC namespaces are completely disjoint id->object mappings.
A task can pass CLONE_NEWIPC to unshare and clone to get
a new, empty, IPC namespace. Until now this has supported
only SYSV IPC.

Most Posix IPC is done in userspace. The posix mqueue
support, however, is implemented on top of the mqueue fs.

This patchset implements multiple mqueue fs instances,
one per IPC namespace to be precise.

To create a new ipc namespace with posix mq support, you
should now:

unshare(CLONE_NEWIPC|CLONE_NEWNS);
umount /dev/mqueue
mount -t mqueue mqueue /dev/mqueue

It's perfectly valid to do vfs operations on files
in another ipc_namespace's /dev/mqueue, but any use
of mq_open(3) and friends will act in your own ipc_ns.
After the ipc namespace has exited, you can still
unlink but no longer create files in that fs (since
accounting is carried in the ipc namespace).

Changelog:
v14: (Jan 16 2009) port to linux-next
v13: (Dec 28 2008)
1. addressed comments by Dave and Suka
2. ported Cedric's patch to make posix mq sysctls
per-namespace

When convenient, it would be great to see this tested
in -mm.

thanks,
-serge

2009-01-17 02:03:40

by Serge E. Hallyn

[permalink] [raw]

Subject: [PATCH 1/3] mqueue ns: move mqueue_mnt into struct ipc_namespace

Move mqueue vfsmount plus a few tunables into the
ipc_namespace struct. The CONFIG_IPC_NS boolean
and the ipc_namespace struct will serve both the
posix message queue namespaces and the SYSV ipc
namespaces.

Changelog:
Dec 18: fix dummy fn #defines

Signed-off-by: Cedric Le Goater <[email protected]>
Signed-off-by: Serge E. Hallyn <[email protected]>
---
include/linux/ipc_namespace.h | 32 ++++++++++--
init/Kconfig | 4 +-
ipc/mqueue.c | 116 ++++++++++++++++++++++-------------------
ipc/msgutil.c | 21 +++++++
ipc/namespace.c | 2 +
ipc/util.c | 9 ---
ipc/util.h | 15 +++++
7 files changed, 131 insertions(+), 68 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index ea330f9..2994eac 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -44,24 +44,48 @@ struct ipc_namespace {
int shm_tot;

struct notifier_block ipcns_nb;
+
+ struct vfsmount *mq_mnt;
+ unsigned int mq_queues_count;
+ unsigned int mq_queues_max;
+ unsigned int mq_msg_max;
+ unsigned int mq_msgsize_max;
+
};

extern struct ipc_namespace init_ipc_ns;
extern atomic_t nr_ipc_ns;

-#ifdef CONFIG_SYSVIPC
+#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
#define INIT_IPC_NS(ns) .ns = &init_ipc_ns,
+#else
+#define INIT_IPC_NS(ns)
+#endif

+#ifdef CONFIG_SYSVIPC
extern int register_ipcns_notifier(struct ipc_namespace *);
extern int cond_register_ipcns_notifier(struct ipc_namespace *);
extern void unregister_ipcns_notifier(struct ipc_namespace *);
extern int ipcns_notify(unsigned long);
-
#else /* CONFIG_SYSVIPC */
-#define INIT_IPC_NS(ns)
+#define register_ipcns_notifier(ns) (0)
+#define cond_register_ipcns_notifier(ns) (0)
+#define unregister_ipcns_notifier(ns) ((void) 0)
+#define ipcns_notify(l) (0)
#endif /* CONFIG_SYSVIPC */

-#if defined(CONFIG_SYSVIPC) && defined(CONFIG_IPC_NS)
+#ifdef CONFIG_POSIX_MQUEUE
+extern void mq_init_ns(struct ipc_namespace *ns);
+/* default values */
+#define DFLT_QUEUESMAX 256 /* max number of message queues */
+#define DFLT_MSGMAX 10 /* max number of messages in each queue */
+#define HARD_MSGMAX (131072/sizeof(void *))
+#define DFLT_MSGSIZEMAX 8192 /* max message size */
+#else
+#define mq_init_ns(ns) ((void) 0)
+#endif
+
+#if defined(CONFIG_IPC_NS)
extern void free_ipc_ns(struct kref *kref);
extern struct ipc_namespace *copy_ipcs(unsigned long flags,
struct ipc_namespace *ns);
diff --git a/init/Kconfig b/init/Kconfig
index 0e99247..8084385 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -517,10 +517,10 @@ config UTS_NS

config IPC_NS
bool "IPC namespace"
- depends on NAMESPACES && SYSVIPC
+ depends on NAMESPACES && (SYSVIPC || POSIX_MQUEUE)
help
In this namespace tasks work with IPC ids which correspond to
- different IPC objects in different namespaces
+ different IPC objects in different namespaces.

config USER_NS
bool "User namespace (EXPERIMENTAL)"
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 54b4077..50c0658 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -31,6 +31,7 @@
#include <linux/mutex.h>
#include <linux/nsproxy.h>
#include <linux/pid.h>
+#include <linux/ipc_namespace.h>

#include <net/sock.h>
#include "util.h"
@@ -46,12 +47,6 @@
#define STATE_PENDING 1
#define STATE_READY 2

-/* default values */
-#define DFLT_QUEUESMAX 256 /* max number of message queues */
-#define DFLT_MSGMAX 10 /* max number of messages in each queue */
-#define HARD_MSGMAX (131072/sizeof(void*))
-#define DFLT_MSGSIZEMAX 8192 /* max message size */
-
/*
* Define the ranges various user-specified maximum values can
* be set to.
@@ -95,12 +90,6 @@ static void remove_notification(struct mqueue_inode_info *info);

static spinlock_t mq_lock;
static struct kmem_cache *mqueue_inode_cachep;
-static struct vfsmount *mqueue_mnt;
-
-static unsigned int queues_count;
-static unsigned int queues_max = DFLT_QUEUESMAX;
-static unsigned int msg_max = DFLT_MSGMAX;
-static unsigned int msgsize_max = DFLT_MSGSIZEMAX;

static struct ctl_table_header * mq_sysctl_table;

@@ -109,11 +98,25 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
return container_of(inode, struct mqueue_inode_info, vfs_inode);
}

+void mq_init_ns(struct ipc_namespace *ns) {
+ ns->mq_queues_count = 0;
+ ns->mq_queues_max = DFLT_QUEUESMAX;
+ ns->mq_msg_max = DFLT_MSGMAX;
+ ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
+ ns->mq_mnt = mntget(init_ipc_ns.mq_mnt);
+}
+
+void mq_exit_ns(struct ipc_namespace *ns) {
+ /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */
+ mntput(ns->mq_mnt);
+}
+
static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
struct mq_attr *attr)
{
struct user_struct *u = current_user();
struct inode *inode;
+ struct ipc_namespace *ipc_ns = &init_ipc_ns;

inode = new_inode(sb);
if (inode) {
@@ -141,8 +144,8 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
info->qsize = 0;
info->user = NULL; /* set when all is ok */
memset(&info->attr, 0, sizeof(info->attr));
- info->attr.mq_maxmsg = msg_max;
- info->attr.mq_msgsize = msgsize_max;
+ info->attr.mq_maxmsg = ipc_ns->mq_msg_max;
+ info->attr.mq_msgsize = ipc_ns->mq_msgsize_max;
if (attr) {
info->attr.mq_maxmsg = attr->mq_maxmsg;
info->attr.mq_msgsize = attr->mq_msgsize;
@@ -242,6 +245,7 @@ static void mqueue_delete_inode(struct inode *inode)
struct user_struct *user;
unsigned long mq_bytes;
int i;
+ struct ipc_namespace *ipc_ns = &init_ipc_ns;

if (S_ISDIR(inode->i_mode)) {
clear_inode(inode);
@@ -262,7 +266,7 @@ static void mqueue_delete_inode(struct inode *inode)
if (user) {
spin_lock(&mq_lock);
user->mq_bytes -= mq_bytes;
- queues_count--;
+ ipc_ns->mq_queues_count--;
spin_unlock(&mq_lock);
free_uid(user);
}
@@ -274,20 +278,22 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
struct inode *inode;
struct mq_attr *attr = dentry->d_fsdata;
int error;
+ struct ipc_namespace *ipc_ns = &init_ipc_ns;

spin_lock(&mq_lock);
- if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) {
+ if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
+ !capable(CAP_SYS_RESOURCE)) {
error = -ENOSPC;
goto out_lock;
}
- queues_count++;
+ ipc_ns->mq_queues_count++;
spin_unlock(&mq_lock);

inode = mqueue_get_inode(dir->i_sb, mode, attr);
if (!inode) {
error = -ENOMEM;
spin_lock(&mq_lock);
- queues_count--;
+ ipc_ns->mq_queues_count--;
goto out_lock;
}

@@ -562,7 +568,7 @@ static void remove_notification(struct mqueue_inode_info *info)
info->notify_owner = NULL;
}

-static int mq_attr_ok(struct mq_attr *attr)
+static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
{
if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
return 0;
@@ -570,8 +576,8 @@ static int mq_attr_ok(struct mq_attr *attr)
if (attr->mq_maxmsg > HARD_MSGMAX)
return 0;
} else {
- if (attr->mq_maxmsg > msg_max ||
- attr->mq_msgsize > msgsize_max)
+ if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||
+ attr->mq_msgsize > ipc_ns->mq_msgsize_max)
return 0;
}
/* check for overflow */
@@ -587,8 +593,9 @@ static int mq_attr_ok(struct mq_attr *attr)
/*
* Invoked when creating a new queue via sys_mq_open
*/
-static struct file *do_create(struct dentry *dir, struct dentry *dentry,
- int oflag, mode_t mode, struct mq_attr *attr)
+static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir,
+ struct dentry *dentry, int oflag, mode_t mode,
+ struct mq_attr *attr)
{
const struct cred *cred = current_cred();
struct file *result;
@@ -596,14 +603,14 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,

if (attr) {
ret = -EINVAL;
- if (!mq_attr_ok(attr))
+ if (!mq_attr_ok(ipc_ns, attr))
goto out;
/* store for use during create */
dentry->d_fsdata = attr;
}

mode &= ~current->fs->umask;
- ret = mnt_want_write(mqueue_mnt);
+ ret = mnt_want_write(ipc_ns->mq_mnt);
if (ret)
goto out;
ret = vfs_create(dir->d_inode, dentry, mode, NULL);
@@ -611,24 +618,25 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
if (ret)
goto out_drop_write;

- result = dentry_open(dentry, mqueue_mnt, oflag, cred);
+ result = dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
/*
* dentry_open() took a persistent mnt_want_write(),
* so we can now drop this one.
*/
- mnt_drop_write(mqueue_mnt);
+ mnt_drop_write(ipc_ns->mq_mnt);
return result;

out_drop_write:
- mnt_drop_write(mqueue_mnt);
+ mnt_drop_write(ipc_ns->mq_mnt);
out:
dput(dentry);
- mntput(mqueue_mnt);
+ mntput(ipc_ns->mq_mnt);
return ERR_PTR(ret);
}

/* Opens existing queue */
-static struct file *do_open(struct dentry *dentry, int oflag)
+static struct file *do_open(struct ipc_namespace *ipc_ns,
+ struct dentry *dentry, int oflag)
{
const struct cred *cred = current_cred();

@@ -637,17 +645,17 @@ static struct file *do_open(struct dentry *dentry, int oflag)

if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) {
dput(dentry);
- mntput(mqueue_mnt);
+ mntput(ipc_ns->mq_mnt);
return ERR_PTR(-EINVAL);
}

if (inode_permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE])) {
dput(dentry);
- mntput(mqueue_mnt);
+ mntput(ipc_ns->mq_mnt);
return ERR_PTR(-EACCES);
}

- return dentry_open(dentry, mqueue_mnt, oflag, cred);
+ return dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
}

SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
@@ -658,6 +666,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
char *name;
struct mq_attr attr;
int fd, error;
+ struct ipc_namespace *ipc_ns = &init_ipc_ns;

if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
return -EFAULT;
@@ -671,13 +680,13 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
if (fd < 0)
goto out_putname;

- mutex_lock(&mqueue_mnt->mnt_root->d_inode->i_mutex);
- dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
+ mutex_lock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
+ dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
if (IS_ERR(dentry)) {
error = PTR_ERR(dentry);
goto out_err;
}
- mntget(mqueue_mnt);
+ mntget(ipc_ns->mq_mnt);

if (oflag & O_CREAT) {
if (dentry->d_inode) { /* entry already exists */
@@ -685,10 +694,10 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
error = -EEXIST;
if (oflag & O_EXCL)
goto out;
- filp = do_open(dentry, oflag);
+ filp = do_open(ipc_ns, dentry, oflag);
} else {
- filp = do_create(mqueue_mnt->mnt_root, dentry,
- oflag, mode,
+ filp = do_create(ipc_ns, ipc_ns->mq_mnt->mnt_root,
+ dentry, oflag, mode,
u_attr ? &attr : NULL);
}
} else {
@@ -696,7 +705,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
if (!dentry->d_inode)
goto out;
audit_inode(name, dentry);
- filp = do_open(dentry, oflag);
+ filp = do_open(ipc_ns, dentry, oflag);
}

if (IS_ERR(filp)) {
@@ -709,13 +718,13 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,

out:
dput(dentry);
- mntput(mqueue_mnt);
+ mntput(ipc_ns->mq_mnt);
out_putfd:
put_unused_fd(fd);
out_err:
fd = error;
out_upsem:
- mutex_unlock(&mqueue_mnt->mnt_root->d_inode->i_mutex);
+ mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
out_putname:
putname(name);
return fd;
@@ -727,14 +736,15 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
char *name;
struct dentry *dentry;
struct inode *inode = NULL;
+ struct ipc_namespace *ipc_ns = &init_ipc_ns;

name = getname(u_name);
if (IS_ERR(name))
return PTR_ERR(name);

- mutex_lock_nested(&mqueue_mnt->mnt_root->d_inode->i_mutex,
+ mutex_lock_nested(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex,
I_MUTEX_PARENT);
- dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
+ dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock;
@@ -748,16 +758,16 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
inode = dentry->d_inode;
if (inode)
atomic_inc(&inode->i_count);
- err = mnt_want_write(mqueue_mnt);
+ err = mnt_want_write(ipc_ns->mq_mnt);
if (err)
goto out_err;
err = vfs_unlink(dentry->d_parent->d_inode, dentry);
- mnt_drop_write(mqueue_mnt);
+ mnt_drop_write(ipc_ns->mq_mnt);
out_err:
dput(dentry);

out_unlock:
- mutex_unlock(&mqueue_mnt->mnt_root->d_inode->i_mutex);
+ mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
putname(name);
if (inode)
iput(inode);
@@ -1212,14 +1222,14 @@ static int msg_maxsize_limit_max = MAX_MSGSIZEMAX;
static ctl_table mq_sysctls[] = {
{
.procname = "queues_max",
- .data = &queues_max,
+ .data = &init_ipc_ns.mq_queues_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.procname = "msg_max",
- .data = &msg_max,
+ .data = &init_ipc_ns.mq_msg_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
@@ -1228,7 +1238,7 @@ static ctl_table mq_sysctls[] = {
},
{
.procname = "msgsize_max",
- .data = &msgsize_max,
+ .data = &init_ipc_ns.mq_msgsize_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
@@ -1274,13 +1284,13 @@ static int __init init_mqueue_fs(void)
if (error)
goto out_sysctl;

- if (IS_ERR(mqueue_mnt = kern_mount(&mqueue_fs_type))) {
- error = PTR_ERR(mqueue_mnt);
+ init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type);
+ if (IS_ERR(init_ipc_ns.mq_mnt)) {
+ error = PTR_ERR(init_ipc_ns.mq_mnt);
goto out_filesystem;
}

/* internal initialization - not common for vfs */
- queues_count = 0;
spin_lock_init(&mq_lock);

return 0;
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index c82c215..c197cd1 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -13,10 +13,31 @@
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/ipc.h>
+#include <linux/ipc_namespace.h>
#include <asm/uaccess.h>

#include "util.h"

+/*
+ * The next 2 defines are here bc this is the only file
+ * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE
+ * and not CONFIG_IPC_NS.
+ */
+struct ipc_namespace init_ipc_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+#ifdef CONFIG_POSIX_MQUEUE
+ .mq_mnt = NULL,
+ .mq_queues_count = 0,
+ .mq_queues_max = DFLT_QUEUESMAX,
+ .mq_msg_max = DFLT_MSGMAX,
+ .mq_msgsize_max = DFLT_MSGSIZEMAX,
+#endif
+};
+
+atomic_t nr_ipc_ns = ATOMIC_INIT(1);
+
struct msg_msgseg {
struct msg_msgseg* next;
/* the next part of the message follows immediately */
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 9171d94..4b4dc6d 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -25,6 +25,7 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
sem_init_ns(ns);
msg_init_ns(ns);
shm_init_ns(ns);
+ mq_init_ns(ns);

/*
* msgmni has already been computed for the new ipc ns.
@@ -101,6 +102,7 @@ void free_ipc_ns(struct kref *kref)
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
+ mq_exit_ns(ns);
kfree(ns);
atomic_dec(&nr_ipc_ns);

diff --git a/ipc/util.c b/ipc/util.c
index 7585a72..b8e4ba9 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -47,15 +47,6 @@ struct ipc_proc_iface {
int (*show)(struct seq_file *, void *);
};

-struct ipc_namespace init_ipc_ns = {
- .kref = {
- .refcount = ATOMIC_INIT(2),
- },
-};
-
-atomic_t nr_ipc_ns = ATOMIC_INIT(1);
-
-
#ifdef CONFIG_MEMORY_HOTPLUG

static void ipc_memory_notifier(struct work_struct *work)
diff --git a/ipc/util.h b/ipc/util.h
index 3646b45..9b6cc33 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -20,6 +20,13 @@ void shm_init (void);

struct ipc_namespace;

+#ifdef CONFIG_POSIX_MQUEUE
+void mq_exit_ns(struct ipc_namespace *ns);
+#else
+#define mq_exit_ns(ns) ((void) 0)
+#endif
+
+#ifdef CONFIG_SYSVIPC
void sem_init_ns(struct ipc_namespace *ns);
void msg_init_ns(struct ipc_namespace *ns);
void shm_init_ns(struct ipc_namespace *ns);
@@ -27,6 +34,14 @@ void shm_init_ns(struct ipc_namespace *ns);
void sem_exit_ns(struct ipc_namespace *ns);
void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
+#else
+#define sem_init_ns(ns) ((void) 0)
+#define msg_init_ns(ns) ((void) 0)
+#define shm_init_ns(ns) ((void) 0)
+#define sem_exit_ns(ns) ((void) 0)
+#define msg_exit_ns(ns) ((void) 0)
+#define shm_exit_ns(ns) ((void) 0)
+#endif

/*
* Structure that holds the parameters needed by the ipc operations
--
1.5.4.3

2009-01-17 02:04:01

by Serge E. Hallyn

[permalink] [raw]

Subject: [PATCH 2/3] ipc namespaces: implement support for posix msqueues

Implement multiple mounts of the mqueue file system, and
link it to usage of CLONE_NEWIPC.

Each ipc ns has a corresponding mqueuefs superblock. When
a user does clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the
unshare will cause an internal mount of a new mqueuefs sb
linked to the new ipc ns.

When a user does 'mount -t mqueue mqueue /dev/mqueue', he
mounts the mqueuefs superblock of his current ipc namespace.

Posix message queues can be worked with both through the
mq_* system calls (see mq_overview(7)), and through the VFS
via the mqueue mount. Any usage of mq_open() and friends
will work with the acting task's ipc namespace. Any actions
through the VFS will work with the mqueuefs in which the
file was created. So if a user doesn't remount mqueuefs
after unshare(CLONE_NEWIPC), mq_open("/ab") will not be
reflected in "ls /dev/mqueue".

If task a mounts mqueue for ipc_ns:1, then clones task b with
a new ipcns, ipcns:2, and then task a is the last task in
ipc_ns:1 to exit, then (1) ipc_ns:1 will be freed, (2) its
superblock will live on until task b umounts the corresponding
mqueuefs, and vfs actions will continue to succeed, but (3)
sb->s_fs_info will be NULL for the sb corresponding to the
deceased ipc_ns:1.

To make this happen, we must protect the ipc reference count
when
a. a task exits and drops its ipcns->count, since
it might be dropping it to 0 and freeing the ipcns
b. a task accesses the ipcns through its mqueuefs
interface, since it bumps the ipcns refcount and
might race with the last task in the ipcns exiting.
So the kref is changed to an atomic_t so we can use
atomic_dec_and_lock(&ns->count, &mq_lock), and every access to
the ipcns through ns = mqueuefs_sb->s_fs_info is protected by
the same lock.

Changelog:
Dec 19: Commented put_ipc_ns (As per Dave Hansen comment)
Addressed nits (per Sukadev Bhattiprolu comment)
Dec 17: removed unused static fn (get_ipcns_from_sb)

Signed-off-by: Cedric Le Goater <[email protected]>
Signed-off-by: Serge E. Hallyn <[email protected]>
---
include/linux/ipc_namespace.h | 16 ++---
ipc/mqueue.c | 140 ++++++++++++++++++++++++++++++++---------
ipc/msgutil.c | 8 +--
ipc/namespace.c | 41 ++++++++++--
ipc/util.h | 6 +-
5 files changed, 160 insertions(+), 51 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 2994eac..a73bbef 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -25,7 +25,7 @@ struct ipc_ids {
};

struct ipc_namespace {
- struct kref kref;
+ atomic_t count;
struct ipc_ids ids[3];

int sem_ctls[4];
@@ -56,6 +56,7 @@ struct ipc_namespace {
extern struct ipc_namespace init_ipc_ns;
extern atomic_t nr_ipc_ns;

+extern spinlock_t mq_lock;
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
#define INIT_IPC_NS(ns) .ns = &init_ipc_ns,
#else
@@ -75,18 +76,18 @@ extern int ipcns_notify(unsigned long);
#endif /* CONFIG_SYSVIPC */

#ifdef CONFIG_POSIX_MQUEUE
-extern void mq_init_ns(struct ipc_namespace *ns);
+extern int mq_init_ns(struct ipc_namespace *ns);
/* default values */
#define DFLT_QUEUESMAX 256 /* max number of message queues */
#define DFLT_MSGMAX 10 /* max number of messages in each queue */
#define HARD_MSGMAX (131072/sizeof(void *))
#define DFLT_MSGSIZEMAX 8192 /* max message size */
#else
-#define mq_init_ns(ns) ((void) 0)
+#define mq_init_ns(ns) (0)
#endif

#if defined(CONFIG_IPC_NS)
-extern void free_ipc_ns(struct kref *kref);
+extern void free_ipc_ns(struct ipc_namespace *ns);
extern struct ipc_namespace *copy_ipcs(unsigned long flags,
struct ipc_namespace *ns);
extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
@@ -96,14 +97,11 @@ extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
if (ns)
- kref_get(&ns->kref);
+ atomic_inc(&ns->count);
return ns;
}

-static inline void put_ipc_ns(struct ipc_namespace *ns)
-{
- kref_put(&ns->kref, free_ipc_ns);
-}
+extern void put_ipc_ns(struct ipc_namespace *ns);
#else
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
struct ipc_namespace *ns)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 50c0658..b51a6d3 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -88,7 +88,6 @@ static const struct file_operations mqueue_file_operations;
static struct super_operations mqueue_super_ops;
static void remove_notification(struct mqueue_inode_info *info);

-static spinlock_t mq_lock;
static struct kmem_cache *mqueue_inode_cachep;

static struct ctl_table_header * mq_sysctl_table;
@@ -98,25 +97,30 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
return container_of(inode, struct mqueue_inode_info, vfs_inode);
}

-void mq_init_ns(struct ipc_namespace *ns) {
- ns->mq_queues_count = 0;
- ns->mq_queues_max = DFLT_QUEUESMAX;
- ns->mq_msg_max = DFLT_MSGMAX;
- ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
- ns->mq_mnt = mntget(init_ipc_ns.mq_mnt);
+/*
+ * This routine should be called with the mq_lock held.
+ */
+static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
+{
+ return get_ipc_ns(inode->i_sb->s_fs_info);
}

-void mq_exit_ns(struct ipc_namespace *ns) {
- /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */
- mntput(ns->mq_mnt);
+static inline struct ipc_namespace *get_ns_from_inode(struct inode *inode)
+{
+ struct ipc_namespace *ns;
+
+ spin_lock(&mq_lock);
+ ns = __get_ns_from_inode(inode);
+ spin_unlock(&mq_lock);
+ return ns;
}

-static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
- struct mq_attr *attr)
+static struct inode *mqueue_get_inode(struct super_block *sb,
+ struct ipc_namespace *ipc_ns, int mode,
+ struct mq_attr *attr)
{
struct user_struct *u = current_user();
struct inode *inode;
- struct ipc_namespace *ipc_ns = &init_ipc_ns;

inode = new_inode(sb);
if (inode) {
@@ -191,30 +195,76 @@ out_inode:
static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
+ struct ipc_namespace *ns = data;
+ int error = 0;

sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = MQUEUE_MAGIC;
sb->s_op = &mqueue_super_ops;

- inode = mqueue_get_inode(sb, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
- if (!inode)
- return -ENOMEM;
+ inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO,
+ NULL);
+ if (!inode) {
+ error = -ENOMEM;
+ goto out;
+ }

sb->s_root = d_alloc_root(inode);
if (!sb->s_root) {
iput(inode);
- return -ENOMEM;
+ error = -ENOMEM;
}

- return 0;
+out:
+ return error;
+}
+
+static int compare_sb_single_ns(struct super_block *sb, void *data)
+{
+ return sb->s_fs_info == data;
+}
+
+static int set_sb_single_ns(struct super_block *sb, void *data)
+{
+ sb->s_fs_info = data;
+ return set_anon_super(sb, NULL);
+}
+
+static int get_sb_single_ns(struct file_system_type *fs_type,
+ int flags, void *data,
+ int (*fill_super)(struct super_block *, void *, int),
+ struct vfsmount *mnt)
+{
+ struct super_block *s;
+ int error;
+
+ s = sget(fs_type, compare_sb_single_ns, set_sb_single_ns, data);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+ if (!s->s_root) {
+ s->s_flags = flags;
+ error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ up_write(&s->s_umount);
+ deactivate_super(s);
+ return error;
+ }
+ s->s_flags |= MS_ACTIVE;
+ }
+ do_remount_sb(s, flags, data, 0);
+ return simple_set_mnt(mnt, s);
}

static int mqueue_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name,
void *data, struct vfsmount *mnt)
{
- return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
+ struct ipc_namespace *ns = data;
+
+ if (!(flags & MS_KERNMOUNT))
+ ns = current->nsproxy->ipc_ns;
+ return get_sb_single_ns(fs_type, flags, ns, mqueue_fill_super, mnt);
}

static void init_once(void *foo)
@@ -245,12 +295,13 @@ static void mqueue_delete_inode(struct inode *inode)
struct user_struct *user;
unsigned long mq_bytes;
int i;
- struct ipc_namespace *ipc_ns = &init_ipc_ns;
+ struct ipc_namespace *ipc_ns;

if (S_ISDIR(inode->i_mode)) {
clear_inode(inode);
return;
}
+ ipc_ns = get_ns_from_inode(inode);
info = MQUEUE_I(inode);
spin_lock(&info->lock);
for (i = 0; i < info->attr.mq_curmsgs; i++)
@@ -266,10 +317,12 @@ static void mqueue_delete_inode(struct inode *inode)
if (user) {
spin_lock(&mq_lock);
user->mq_bytes -= mq_bytes;
- ipc_ns->mq_queues_count--;
+ if (ipc_ns)
+ ipc_ns->mq_queues_count--;
spin_unlock(&mq_lock);
free_uid(user);
}
+ put_ipc_ns(ipc_ns);
}

static int mqueue_create(struct inode *dir, struct dentry *dentry,
@@ -278,9 +331,14 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
struct inode *inode;
struct mq_attr *attr = dentry->d_fsdata;
int error;
- struct ipc_namespace *ipc_ns = &init_ipc_ns;
+ struct ipc_namespace *ipc_ns;

spin_lock(&mq_lock);
+ ipc_ns = __get_ns_from_inode(dir);
+ if (!ipc_ns) {
+ error = -EACCES;
+ goto out_lock;
+ }
if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
!capable(CAP_SYS_RESOURCE)) {
error = -ENOSPC;
@@ -289,7 +347,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
ipc_ns->mq_queues_count++;
spin_unlock(&mq_lock);

- inode = mqueue_get_inode(dir->i_sb, mode, attr);
+ inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
if (!inode) {
error = -ENOMEM;
spin_lock(&mq_lock);
@@ -297,6 +355,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
goto out_lock;
}

+ put_ipc_ns(ipc_ns);
dir->i_size += DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;

@@ -305,6 +364,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
return 0;
out_lock:
spin_unlock(&mq_lock);
+ put_ipc_ns(ipc_ns);
return error;
}

@@ -666,7 +726,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
char *name;
struct mq_attr attr;
int fd, error;
- struct ipc_namespace *ipc_ns = &init_ipc_ns;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;

if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
return -EFAULT;
@@ -736,7 +796,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
char *name;
struct dentry *dentry;
struct inode *inode = NULL;
- struct ipc_namespace *ipc_ns = &init_ipc_ns;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;

name = getname(u_name);
if (IS_ERR(name))
@@ -1213,6 +1273,29 @@ static struct file_system_type mqueue_fs_type = {
.kill_sb = kill_litter_super,
};

+int mq_init_ns(struct ipc_namespace *ns)
+{
+ ns->mq_queues_count = 0;
+ ns->mq_queues_max = DFLT_QUEUESMAX;
+ ns->mq_msg_max = DFLT_MSGMAX;
+ ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
+
+ ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
+ if (IS_ERR(ns->mq_mnt))
+ return PTR_ERR(ns->mq_mnt);
+ return 0;
+}
+
+void mq_clear_sbinfo(struct ipc_namespace *ns)
+{
+ ns->mq_mnt->mnt_sb->s_fs_info = NULL;
+}
+
+void mq_put_mnt(struct ipc_namespace *ns)
+{
+ mntput(ns->mq_mnt);
+}
+
static int msg_max_limit_min = MIN_MSGMAX;
static int msg_max_limit_max = MAX_MSGMAX;

@@ -1284,15 +1367,14 @@ static int __init init_mqueue_fs(void)
if (error)
goto out_sysctl;

- init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type);
+ spin_lock_init(&mq_lock);
+
+ init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
if (IS_ERR(init_ipc_ns.mq_mnt)) {
error = PTR_ERR(init_ipc_ns.mq_mnt);
goto out_filesystem;
}

- /* internal initialization - not common for vfs */
- spin_lock_init(&mq_lock);
-
return 0;

out_filesystem:
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index c197cd1..21475b0 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -18,18 +18,16 @@

#include "util.h"

+spinlock_t mq_lock;
+
/*
* The next 2 defines are here bc this is the only file
* compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE
* and not CONFIG_IPC_NS.
*/
struct ipc_namespace init_ipc_ns = {
- .kref = {
- .refcount = ATOMIC_INIT(2),
- },
+ .count = ATOMIC_INIT(2),
#ifdef CONFIG_POSIX_MQUEUE
- .mq_mnt = NULL,
- .mq_queues_count = 0,
.mq_queues_max = DFLT_QUEUESMAX,
.mq_msg_max = DFLT_MSGMAX,
.mq_msgsize_max = DFLT_MSGSIZEMAX,
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 4b4dc6d..e3bbf56 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -9,23 +9,31 @@
#include <linux/rcupdate.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mount.h>

#include "util.h"

static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
{
struct ipc_namespace *ns;
+ int err;

ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
if (ns == NULL)
return ERR_PTR(-ENOMEM);

+ atomic_set(&ns->count, 1);
+ err = mq_init_ns(ns);
+ if (err) {
+ kfree(ns);
+ return ERR_PTR(err);
+ }
atomic_inc(&nr_ipc_ns);

sem_init_ns(ns);
msg_init_ns(ns);
shm_init_ns(ns);
- mq_init_ns(ns);

/*
* msgmni has already been computed for the new ipc ns.
@@ -35,7 +43,6 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
ipcns_notify(IPCNS_CREATED);
register_ipcns_notifier(ns);

- kref_init(&ns->kref);
return ns;
}

@@ -85,11 +92,34 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
up_write(&ids->rw_mutex);
}

-void free_ipc_ns(struct kref *kref)
+/*
+ * put_ipc_ns - drop a reference to an ipc namespace.
+ * @ns: the namespace to put
+ *
+ * If this is the last task in the namespace exiting, and
+ * it is dropping the refcount to 0, then it can race with
+ * a task in another ipc namespace but in a mounts namespace
+ * which has this ipcns's mqueuefs mounted, doing some action
+ * with one of the mqueuefs files. That can raise the refcount.
+ * So dropping the refcount, and raising the refcount when
+ * accessing it through the VFS, are protected with mq_lock.
+ *
+ * (Clearly, a task raising the refcount on its own ipc_ns
+ * needn't take mq_lock since it can't race with the last task
+ * in the ipcns exiting).
+ */
+void put_ipc_ns(struct ipc_namespace *ns)
{
- struct ipc_namespace *ns;
+ if (ns && atomic_dec_and_lock(&ns->count, &mq_lock)) {
+ mq_clear_sbinfo(ns);
+ spin_unlock(&mq_lock);
+ mq_put_mnt(ns);
+ free_ipc_ns(ns);
+ }
+}

- ns = container_of(kref, struct ipc_namespace, kref);
+void free_ipc_ns(struct ipc_namespace *ns)
+{
/*
* Unregistering the hotplug notifier at the beginning guarantees
* that the ipc namespace won't be freed while we are inside the
@@ -102,7 +132,6 @@ void free_ipc_ns(struct kref *kref)
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
- mq_exit_ns(ns);
kfree(ns);
atomic_dec(&nr_ipc_ns);

diff --git a/ipc/util.h b/ipc/util.h
index 9b6cc33..236e4ed 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -21,9 +21,11 @@ void shm_init (void);
struct ipc_namespace;

#ifdef CONFIG_POSIX_MQUEUE
-void mq_exit_ns(struct ipc_namespace *ns);
+extern void mq_clear_sbinfo(struct ipc_namespace *ns);
+extern void mq_put_mnt(struct ipc_namespace *ns);
#else
-#define mq_exit_ns(ns) ((void) 0)
+#define mq_clear_sbinfo(ns) ((void) 0)
+#define mq_put_mnt(ns) ((void) 0)
#endif

#ifdef CONFIG_SYSVIPC
--
1.5.4.3

2009-01-17 02:05:37

by Serge E. Hallyn

[permalink] [raw]

Subject: [PATCH 3/3] mqueue namespace: adapt sysctl

Largely inspired by ipc/ipc_sysctl.c. This patch isolates the mqueue
sysctl stuff in its own file.

Signed-off-by: Cedric Le Goater <[email protected]>
Signed-off-by: Nadia Derbey <[email protected]>
Signed-off-by: Serge E. Hallyn <[email protected]>
---
include/linux/ipc_namespace.h | 14 +++++
init/Kconfig | 6 ++
ipc/Makefile | 1 +
ipc/mq_sysctl.c | 125 +++++++++++++++++++++++++++++++++++++++++
ipc/mqueue.c | 65 +---------------------
5 files changed, 147 insertions(+), 64 deletions(-)
create mode 100644 ipc/mq_sysctl.c

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index a73bbef..9f57f55 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -121,4 +121,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
{
}
#endif
+
+#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
+
+struct ctl_table_header;
+extern struct ctl_table_header *mq_register_sysctl_table(void);
+
+#else /* CONFIG_POSIX_MQUEUE_SYSCTL */
+
+static inline struct ctl_table_header *mq_register_sysctl_table(void)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
#endif
diff --git a/init/Kconfig b/init/Kconfig
index 8084385..f926c10 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -148,6 +148,12 @@ config POSIX_MQUEUE

If unsure, say Y.

+config POSIX_MQUEUE_SYSCTL
+ bool
+ depends on POSIX_MQUEUE
+ depends on SYSCTL
+ default y
+
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
help
diff --git a/ipc/Makefile b/ipc/Makefile
index 65c3843..4e1955e 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -8,4 +8,5 @@ obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o
obj_mq-$(CONFIG_COMPAT) += compat_mq.o
obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
obj-$(CONFIG_IPC_NS) += namespace.o
+obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o

diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
new file mode 100644
index 0000000..81f01b7
--- /dev/null
+++ b/ipc/mq_sysctl.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2007 IBM Corporation
+ *
+ * Author: Cedric Le Goater <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+#include <linux/sysctl.h>
+
+/*
+ * Define the ranges various user-specified maximum values can
+ * be set to.
+ */
+#define MIN_MSGMAX 1 /* min value for msg_max */
+#define MAX_MSGMAX HARD_MSGMAX /* max value for msg_max */
+#define MIN_MSGSIZEMAX 128 /* min value for msgsize_max */
+#define MAX_MSGSIZEMAX (8192*128) /* max value for msgsize_max */
+
+static void *get_mq(ctl_table *table)
+{
+ char *which = table->data;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
+ return which;
+}
+
+#ifdef CONFIG_PROC_FS
+static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table mq_table;
+ memcpy(&mq_table, table, sizeof(mq_table));
+ mq_table.data = get_mq(table);
+
+ return proc_dointvec(&mq_table, write, filp, buffer, lenp, ppos);
+}
+
+static int proc_mq_dointvec_minmax(ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table mq_table;
+ memcpy(&mq_table, table, sizeof(mq_table));
+ mq_table.data = get_mq(table);
+
+ return proc_dointvec_minmax(&mq_table, write, filp, buffer,
+ lenp, ppos);
+}
+#else /* CONFIG_PROC_FS */
+static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+static int proc_mq_dointvec_minmax(ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int msg_max_limit_min = MIN_MSGMAX;
+static int msg_max_limit_max = MAX_MSGMAX;
+
+static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
+static int msg_maxsize_limit_max = MAX_MSGSIZEMAX;
+
+static ctl_table mq_sysctls[] = {
+ {
+ .procname = "queues_max",
+ .data = &init_ipc_ns.mq_queues_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_mq_dointvec,
+ },
+ {
+ .procname = "msg_max",
+ .data = &init_ipc_ns.mq_msg_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_mq_dointvec_minmax,
+ .extra1 = &msg_max_limit_min,
+ .extra2 = &msg_max_limit_max,
+ },
+ {
+ .procname = "msgsize_max",
+ .data = &init_ipc_ns.mq_msgsize_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_mq_dointvec_minmax,
+ .extra1 = &msg_maxsize_limit_min,
+ .extra2 = &msg_maxsize_limit_max,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table mq_sysctl_dir[] = {
+ {
+ .procname = "mqueue",
+ .mode = 0555,
+ .child = mq_sysctls,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table mq_sysctl_root[] = {
+ {
+ .ctl_name = CTL_FS,
+ .procname = "fs",
+ .mode = 0555,
+ .child = mq_sysctl_dir,
+ },
+ { .ctl_name = 0 }
+};
+
+struct ctl_table_header *mq_register_sysctl_table(void)
+{
+ return register_sysctl_table(mq_sysctl_root);
+}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index b51a6d3..5fc8d57 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -47,15 +47,6 @@
#define STATE_PENDING 1
#define STATE_READY 2

-/*
- * Define the ranges various user-specified maximum values can
- * be set to.
- */
-#define MIN_MSGMAX 1 /* min value for msg_max */
-#define MAX_MSGMAX HARD_MSGMAX /* max value for msg_max */
-#define MIN_MSGSIZEMAX 128 /* min value for msgsize_max */
-#define MAX_MSGSIZEMAX (8192*128) /* max value for msgsize_max */
-
struct ext_wait_queue { /* queue of sleeping tasks */
struct task_struct *task;
struct list_head list;
@@ -1296,60 +1287,6 @@ void mq_put_mnt(struct ipc_namespace *ns)
mntput(ns->mq_mnt);
}

-static int msg_max_limit_min = MIN_MSGMAX;
-static int msg_max_limit_max = MAX_MSGMAX;
-
-static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
-static int msg_maxsize_limit_max = MAX_MSGSIZEMAX;
-
-static ctl_table mq_sysctls[] = {
- {
- .procname = "queues_max",
- .data = &init_ipc_ns.mq_queues_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .procname = "msg_max",
- .data = &init_ipc_ns.mq_msg_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .extra1 = &msg_max_limit_min,
- .extra2 = &msg_max_limit_max,
- },
- {
- .procname = "msgsize_max",
- .data = &init_ipc_ns.mq_msgsize_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .extra1 = &msg_maxsize_limit_min,
- .extra2 = &msg_maxsize_limit_max,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table mq_sysctl_dir[] = {
- {
- .procname = "mqueue",
- .mode = 0555,
- .child = mq_sysctls,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table mq_sysctl_root[] = {
- {
- .ctl_name = CTL_FS,
- .procname = "fs",
- .mode = 0555,
- .child = mq_sysctl_dir,
- },
- { .ctl_name = 0 }
-};
-
static int __init init_mqueue_fs(void)
{
int error;
@@ -1361,7 +1298,7 @@ static int __init init_mqueue_fs(void)
return -ENOMEM;

/* ignore failues - they are not fatal */
- mq_sysctl_table = register_sysctl_table(mq_sysctl_root);
+ mq_sysctl_table = mq_register_sysctl_table();

error = register_filesystem(&mqueue_fs_type);
if (error)
--
1.5.4.3

2009-01-26 23:29:18

by Andrew Morton

[permalink] [raw]

Subject: Re: [Patch 0/3] posix mqueue namespace (v14)

On Fri, 16 Jan 2009 20:02:48 -0600
"Serge E. Hallyn" <[email protected]> wrote:

> IPC namespaces are completely disjoint id->object mappings.
> A task can pass CLONE_NEWIPC to unshare and clone to get
> a new, empty, IPC namespace. Until now this has supported
> SYSV IPC.
>
> Most Posix IPC is done in userspace. The posix mqueue
> support, however, is implemented on top of the mqueue fs.
>
> This patchset implements multiple mqueue fs instances,
> one per IPC namespace to be precise.
>
> To create a new ipc namespace with posix mq support, you
> should now:
>
> unshare(CLONE_NEWIPC|CLONE_NEWNS);
> umount /dev/mqueue
> mount -t mqueue mqueue /dev/mqueue
>
> It's perfectly valid to do vfs operations on files
> in another ipc_namespace's /dev/mqueue, but any use
> of mq_open(3) and friends will act in your own ipc_ns.
> After the ipc namespace has exited, you can still
> unlink but no longer create files in that fs (since
> accounting is carried.
>
> Changelog:
> v14: (Jan 16 2009) port to linux-next
> v13: (Dec 28 2009)
> 1. addressed comments by Dave and Suka
> 2. ported Cedric's patch to make posix mq sysctls
> per-namespace
>
> When convenient, it would be great to see this tested
> in -mm.

hm. Who is going to test it?

2009-01-26 23:29:32

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH 1/3] mqueue ns: move mqueue_mnt into struct ipc_namespace

On Fri, 16 Jan 2009 20:03:22 -0600
"Serge E. Hallyn" <[email protected]> wrote:

> Move mqueue vfsmount plus a few tunables into the
> ipc_namespace struct. The CONFIG_IPC_NS boolean
> and the ipc_namespace struct will serve both the
> posix message queue namespaces and the SYSV ipc
> namespaces.
>
>
> ...
>
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -44,24 +44,48 @@ struct ipc_namespace {
> int shm_tot;
>
> struct notifier_block ipcns_nb;
> +
> + struct vfsmount *mq_mnt;
> + unsigned int mq_queues_count;
> + unsigned int mq_queues_max;
> + unsigned int mq_msg_max;
> + unsigned int mq_msgsize_max;
> +
> };

Some documentation for the new fields would be nice. It should mention
the locking protocol for those fields.

> extern struct ipc_namespace init_ipc_ns;
> extern atomic_t nr_ipc_ns;
>
> -#ifdef CONFIG_SYSVIPC
> +#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
> #define INIT_IPC_NS(ns) .ns = &init_ipc_ns,
> +#else
> +#define INIT_IPC_NS(ns)
> +#endif
>
> +#ifdef CONFIG_SYSVIPC
> extern int register_ipcns_notifier(struct ipc_namespace *);
> extern int cond_register_ipcns_notifier(struct ipc_namespace *);
> extern void unregister_ipcns_notifier(struct ipc_namespace *);
> extern int ipcns_notify(unsigned long);
> -
> #else /* CONFIG_SYSVIPC */
> -#define INIT_IPC_NS(ns)
> +#define register_ipcns_notifier(ns) (0)
> +#define cond_register_ipcns_notifier(ns) (0)
> +#define unregister_ipcns_notifier(ns) ((void) 0)
> +#define ipcns_notify(l) (0)

Can we use inlines here? We get typechecking..

> #endif /* CONFIG_SYSVIPC */
>
> -#if defined(CONFIG_SYSVIPC) && defined(CONFIG_IPC_NS)
> +#ifdef CONFIG_POSIX_MQUEUE
> +extern void mq_init_ns(struct ipc_namespace *ns);
> +/* default values */
> +#define DFLT_QUEUESMAX 256 /* max number of message queues */
> +#define DFLT_MSGMAX 10 /* max number of messages in each queue */
> +#define HARD_MSGMAX (131072/sizeof(void *))
> +#define DFLT_MSGSIZEMAX 8192 /* max message size */
> +#else
> +#define mq_init_ns(ns) ((void) 0)
> +#endif
> +
>
> ...
>
> +void mq_init_ns(struct ipc_namespace *ns) {
> + ns->mq_queues_count = 0;
> + ns->mq_queues_max = DFLT_QUEUESMAX;
> + ns->mq_msg_max = DFLT_MSGMAX;
> + ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
> + ns->mq_mnt = mntget(init_ipc_ns.mq_mnt);
> +}
> +
> +void mq_exit_ns(struct ipc_namespace *ns) {
> + /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */
> + mntput(ns->mq_mnt);
> +}

You might want to ask checkpatch about this, please.

> static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
> struct mq_attr *attr)
> {
> struct user_struct *u = current_user();
> struct inode *inode;
> + struct ipc_namespace *ipc_ns = &init_ipc_ns;
>
> inode = new_inode(sb);
> if (inode) {
> @@ -141,8 +144,8 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
> info->qsize = 0;
> info->user = NULL; /* set when all is ok */
> memset(&info->attr, 0, sizeof(info->attr));
> - info->attr.mq_maxmsg = msg_max;
> - info->attr.mq_msgsize = msgsize_max;
> + info->attr.mq_maxmsg = ipc_ns->mq_msg_max;
> + info->attr.mq_msgsize = ipc_ns->mq_msgsize_max;
> if (attr) {
> info->attr.mq_maxmsg = attr->mq_maxmsg;
> info->attr.mq_msgsize = attr->mq_msgsize;
> @@ -242,6 +245,7 @@ static void mqueue_delete_inode(struct inode *inode)
> struct user_struct *user;
> unsigned long mq_bytes;
> int i;
> + struct ipc_namespace *ipc_ns = &init_ipc_ns;
>
> if (S_ISDIR(inode->i_mode)) {
> clear_inode(inode);
> @@ -262,7 +266,7 @@ static void mqueue_delete_inode(struct inode *inode)
> if (user) {
> spin_lock(&mq_lock);
> user->mq_bytes -= mq_bytes;
> - queues_count--;
> + ipc_ns->mq_queues_count--;

ah-hah. mq_lock!

> spin_unlock(&mq_lock);
> free_uid(user);
> }
> @@ -274,20 +278,22 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
> struct inode *inode;
> struct mq_attr *attr = dentry->d_fsdata;
> int error;
> + struct ipc_namespace *ipc_ns = &init_ipc_ns;
>
> spin_lock(&mq_lock);
> - if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) {
> + if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
> + !capable(CAP_SYS_RESOURCE)) {
> error = -ENOSPC;
> goto out_lock;
> }
> - queues_count++;
> + ipc_ns->mq_queues_count++;
> spin_unlock(&mq_lock);
>
> inode = mqueue_get_inode(dir->i_sb, mode, attr);
> if (!inode) {
> error = -ENOMEM;
> spin_lock(&mq_lock);
> - queues_count--;
> + ipc_ns->mq_queues_count--;
> goto out_lock;

hm. That should have been called out_unlock, conventionally.

> }
>
> @@ -562,7 +568,7 @@ static void remove_notification(struct mqueue_inode_info *info)
> info->notify_owner = NULL;
> }
>
>
> ...
>
> @@ -1212,14 +1222,14 @@ static int msg_maxsize_limit_max = MAX_MSGSIZEMAX;
> static ctl_table mq_sysctls[] = {
> {
> .procname = "queues_max",
> - .data = &queues_max,
> + .data = &init_ipc_ns.mq_queues_max,
> .maxlen = sizeof(int),
> .mode = 0644,
> .proc_handler = &proc_dointvec,
> },
> {
> .procname = "msg_max",
> - .data = &msg_max,
> + .data = &init_ipc_ns.mq_msg_max,
> .maxlen = sizeof(int),
> .mode = 0644,
> .proc_handler = &proc_dointvec_minmax,
> @@ -1228,7 +1238,7 @@ static ctl_table mq_sysctls[] = {
> },
> {
> .procname = "msgsize_max",
> - .data = &msgsize_max,
> + .data = &init_ipc_ns.mq_msgsize_max,
> .maxlen = sizeof(int),
> .mode = 0644,
> .proc_handler = &proc_dointvec_minmax,

So the sysctl has new semantics. Some description of the design
decisions here would be appropriate.

If I'm running in IPC namespace "foo" and I modify msg_max, it seems
that this will alter the tunable on the top-level namespace but will
leave my namespace unaltered. Surprise?

(comes back after reading patch #3)

Ah. This seems to have been secretly fixed in that patch?

>
> ...
>
> +/*
> + * The next 2 defines are here bc this is the only file
> + * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE
> + * and not CONFIG_IPC_NS.
> + */
> +struct ipc_namespace init_ipc_ns = {
> + .kref = {
> + .refcount = ATOMIC_INIT(2),

2? How come? Needs a comment?

> + },
> +#ifdef CONFIG_POSIX_MQUEUE
> + .mq_mnt = NULL,
> + .mq_queues_count = 0,
> + .mq_queues_max = DFLT_QUEUESMAX,
> + .mq_msg_max = DFLT_MSGMAX,
> + .mq_msgsize_max = DFLT_MSGSIZEMAX,
> +#endif
> +};
> +
> +atomic_t nr_ipc_ns = ATOMIC_INIT(1);
> +
> struct msg_msgseg {
> struct msg_msgseg* next;
> /* the next part of the message follows immediately */
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index 9171d94..4b4dc6d 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -25,6 +25,7 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
> sem_init_ns(ns);
> msg_init_ns(ns);
> shm_init_ns(ns);
> + mq_init_ns(ns);
>
> /*
> * msgmni has already been computed for the new ipc ns.
> @@ -101,6 +102,7 @@ void free_ipc_ns(struct kref *kref)
> sem_exit_ns(ns);
> msg_exit_ns(ns);
> shm_exit_ns(ns);
> + mq_exit_ns(ns);
> kfree(ns);
> atomic_dec(&nr_ipc_ns);
>
> diff --git a/ipc/util.c b/ipc/util.c
> index 7585a72..b8e4ba9 100644
> --- a/ipc/util.c
> +++ b/ipc/util.c
> @@ -47,15 +47,6 @@ struct ipc_proc_iface {
> int (*show)(struct seq_file *, void *);
> };
>
> -struct ipc_namespace init_ipc_ns = {
> - .kref = {
> - .refcount = ATOMIC_INIT(2),

OK, I missed that last time ;)

> - },
> -};
> -
> -atomic_t nr_ipc_ns = ATOMIC_INIT(1);
> -
> -
> #ifdef CONFIG_MEMORY_HOTPLUG
>
> static void ipc_memory_notifier(struct work_struct *work)
> diff --git a/ipc/util.h b/ipc/util.h
> index 3646b45..9b6cc33 100644
> --- a/ipc/util.h
> +++ b/ipc/util.h
> @@ -20,6 +20,13 @@ void shm_init (void);
>
> struct ipc_namespace;
>
> +#ifdef CONFIG_POSIX_MQUEUE
> +void mq_exit_ns(struct ipc_namespace *ns);
> +#else
> +#define mq_exit_ns(ns) ((void) 0)
> +#endif
> +
> +#ifdef CONFIG_SYSVIPC
> void sem_init_ns(struct ipc_namespace *ns);
> void msg_init_ns(struct ipc_namespace *ns);
> void shm_init_ns(struct ipc_namespace *ns);
> @@ -27,6 +34,14 @@ void shm_init_ns(struct ipc_namespace *ns);
> void sem_exit_ns(struct ipc_namespace *ns);
> void msg_exit_ns(struct ipc_namespace *ns);
> void shm_exit_ns(struct ipc_namespace *ns);
> +#else
> +#define sem_init_ns(ns) ((void) 0)
> +#define msg_init_ns(ns) ((void) 0)
> +#define shm_init_ns(ns) ((void) 0)
> +#define sem_exit_ns(ns) ((void) 0)
> +#define msg_exit_ns(ns) ((void) 0)
> +#define shm_exit_ns(ns) ((void) 0)

Again, could be written in C.

> +#endif

2009-01-26 23:29:47

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH 2/3] ipc namespaces: implement support for posix msqueues

On Fri, 16 Jan 2009 20:03:32 -0600
"Serge E. Hallyn" <[email protected]> wrote:

> Implement multiple mounts of the mqueue file system, and
> link it to usage of CLONE_NEWIPC.
>
> Each ipc ns has a corresponding mqueuefs superblock. When
> a user does clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the
> unshare will cause an internal mount of a new mqueuefs sb
> linked to the new ipc ns.
>
> When a user does 'mount -t mqueue mqueue /dev/mqueue', he
> mounts the mqueuefs superblock.
>
> Posix message queues can be worked with both through the
> mq_* system calls (see mq_overview(7)), and through the VFS
> through the mqueue mount. Any usage of mq_open() and friends
> will work with the acting task's ipc namespace. Any actions
> through the VFS will work with the mqueuefs in which the
> file was created. So if a user doesn't remount mqueuefs
> after unshare(CLONE_NEWIPC), mq_open("/ab") will not be
> reflected in "ls /dev/mqueue".
>
> If task a mounts mqueue for ipc_ns:1, then clones task b with
> a new ipcns, ipcns:2, and then task a is the last task in
> ipc_ns:1 to exit, then (1) ipc_ns:1 will be freed, (2) it's
> superblock will live on until task b umounts the corresponding
> mqueuefs, and vfs actions will continue to succeed, but (3)
> sb->s_fs_info will be NULL for the sb corresponding to the
> deceased ipc_ns:1.

I suppose that stuff like this should find its way into a manpage one
day. That'll be fun for someone.

>
> ...
>
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -25,7 +25,7 @@ struct ipc_ids {
> };
>
> struct ipc_namespace {
> - struct kref kref;
> + atomic_t count;
> struct ipc_ids ids[3];
>
> int sem_ctls[4];
> @@ -56,6 +56,7 @@ struct ipc_namespace {
> extern struct ipc_namespace init_ipc_ns;
> extern atomic_t nr_ipc_ns;
>
> +extern spinlock_t mq_lock;
> #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
> #define INIT_IPC_NS(ns) .ns = &init_ipc_ns,
> #else
> @@ -75,18 +76,18 @@ extern int ipcns_notify(unsigned long);
> #endif /* CONFIG_SYSVIPC */
>
> #ifdef CONFIG_POSIX_MQUEUE
> -extern void mq_init_ns(struct ipc_namespace *ns);
> +extern int mq_init_ns(struct ipc_namespace *ns);
> /* default values */
> #define DFLT_QUEUESMAX 256 /* max number of message queues */
> #define DFLT_MSGMAX 10 /* max number of messages in each queue */
> #define HARD_MSGMAX (131072/sizeof(void *))
> #define DFLT_MSGSIZEMAX 8192 /* max message size */
> #else
> -#define mq_init_ns(ns) ((void) 0)
> +#define mq_init_ns(ns) (0)

argh

> #endif
>
> #if defined(CONFIG_IPC_NS)
> -extern void free_ipc_ns(struct kref *kref);
> +extern void free_ipc_ns(struct ipc_namespace *ns);
> extern struct ipc_namespace *copy_ipcs(unsigned long flags,
> struct ipc_namespace *ns);
> extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
>
> ...
>
> +/*
> + * This routine should be called with the mq_lock held.
> + */
> +static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
> +{
> + return get_ipc_ns(inode->i_sb->s_fs_info);
> }
>
> -void mq_exit_ns(struct ipc_namespace *ns) {
> - /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */
> - mntput(ns->mq_mnt);
> +static inline struct ipc_namespace *get_ns_from_inode(struct inode *inode)
> +{
> + struct ipc_namespace *ns;
> +
> + spin_lock(&mq_lock);
> + ns = __get_ns_from_inode(inode);
> + spin_unlock(&mq_lock);
> + return ns;
> }

No need for the explicit inlining here.

>
> ...
>
> +static int compare_sb_single_ns(struct super_block *sb, void *data)
> +{
> + return sb->s_fs_info == data;
> +}
> +
> +static int set_sb_single_ns(struct super_block *sb, void *data)
> +{
> + sb->s_fs_info = data;
> + return set_anon_super(sb, NULL);
> +}
> +
> +static int get_sb_single_ns(struct file_system_type *fs_type,
> + int flags, void *data,
> + int (*fill_super)(struct super_block *, void *, int),
> + struct vfsmount *mnt)
> +{
> + struct super_block *s;
> + int error;
> +
> + s = sget(fs_type, compare_sb_single_ns, set_sb_single_ns, data);
> + if (IS_ERR(s))
> + return PTR_ERR(s);
> + if (!s->s_root) {
> + s->s_flags = flags;
> + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
> + if (error) {
> + up_write(&s->s_umount);
> + deactivate_super(s);
> + return error;
> + }
> + s->s_flags |= MS_ACTIVE;
> + }
> + do_remount_sb(s, flags, data, 0);
> + return simple_set_mnt(mnt, s);
> }

The above doesn't seem specific to mqueue. Is it in the best place?

> static int mqueue_get_sb(struct file_system_type *fs_type,
> int flags, const char *dev_name,
> void *data, struct vfsmount *mnt)
> {
> - return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
> + struct ipc_namespace *ns = data;
> +
> + if (!(flags & MS_KERNMOUNT))
> + ns = current->nsproxy->ipc_ns;
> + return get_sb_single_ns(fs_type, flags, ns, mqueue_fill_super, mnt);
> }
>
> static void init_once(void *foo)
> @@ -245,12 +295,13 @@ static void mqueue_delete_inode(struct inode *inode)
> struct user_struct *user;
> unsigned long mq_bytes;
> int i;
> - struct ipc_namespace *ipc_ns = &init_ipc_ns;
> + struct ipc_namespace *ipc_ns;
>
> if (S_ISDIR(inode->i_mode)) {
> clear_inode(inode);
> return;
> }
> + ipc_ns = get_ns_from_inode(inode);
> info = MQUEUE_I(inode);
> spin_lock(&info->lock);
> for (i = 0; i < info->attr.mq_curmsgs; i++)
> @@ -266,10 +317,12 @@ static void mqueue_delete_inode(struct inode *inode)
> if (user) {
> spin_lock(&mq_lock);
> user->mq_bytes -= mq_bytes;
> - ipc_ns->mq_queues_count--;
> + if (ipc_ns)

The reader might be wondering why ipc_ns==NULL is an acceptable state,
and what that state actually means. This reader is wondering that.

> + ipc_ns->mq_queues_count--;
> spin_unlock(&mq_lock);
> free_uid(user);
> }
> + put_ipc_ns(ipc_ns);
> }
>
>
> ...
>
> +int mq_init_ns(struct ipc_namespace *ns)
> +{
> + ns->mq_queues_count = 0;
> + ns->mq_queues_max = DFLT_QUEUESMAX;
> + ns->mq_msg_max = DFLT_MSGMAX;
> + ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
> +
> + ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
> + if (IS_ERR(ns->mq_mnt))
> + return PTR_ERR(ns->mq_mnt);

It seems dangerous to leave an IS_ERR() pointer sitting in ns->mq_mnt
for someone else to trip over.

> + return 0;
> +}
> +
> +void mq_clear_sbinfo(struct ipc_namespace *ns)
> +{
> + ns->mq_mnt->mnt_sb->s_fs_info = NULL;
> +}
> +
> +void mq_put_mnt(struct ipc_namespace *ns)
> +{
> + mntput(ns->mq_mnt);
> +}
> +
> static int msg_max_limit_min = MIN_MSGMAX;
> static int msg_max_limit_max = MAX_MSGMAX;
>
> @@ -1284,15 +1367,14 @@ static int __init init_mqueue_fs(void)
> if (error)
> goto out_sysctl;
>
> - init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type);
> + spin_lock_init(&mq_lock);
> +
> + init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
> if (IS_ERR(init_ipc_ns.mq_mnt)) {
> error = PTR_ERR(init_ipc_ns.mq_mnt);
> goto out_filesystem;
> }
>
> - /* internal initialization - not common for vfs */
> - spin_lock_init(&mq_lock);
> -
> return 0;
>
> out_filesystem:
> diff --git a/ipc/msgutil.c b/ipc/msgutil.c
> index c197cd1..21475b0 100644
> --- a/ipc/msgutil.c
> +++ b/ipc/msgutil.c
> @@ -18,18 +18,16 @@
>
> #include "util.h"
>
> +spinlock_t mq_lock;

DEFINE_SPINLOCK(), please. For lockdep reasons.

>
> ...
>
> +/*
> + * put_ipc_ns - drop a reference to an ipc namespace.
> + * @ns: the namespace to put
> + *
> + * If this is the last task in the namespace exiting, and
> + * it is dropping the refcount to 0, then it can race with
> + * a task in another ipc namespace but in a mounts namespace
> + * which has this ipcns's mqueuefs mounted, doing some action
> + * with one of the mqueuefs files. That can raise the refcount.
> + * So dropping the refcount, and raising the refcount when
> + * accessing it through the VFS, are protected with mq_lock.
> + *
> + * (Clearly, a task raising the refcount on its own ipc_ns
> + * needn't take mq_lock since it can't race with the last task
> + * in the ipcns exiting).
> + */
> +void put_ipc_ns(struct ipc_namespace *ns)
> {
> - struct ipc_namespace *ns;
> + if (ns && atomic_dec_and_lock(&ns->count, &mq_lock)) {
> + mq_clear_sbinfo(ns);
> + spin_unlock(&mq_lock);
> + mq_put_mnt(ns);
> + free_ipc_ns(ns);
> + }
> +}

Why are we supporting calls with a NULL arg here?

> - ns = container_of(kref, struct ipc_namespace, kref);
> +void free_ipc_ns(struct ipc_namespace *ns)
> +{
> /*
> * Unregistering the hotplug notifier at the beginning guarantees
> * that the ipc namespace won't be freed while we are inside the
> @@ -102,7 +132,6 @@ void free_ipc_ns(struct kref *kref)
> sem_exit_ns(ns);
> msg_exit_ns(ns);
> shm_exit_ns(ns);
> - mq_exit_ns(ns);
> kfree(ns);
> atomic_dec(&nr_ipc_ns);
>
> diff --git a/ipc/util.h b/ipc/util.h
> index 9b6cc33..236e4ed 100644
> --- a/ipc/util.h
> +++ b/ipc/util.h
> @@ -21,9 +21,11 @@ void shm_init (void);
> struct ipc_namespace;
>
> #ifdef CONFIG_POSIX_MQUEUE
> -void mq_exit_ns(struct ipc_namespace *ns);
> +extern void mq_clear_sbinfo(struct ipc_namespace *ns);
> +extern void mq_put_mnt(struct ipc_namespace *ns);
> #else
> -#define mq_exit_ns(ns) ((void) 0)
> +#define mq_clear_sbinfo(ns) ((void) 0)
> +#define mq_put_mnt(ns) ((void) 0)

argh.

> #endif
>

2009-01-27 21:21:06

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [Patch 0/3] posix mqueue namespace (v14)

Quoting Andrew Morton ([email protected]):
> On Fri, 16 Jan 2009 20:02:48 -0600
> "Serge E. Hallyn" <[email protected]> wrote:
>
> > IPC namespaces are completely disjoint id->object mappings.
> > A task can pass CLONE_NEWIPC to unshare and clone to get
> > a new, empty, IPC namespace. Until now this has supported
> > SYSV IPC.
> >
> > Most Posix IPC is done in userspace. The posix mqueue
> > support, however, is implemented on top of the mqueue fs.
> >
> > This patchset implements multiple mqueue fs instances,
> > one per IPC namespace to be precise.
> >
> > To create a new ipc namespace with posix mq support, you
> > should now:
> >
> > unshare(CLONE_NEWIPC|CLONE_NEWNS);
> > umount /dev/mqueue
> > mount -t mqueue mqueue /dev/mqueue
> >
> > It's perfectly valid to do vfs operations on files
> > in another ipc_namespace's /dev/mqueue, but any use
> > of mq_open(3) and friends will act in your own ipc_ns.
> > After the ipc namespace has exited, you can still
> > unlink but no longer create files in that fs (since
> > accounting is carried.
> >
> > Changelog:
> > v14: (Jan 16 2009) port to linux-next
> > v13: (Dec 28 2009)
> > 1. addressed comments by Dave and Suka
> > 2. ported Cedric's patch to make posix mq sysctls
> > per-namespace
> >
> > When convenient, it would be great to see this tested
> > in -mm.
>
> hm. Who is going to test it?

Everyone using posix mq with an -mm kernel :)

There are ltp testcases which I hope can be pushed once these
patches appear headed upstream.

thanks,
-serge

2009-01-27 21:56:27

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [PATCH 2/3] ipc namespaces: implement support for posix msqueues

Quoting Andrew Morton ([email protected]):
> On Fri, 16 Jan 2009 20:03:32 -0600
> "Serge E. Hallyn" <[email protected]> wrote:
>
> > Implement multiple mounts of the mqueue file system, and
> > link it to usage of CLONE_NEWIPC.
> >
> > Each ipc ns has a corresponding mqueuefs superblock. When
> > a user does clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the
> > unshare will cause an internal mount of a new mqueuefs sb
> > linked to the new ipc ns.
> >
> > When a user does 'mount -t mqueue mqueue /dev/mqueue', he
> > mounts the mqueuefs superblock.
> >
> > Posix message queues can be worked with both through the
> > mq_* system calls (see mq_overview(7)), and through the VFS
> > through the mqueue mount. Any usage of mq_open() and friends
> > will work with the acting task's ipc namespace. Any actions
> > through the VFS will work with the mqueuefs in which the
> > file was created. So if a user doesn't remount mqueuefs
> > after unshare(CLONE_NEWIPC), mq_open("/ab") will not be
> > reflected in "ls /dev/mqueue".
> >
> > If task a mounts mqueue for ipc_ns:1, then clones task b with
> > a new ipcns, ipcns:2, and then task a is the last task in
> > ipc_ns:1 to exit, then (1) ipc_ns:1 will be freed, (2) it's
> > superblock will live on until task b umounts the corresponding
> > mqueuefs, and vfs actions will continue to succeed, but (3)
> > sb->s_fs_info will be NULL for the sb corresponding to the
> > deceased ipc_ns:1.
>
> I suppose that stuff like this should find its way into a manpage one
> day. That'll be fun for someone.

Nadia wrote an update for the mq_overview.7 page. But the
semantics have changed a bit since then so I'll need to
revise that.

> > +static int get_sb_single_ns(struct file_system_type *fs_type,
> > + int flags, void *data,
> > + int (*fill_super)(struct super_block *, void *, int),
> > + struct vfsmount *mnt)
> > +{
> > + struct super_block *s;
> > + int error;
> > +
> > + s = sget(fs_type, compare_sb_single_ns, set_sb_single_ns, data);
> > + if (IS_ERR(s))
> > + return PTR_ERR(s);
> > + if (!s->s_root) {
> > + s->s_flags = flags;
> > + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
> > + if (error) {
> > + up_write(&s->s_umount);
> > + deactivate_super(s);
> > + return error;
> > + }
> > + s->s_flags |= MS_ACTIVE;
> > + }
> > + do_remount_sb(s, flags, data, 0);
> > + return simple_set_mnt(mnt, s);
> > }
>
> The above doesn't seem specific to mqueue. Is it in the best place?

No, I think there is commonality with devpts and perhaps also proc. Now
that devpts namespaces are upstream I should first move it to using a
more generic helper, then introduce posixmq namespaces as another user.

> > @@ -266,10 +317,12 @@ static void mqueue_delete_inode(struct inode *inode)
> > if (user) {
> > spin_lock(&mq_lock);
> > user->mq_bytes -= mq_bytes;
> > - ipc_ns->mq_queues_count--;
> > + if (ipc_ns)
>
> The reader might be wondering why ipc_ns==NULL is an acceptable state,
> and what that state actually means. This reader is wondering that.

That's actually a central part of how we resolved the dependency
between the mqfs mount (pinned by vfs) and mqns (pinned by nsproxy).
So the mqns pins its mqfs vfsmount, but the mnt->mnt_sb->s_fs_info
is not pinned by the mqns. Rather, s_fs_info is protected by the
mq_lock and is either non-NULL and valid, or NULL. The mqns takes
the mq_lock to set it to NULL, and any reader of s_fs_info must
dereference it and pin the mqns under mq_lock.

> > + * put_ipc_ns - drop a reference to an ipc namespace.
> > + * @ns: the namespace to put
> > + *
> > + * If this is the last task in the namespace exiting, and
> > + * it is dropping the refcount to 0, then it can race with
> > + * a task in another ipc namespace but in a mounts namespace
> > + * which has this ipcns's mqueuefs mounted, doing some action
> > + * with one of the mqueuefs files. That can raise the refcount.
> > + * So dropping the refcount, and raising the refcount when
> > + * accessing it through the VFS, are protected with mq_lock.
> > + *
> > + * (Clearly, a task raising the refcount on its own ipc_ns
> > + * needn't take mq_lock since it can't race with the last task
> > + * in the ipcns exiting).
> > + */
> > +void put_ipc_ns(struct ipc_namespace *ns)
> > {
> > - struct ipc_namespace *ns;
> > + if (ns && atomic_dec_and_lock(&ns->count, &mq_lock)) {
> > + mq_clear_sbinfo(ns);
> > + spin_unlock(&mq_lock);
> > + mq_put_mnt(ns);
> > + free_ipc_ns(ns);
> > + }
> > +}
>
> Why are we supporting calls with a NULL arg here?

Hmm, I'm not sure that's necessary. Will look into it.

Every other comment I'll fix in the next version. Thanks
very much.

-serge