From: Ran Xiaokai <[email protected]>
When a non-root user process creates a user namespace and ipc namespace
with command "unshare -Ur -i", and map the root user inside
the user namesapce to the global owner of user namespace.
The newly created user namespace OWNS the ipc namespace,
So the root user inside the user namespace should have full access
rights to the ipc namespace resources.
$ id
uid=1200(u1200) gid=1200(u1200) groups=1200(u1200)
$ unshare -Ur -i
$ id
uid=0(root) gid=0(root) groups=0(root)
$ ls -l /proc/sys/fs/mqueue/
total 0
-rw-r--r-- 1 65534 65534 0 Jul 28 19:03 msg_default
-rw-r--r-- 1 65534 65534 0 Jul 28 19:03 msg_max
-rw-r--r-- 1 65534 65534 0 Jul 28 19:03 msgsize_default
-rw-r--r-- 1 65534 65534 0 Jul 28 19:03 msgsize_max
-rw-r--r-- 1 65534 65534 0 Jul 28 19:03 queues_max
-sh: /proc/sys/fs/mqueue/msg_max: Permission denied
As opposite, inside a net namespace,
1. sysctl files owners are set to the local root user
insede the user namespace, not the GLOBAL_ROOT_UID;
2. sysctl files are writable when accessed by root user
inside the user namespace.
$ id
uid=1200(u1200) gid=1200(u1200) groups=1200(u1200)
$ unshare -Ur -n
$ id
uid=0(root) gid=0(root) groups=0(root)
$ ls -l /proc/sys/net/ipv4/ip_forward
-rw-r--r-- 1 root root 0 Jul 28 19:04 /proc/sys/net/ipv4/ip_forward
$ echo 1 > /proc/sys/net/ipv4/ip_forward
$ cat /proc/sys/net/ipv4/ip_forward
1
This patch adds a ctl_table_set per ipc namespace, and also the
set_ownership() and permissions() callbacks for the new ctl_table_root
for ipc mqueue syscgtls.
Signed-off-by: Ran Xiaokai <[email protected]>
---
include/linux/ipc_namespace.h | 14 +++++
ipc/mq_sysctl.c | 116 ++++++++++++++++++++++++++++++++++++------
ipc/mqueue.c | 10 +++-
ipc/namespace.c | 2 +
4 files changed, 124 insertions(+), 18 deletions(-)
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 05e2277..3e8e340 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -10,6 +10,7 @@
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
+#include <linux/sysctl.h>
struct user_namespace;
@@ -67,6 +68,11 @@ struct ipc_namespace {
struct user_namespace *user_ns;
struct ucounts *ucounts;
+#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
+ struct ctl_table_set mq_set;
+ struct ctl_table_header *sysctls;
+#endif
+
struct llist_node mnt_llist;
struct ns_common ns;
@@ -155,7 +161,10 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
struct ctl_table_header;
+extern struct ctl_table_header *mq_sysctl_table;
extern struct ctl_table_header *mq_register_sysctl_table(void);
+bool setup_mq_sysctls(struct ipc_namespace *ns);
+void retire_mq_sysctls(struct ipc_namespace *ns);
#else /* CONFIG_POSIX_MQUEUE_SYSCTL */
@@ -163,6 +172,11 @@ static inline struct ctl_table_header *mq_register_sysctl_table(void)
{
return NULL;
}
+static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
+{
+ return true;
+}
+static inline void retire_mq_sysctls(struct ipc_namespace *ns) { }
#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
#endif
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 72a92a0..dbdd428 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -8,6 +8,11 @@
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/capability.h>
+#include <linux/cred.h>
+#include <linux/stat.h>
#ifdef CONFIG_PROC_SYSCTL
static void *get_mq(struct ctl_table *table)
@@ -96,25 +101,104 @@ static struct ctl_table mq_sysctls[] = {
{}
};
-static struct ctl_table mq_sysctl_dir[] = {
- {
- .procname = "mqueue",
- .mode = 0555,
- .child = mq_sysctls,
- },
- {}
-};
+static int set_is_seen(struct ctl_table_set *set)
+{
+ return ¤t->nsproxy->ipc_ns->mq_set == set;
+}
-static struct ctl_table mq_sysctl_root[] = {
- {
- .procname = "fs",
- .mode = 0555,
- .child = mq_sysctl_dir,
- },
- {}
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+ return ¤t->nsproxy->ipc_ns->mq_set;
+}
+
+static int set_permissions(struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ struct ipc_namespace *ipc_ns =
+ container_of(head->set, struct ipc_namespace, mq_set);
+ struct user_namespace *user_ns = ipc_ns->user_ns;
+ int mode;
+
+ /* Allow users with CAP_SYS_RESOURCE unrestrained access */
+ if (ns_capable(user_ns, CAP_SYS_RESOURCE))
+ mode = (table->mode & S_IRWXU) >> 6;
+ else
+ /* Allow all others at most read-only access */
+ mode = table->mode & S_IROTH;
+ return (mode << 6) | (mode << 3) | mode;
+}
+
+static void set_ownership(struct ctl_table_header *head,
+ struct ctl_table *table,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct ipc_namespace *ipc_ns =
+ container_of(head->set, struct ipc_namespace, mq_set);
+ struct user_namespace *user_ns = ipc_ns->user_ns;
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ns_root_uid = make_kuid(user_ns, 0);
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ ns_root_gid = make_kgid(user_ns, 0);
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+}
+
+static struct ctl_table_root mq_sysctl_root = {
+ .lookup = set_lookup,
+ .permissions = set_permissions,
+ .set_ownership = set_ownership,
};
+bool setup_mq_sysctls(struct ipc_namespace *ns)
+{
+ struct ctl_table *tbl;
+
+ if (!mq_sysctl_table)
+ return false;
+
+ setup_sysctl_set(&ns->mq_set, &mq_sysctl_root, set_is_seen);
+ tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
+ if (!tbl)
+ goto out;
+
+ ns->sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
+ if (!ns->sysctls)
+ goto out1;
+
+ return true;
+
+out1:
+ kfree(tbl);
+ retire_sysctl_set(&ns->mq_set);
+out:
+ return false;
+}
+
+void retire_mq_sysctls(struct ipc_namespace *ns)
+{
+ struct ctl_table *tbl;
+
+ if (!ns->sysctls)
+ return;
+
+ tbl = ns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(ns->sysctls);
+ retire_sysctl_set(&ns->mq_set);
+ kfree(tbl);
+}
+
struct ctl_table_header *mq_register_sysctl_table(void)
{
- return register_sysctl_table(mq_sysctl_root);
+ static struct ctl_table empty[1];
+
+ /*
+ * Register the fs/mqueue directory in the default set so that
+ * registrations in the child sets work properly.
+ */
+ return register_sysctl("fs/mqueue", empty);
}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 4e4e611..3b68564 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -163,7 +163,7 @@ static void remove_notification(struct mqueue_inode_info *info);
static struct kmem_cache *mqueue_inode_cachep;
-static struct ctl_table_header *mq_sysctl_table;
+struct ctl_table_header *mq_sysctl_table;
static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
{
@@ -1713,6 +1713,10 @@ static int __init init_mqueue_fs(void)
/* ignore failures - they are not fatal */
mq_sysctl_table = mq_register_sysctl_table();
+ if (mq_sysctl_table && !setup_mq_sysctls(&init_ipc_ns)) {
+ unregister_sysctl_table(mq_sysctl_table);
+ mq_sysctl_table = NULL;
+ }
error = register_filesystem(&mqueue_fs_type);
if (error)
@@ -1729,8 +1733,10 @@ static int __init init_mqueue_fs(void)
out_filesystem:
unregister_filesystem(&mqueue_fs_type);
out_sysctl:
- if (mq_sysctl_table)
+ if (mq_sysctl_table) {
+ retire_mq_sysctls(&init_ipc_ns);
unregister_sysctl_table(mq_sysctl_table);
+ }
kmem_cache_destroy(mqueue_inode_cachep);
return error;
}
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 7bd0766..c891cc1 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -58,6 +58,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
err = mq_init_ns(ns);
if (err)
goto fail_put;
+ setup_mq_sysctls(ns);
sem_init_ns(ns);
msg_init_ns(ns);
@@ -121,6 +122,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
* uses synchronize_rcu().
*/
mq_put_mnt(ns);
+ retire_mq_sysctls(ns);
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
--
2.15.2
On Wed, Jul 28, 2021 at 08:06:51PM -0700, [email protected] wrote:
> From: Ran Xiaokai <[email protected]>
>
> When a non-root user process creates a user namespace and ipc namespace
> with command "unshare -Ur -i", and map the root user inside
> the user namesapce to the global owner of user namespace.
> The newly created user namespace OWNS the ipc namespace,
> So the root user inside the user namespace should have full access
> rights to the ipc namespace resources.
>
> $ id
> uid=1200(u1200) gid=1200(u1200) groups=1200(u1200)
> $ unshare -Ur -i
> $ id
> uid=0(root) gid=0(root) groups=0(root)
> $ ls -l /proc/sys/fs/mqueue/
> total 0
> -rw-r--r-- 1 65534 65534 0 Jul 28 19:03 msg_default
> -rw-r--r-- 1 65534 65534 0 Jul 28 19:03 msg_max
> -rw-r--r-- 1 65534 65534 0 Jul 28 19:03 msgsize_default
> -rw-r--r-- 1 65534 65534 0 Jul 28 19:03 msgsize_max
> -rw-r--r-- 1 65534 65534 0 Jul 28 19:03 queues_max
> -sh: /proc/sys/fs/mqueue/msg_max: Permission denied
>
> As opposite, inside a net namespace,
> 1. sysctl files owners are set to the local root user
> insede the user namespace, not the GLOBAL_ROOT_UID;
> 2. sysctl files are writable when accessed by root user
> inside the user namespace.
>
> $ id
> uid=1200(u1200) gid=1200(u1200) groups=1200(u1200)
> $ unshare -Ur -n
> $ id
> uid=0(root) gid=0(root) groups=0(root)
> $ ls -l /proc/sys/net/ipv4/ip_forward
> -rw-r--r-- 1 root root 0 Jul 28 19:04 /proc/sys/net/ipv4/ip_forward
> $ echo 1 > /proc/sys/net/ipv4/ip_forward
> $ cat /proc/sys/net/ipv4/ip_forward
> 1
Yeah, we did that work specifically for the network namespace but knew
there were quite a few places that would need fix up. This makes sense
to me.
Please add tests for this patch though. Also make sure to run them in a
tight loop on a kernel with memory and log debugging enabled. The whole
sysctl retire stuff can't be called from rcu contexts and that's easy to
miss. So turn on at least sm like:
CONFIG_HAVE_ARCH_KASAN=y
CONFIG_HAVE_ARCH_KASAN_VMALLOC=y
CONFIG_CC_HAS_KASAN_GENERIC=y
CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y
CONFIG_KASAN=y
CONFIG_KASAN_GENERIC=y
# CONFIG_KASAN_OUTLINE is not set
CONFIG_KASAN_INLINE=y
CONFIG_KASAN_STACK=1
CONFIG_KASAN_VMALLOC=y
# CONFIG_KASAN_MODULE_TEST is not set
CONFIG_HAVE_ARCH_KFENCE=y
CONFIG_KFENCE=y
CONFIG_KFENCE_STATIC_KEYS=y
CONFIG_KFENCE_SAMPLE_INTERVAL=100
CONFIG_KFENCE_NUM_OBJECTS=255
CONFIG_KFENCE_STRESS_TEST_FAULTS=0
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_LOCKDEP=y
On 2021-07-28 20:06, [email protected] wrote:
> This patch adds a ctl_table_set per ipc namespace, and also the
> set_ownership() and permissions() callbacks for the new ctl_table_root
> for ipc mqueue syscgtls.
^^ sysctls
This makes sense to me, just some nits below.
Acked-by: Davidlohr Bueso <[email protected]>
>
> Signed-off-by: Ran Xiaokai <[email protected]>
> ---
...
> +static int set_permissions(struct ctl_table_header *head,
> + struct ctl_table *table)
> +{
> + struct ipc_namespace *ipc_ns =
> + container_of(head->set, struct ipc_namespace, mq_set);
> + struct user_namespace *user_ns = ipc_ns->user_ns;
> + int mode;
> +
> + /* Allow users with CAP_SYS_RESOURCE unrestrained access */
> + if (ns_capable(user_ns, CAP_SYS_RESOURCE))
> + mode = (table->mode & S_IRWXU) >> 6;
> + else
> + /* Allow all others at most read-only access */
> + mode = table->mode & S_IROTH;
Please use curly braces for the else.
> + return (mode << 6) | (mode << 3) | mode;
> +}
> +
> +static void set_ownership(struct ctl_table_header *head,
> + struct ctl_table *table,
> + kuid_t *uid, kgid_t *gid)
> +{
> + struct ipc_namespace *ipc_ns =
> + container_of(head->set, struct ipc_namespace, mq_set);
> + struct user_namespace *user_ns = ipc_ns->user_ns;
> + kuid_t ns_root_uid;
> + kgid_t ns_root_gid;
> +
> + ns_root_uid = make_kuid(user_ns, 0);
> + if (uid_valid(ns_root_uid))
> + *uid = ns_root_uid;
> +
> + ns_root_gid = make_kgid(user_ns, 0);
> + if (gid_valid(ns_root_gid))
> + *gid = ns_root_gid;
> +}
Could set_permissions() and set_ownership() be factored such that we can
avoid duplicated code between ipc and net ns? Something like:
void set_permissions(struct ctl_table_header *head, struct ctl_table
*table)
{
struct ipc_namespace *ipc_ns = container_of(head->set, struct
ipc_namespace, mq_set);
set_permissions_common(ipc_ns->user_ns);
}
Thanks,
Davidlohr
O Thu, Jul 29, 2021 at 04:53:48PM +0200, Christian Brauner wrote:
>
> Yeah, we did that work specifically for the network namespace but knew
> there were quite a few places that would need fix up. This makes sense
> to me.
>
> Please add tests for this patch though. Also make sure to run them in a
> tight loop on a kernel with memory and log debugging enabled.
For now i have rebuilt the kernel turning on the config items you
suggested and some other kernel hacking and locking debug configs.
I tested this by a shell script concurrently writing the mqueue sysctl
files and checking the value. Do you mean that i should add some test code in this patch?
Can you give some examples for this tests code?
> The whole sysctl retire stuff can't be called from rcu contexts and that's easy to
> miss.
for this patch, retire_mq_sysctls() is called by free_ipc_ns(), and free_ipc_ns() is
triggered by schedule_work(&free_ipc_work) in kthread context.
Can you give some comments on the chance this code running on rcu
context?
> So turn on at least sm like:
>
> CONFIG_HAVE_ARCH_KASAN=y
> CONFIG_HAVE_ARCH_KASAN_VMALLOC=y
> CONFIG_CC_HAS_KASAN_GENERIC=y
> CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y
> CONFIG_KASAN=y
> CONFIG_KASAN_GENERIC=y
> # CONFIG_KASAN_OUTLINE is not set
> CONFIG_KASAN_INLINE=y
> CONFIG_KASAN_STACK=1
> CONFIG_KASAN_VMALLOC=y
> # CONFIG_KASAN_MODULE_TEST is not set
> CONFIG_HAVE_ARCH_KFENCE=y
> CONFIG_KFENCE=y
> CONFIG_KFENCE_STATIC_KEYS=y
> CONFIG_KFENCE_SAMPLE_INTERVAL=100
> CONFIG_KFENCE_NUM_OBJECTS=255
> CONFIG_KFENCE_STRESS_TEST_FAULTS=0
> CONFIG_LOCKDEP_SUPPORT=y
> CONFIG_LOCKDEP=y
O Fri, Jul 30, 2021 at 08:09:32AM -0700, Davidlohr Bueso wrote:
> On 2021-07-28 20:06, [email protected] wrote:
> > This patch adds a ctl_table_set per ipc namespace, and also the
> > set_ownership() and permissions() callbacks for the new ctl_table_root
> > for ipc mqueue syscgtls.
> ^^ sysctls
>
> This makes sense to me, just some nits below.
>
> Acked-by: Davidlohr Bueso <[email protected]>
>
> >
> > Signed-off-by: Ran Xiaokai <[email protected]>
> > ---
> ...
> > +static int set_permissions(struct ctl_table_header *head,
> > + struct ctl_table *table)
> > +{
>
> Please use curly braces for the else.
>
> > + return (mode << 6) | (mode << 3) | mode;
> > +}
> > +
> > +static void set_ownership(struct ctl_table_header *head,
> > + struct ctl_table *table,
> > + kuid_t *uid, kgid_t *gid)
> > +{
> > + struct ipc_namespace *ipc_ns =
> > + container_of(head->set, struct ipc_namespace, mq_set);
> > + struct user_namespace *user_ns = ipc_ns->user_ns;
> > + kuid_t ns_root_uid;
> > + kgid_t ns_root_gid;
> > +
> > + ns_root_uid = make_kuid(user_ns, 0);
> > + if (uid_valid(ns_root_uid))
> > + *uid = ns_root_uid;
> > +
> > + ns_root_gid = make_kgid(user_ns, 0);
> > + if (gid_valid(ns_root_gid))
> > + *gid = ns_root_gid;
> > +}
>
> Could set_permissions() and set_ownership() be factored such that we can
> avoid duplicated code between ipc and net ns? Something like:
>
> void set_permissions(struct ctl_table_header *head, struct ctl_table *table)
> {
> struct ipc_namespace *ipc_ns = container_of(head->set, struct
> ipc_namespace, mq_set);
> set_permissions_common(ipc_ns->user_ns);
> }
>
> Thanks,
> Davidlohr
Hi Davidlohr
Thanks for your comments.
I will sent a v2 patch together with Christian's comments.
On Tue, Aug 03, 2021 at 03:31:50AM -0700, CGEL wrote:
> O Thu, Jul 29, 2021 at 04:53:48PM +0200, Christian Brauner wrote:
> >
> > Yeah, we did that work specifically for the network namespace but knew
> > there were quite a few places that would need fix up. This makes sense
> > to me.
> >
> > Please add tests for this patch though. Also make sure to run them in a
> > tight loop on a kernel with memory and log debugging enabled.
> For now i have rebuilt the kernel turning on the config items you
> suggested and some other kernel hacking and locking debug configs.
> I tested this by a shell script concurrently writing the mqueue sysctl
> files and checking the value. Do you mean that i should add some test code in this patch?
> Can you give some examples for this tests code?
Yep, I'd prefer to see tests with this. This should be fairly easy to
test. There are a bunch of ways. A really trivial skeleton for a test in
tools/testing/selftstes/mqueue/ could be:
- Create a new mount and ipc namespace and mount mqueue in there.
Read and remember the /proc/sys/fs/mqueue/queues_max value.
- Now create a new user + mount namespace pair in a child process.
- Mount mqueue filesystem in there.
- Set /proc/sys/fs/mqueue/queues_max to 1.
- Call mq_open with O_CREAT in the child process the first time and
expect success keeping the fd open.
- Call mq_open with O_CREAT in the child process a second time and
expect failure because of:
if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
!capable(CAP_SYS_RESOURCE)) {
error = -ENOSPC;
goto out_unlock;
}
ipc_ns->mq_queues_count++;
spin_unlock(&mq_lock);
- Reap the child in the parent expecting success.
- Verify that the /proc/sys/fs/mqueue/queues_max value in the parent is
identical to the value you read before creating the child.
This should be a very rough test that at least the fundamentals of this
change are sane.
This doesn't have to be complex.
Here's the code from a test I've written for mount_setattr() to create
an unpriv mount + userns which might be the annoying part:
#include "../../kselftest_harness.h"
#include "../pidfd.h" /* for wait_for_pid() */
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
ssize_t ret;
do {
ret = write(fd, buf, count);
} while (ret < 0 && errno == EINTR);
return ret;
}
static int write_file(const char *path, const void *buf, size_t count)
{
int fd;
ssize_t ret;
fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW);
if (fd < 0)
return -1;
ret = write_nointr(fd, buf, count);
close(fd);
if (ret < 0 || (size_t)ret != count)
return -1;
return 0;
}
static int create_and_enter_userns(void)
{
uid_t uid;
gid_t gid;
char map[100];
uid = getuid();
gid = getgid();
if (unshare(CLONE_NEWUSER))
return -1;
if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) &&
errno != ENOENT)
return -1;
snprintf(map, sizeof(map), "0 %d 1", uid);
if (write_file("/proc/self/uid_map", map, strlen(map)))
return -1;
snprintf(map, sizeof(map), "0 %d 1", gid);
if (write_file("/proc/self/gid_map", map, strlen(map)))
return -1;
if (setgid(0))
return -1;
if (setuid(0))
return -1;
return 0;
}
static int prepare_unpriv_mountns(void)
{
if (create_and_enter_userns())
return -1;
if (unshare(CLONE_NEWNS))
return -1;
if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
return -1;
if (unshare(CLONE_NEWIPC))
return -1;
return 0;
}
TEST(mqueue_sysctls)
{
int ret;
pid_t pid;
if (unshare(CLONE_NEWNS))
return -1;
if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
return -1;
if (unshare(CLONE_NEWIPC))
return -1;
/* Mount new mqueue instance, read and stash sysctl value. */
pid = fork();
ASSERT_GE(pid, 0) {
TH_LOG("%s - Failed to fork", strerror(errno));
}
if (pid == 0) {
ASSERT_EQ(prepare_unpriv_mountns(), 0);
/* mount new mqueue instance, setup sysctls and perform mq_open() tests. */
exit(EXIT_SUCCESS);
}
ASSERT_EQ(wait_for_pid(pid), 0);
/* Read sysctl value again making sure it's still the same as before. */
}
TEST_HARNESS_MAIN
>
> > The whole sysctl retire stuff can't be called from rcu contexts and that's easy to
> > miss.
> for this patch, retire_mq_sysctls() is called by free_ipc_ns(), and free_ipc_ns() is
> triggered by schedule_work(&free_ipc_work) in kthread context.
> Can you give some comments on the chance this code running on rcu
> context?
I really just prefer to see sysctl namespacing changes compiled with all
manner of debugging options enabled as they can quite easily uncover
bugs where the sysctl cleanup helpers are called from invalid contexts.
That has happened to me when I worked on some sysctl changes a while
back and thought I had followed all codepaths diligently to make sure
that nothing calls the sysctl cleanup code from invalid contexts. I only
caught the codepath that did because I had all of the options turned on.
On Tue, Aug 03, 2021 at 04:01:33PM +0200, Christian Brauner wrote:
> - Create a new mount and ipc namespace and mount mqueue in there.
> Read and remember the /proc/sys/fs/mqueue/queues_max value.
> - Now create a new user + mount namespace pair in a child process.
> - Mount mqueue filesystem in there.
> - Set /proc/sys/fs/mqueue/queues_max to 1.
> - Call mq_open with O_CREAT in the child process the first time and
> expect success keeping the fd open.
> - Call mq_open with O_CREAT in the child process a second time and
> expect failure because of:
>
> if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
> !capable(CAP_SYS_RESOURCE)) {
> error = -ENOSPC;
> goto out_unlock;
> }
> ipc_ns->mq_queues_count++;
> spin_unlock(&mq_lock);
>
> - Reap the child in the parent expecting success.
> - Verify that the /proc/sys/fs/mqueue/queues_max value in the parent is
> identical to the value you read before creating the child.
Hi, Christian
Thanks for your patient explanation of the kselftest code.
Please give comments on this test code.
int get_mq_queues_max(void)
{
int fd;
char buf[16];
int val = -1;
fd = open("/proc/sys/fs/mqueue/queues_max", O_RDONLY);
if (fd >= 0) {
if (read(fd, buf, sizeof(buf)) > 0)
val = atoi(buf);
close(fd);
return val;
}
return val;
}
TEST(mqueue_sysctl)
{
pid_t pid;
int qmax1, qmax2;
/*
> - Create a new mount and ipc namespace and mount mqueue in there.
This test code is intended to run as non-root user,
so unshare(CLONE_NEWNS) is not allowed, so i skip this step.
*/
chdir(getenv("HOME"));
/* read and stash the original sysctl value */
qmax1 = get_mq_queues_max();
ASSERT_GE(qmax1, 0);
pid = fork();
ASSERT_GE(pid, 0);
if (pid == 0) {
ASSERT_EQ(prepare_unpriv_mountns(), 0);
/*
A new mqueue filesystem instance will be mounted by kernel internally
when a ipc namespace created. I don't quite get the point here why we should
mount mqueue manually?
*/
if (mkdir("./mqueue", 755) && errno != EEXIST)
SKIP(return, "mkdir /dev/mqueue failed");
ASSERT_EQ(mount("none", "./mqueue", "mqueue", MS_NOATIME, NULL), 0);
/* modify the sysctl value in new ipc namesapce */
ASSERT_EQ(write_file("/proc/sys/fs/mqueue/queues_max", "1", 1), 0);
ASSERT_GE(mq_open("/new_ns1", O_RDWR | O_CREAT, 0644, NULL), 0);
/* mq_open() should fail as exceeding of queues_max */
ASSERT_EQ(mq_open("/new_ns2", O_RDWR | O_CREAT, 0644, NULL), -1);
ASSERT_EQ(mq_unlink("/new_ns1"), 0);
ASSERT_EQ(umount("./mqueue"), 0);
exit(0);
}
ASSERT_EQ(wait_for_pid(pid), 0);
qmax2 = get_mq_queues_max();
ASSERT_EQ(qmax1, qmax2);
}
TEST_HARNESS_MAIN
for this test code ,i add a new file mq_sysctl_test.c, a makefile and a config file
with content
CONFIG_USER_NS=y
CONFIG_POSIX_MQUEUE_SYSCTL=y
but i am not sure which directory to place thess files,
for the original tools/testing/selftests/mqueue/
i think this directory don't need a config file, but for this test code,
this config file is needed,
do you have any suggestion on which directory this test code should place?
when a ipc namespace is created in a user namespace, the mqueue sysctl
files should be writalbe to the owner of the user namespace. Even the
owner is not a global privileged user.
Signed-off-by: Ran Xiaokai <[email protected]>
---
tools/testing/selftests/Makefile | 1 +
tools/testing/selftests/mqueue_sysctl/Makefile | 7 +
tools/testing/selftests/mqueue_sysctl/config | 2 +
.../selftests/mqueue_sysctl/mq_sysctl_test.c | 172 +++++++++++++++++++++
4 files changed, 182 insertions(+)
create mode 100644 tools/testing/selftests/mqueue_sysctl/Makefile
create mode 100644 tools/testing/selftests/mqueue_sysctl/config
create mode 100644 tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index bc3299a..2031591 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -36,6 +36,7 @@ TARGETS += mincore
TARGETS += mount
TARGETS += mount_setattr
TARGETS += mqueue
+TARGETS += mqueue_sysctl
TARGETS += nci
TARGETS += net
TARGETS += net/forwarding
diff --git a/tools/testing/selftests/mqueue_sysctl/Makefile b/tools/testing/selftests/mqueue_sysctl/Makefile
new file mode 100644
index 0000000..44b6633
--- /dev/null
+++ b/tools/testing/selftests/mqueue_sysctl/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -O2
+LDLIBS = -lrt
+
+TEST_GEN_PROGS := mq_sysctl_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/mqueue_sysctl/config b/tools/testing/selftests/mqueue_sysctl/config
new file mode 100644
index 0000000..68b2794
--- /dev/null
+++ b/tools/testing/selftests/mqueue_sysctl/config
@@ -0,0 +1,2 @@
+CONFIG_USER_NS=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
\ No newline at end of file
diff --git a/tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c b/tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c
new file mode 100644
index 0000000..48023d5
--- /dev/null
+++ b/tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sched.h>
+#include <sys/mount.h>
+#include <mqueue.h>
+#include <sys/wait.h>
+#include <string.h>
+
+#include "../kselftest_harness.h"
+
+static ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+ ssize_t ret;
+
+ do {
+ ret = write(fd, buf, count);
+ } while (ret < 0 && errno == EINTR);
+
+ return ret;
+}
+
+static int write_file(const char *path, const void *buf, size_t count)
+{
+ int fd;
+ ssize_t ret;
+
+ fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW);
+ if (fd < 0)
+ return -1;
+
+ ret = write_nointr(fd, buf, count);
+ close(fd);
+ if (ret < 0 || (size_t)ret != count)
+ return -1;
+
+ return 0;
+}
+
+static int create_and_enter_userns(void)
+{
+ uid_t uid;
+ gid_t gid;
+ char map[100];
+
+ uid = getuid();
+ gid = getgid();
+
+ if (unshare(CLONE_NEWUSER))
+ return -1;
+
+ if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) &&
+ errno != ENOENT)
+ return -1;
+
+ snprintf(map, sizeof(map), "0 %d 1", uid);
+ if (write_file("/proc/self/uid_map", map, strlen(map)))
+ return -1;
+
+ snprintf(map, sizeof(map), "0 %d 1", gid);
+ if (write_file("/proc/self/gid_map", map, strlen(map)))
+ return -1;
+
+ if (setgid(0))
+ return -1;
+
+ if (setuid(0))
+ return -1;
+
+ return 0;
+}
+
+static int prepare_unpriv_mountns(void)
+{
+ if (create_and_enter_userns())
+ return -1;
+
+ if (unshare(CLONE_NEWNS))
+ return -1;
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+ return -1;
+
+ if (unshare(CLONE_NEWIPC))
+ return -1;
+
+ return 0;
+}
+
+static int wait_for_pid(pid_t pid)
+{
+ int status, ret;
+
+again:
+ ret = waitpid(pid, &status, 0);
+ if (ret == -1) {
+ if (errno == EINTR)
+ goto again;
+
+ return -1;
+ }
+
+ if (!WIFEXITED(status))
+ return -1;
+
+ return WEXITSTATUS(status);
+}
+
+int get_mq_queues_max(void)
+{
+ int fd;
+ char buf[16];
+ int val = -1;
+
+ fd = open("/proc/sys/fs/mqueue/queues_max", O_RDONLY);
+ if (fd >= 0) {
+ if (read(fd, buf, sizeof(buf)) > 0)
+ val = atoi(buf);
+
+ close(fd);
+ return val;
+ }
+ return val;
+}
+
+TEST(mqueue_sysctl)
+{
+ pid_t pid;
+ int qmax1, qmax2;
+
+ chdir(getenv("HOME"));
+ /* read and stash the original sysctl value */
+ qmax1 = get_mq_queues_max();
+ ASSERT_GE(qmax1, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ ASSERT_EQ(prepare_unpriv_mountns(), 0);
+
+ if (mkdir("./mqueue", 755) && errno != EEXIST)
+ SKIP(return, "mkdir /dev/mqueue failed");
+
+ ASSERT_EQ(mount("none", "./mqueue", "mqueue", MS_NOATIME, NULL), 0);
+
+ /* modify the sysctl value in new ipc namesapce */
+ ASSERT_EQ(write_file("/proc/sys/fs/mqueue/queues_max", "1", 1), 0);
+
+ ASSERT_GE(mq_open("/new_ns1", O_RDWR | O_CREAT, 0644, NULL), 0);
+
+ /* mq_open() should fail as exceeding of queues_max */
+ ASSERT_EQ(mq_open("/new_ns2", O_RDWR | O_CREAT, 0644, NULL), -1);
+
+ ASSERT_EQ(mq_unlink("/new_ns1"), 0);
+ ASSERT_EQ(umount("./mqueue"), 0);
+
+ exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+
+ qmax2 = get_mq_queues_max();
+ ASSERT_EQ(qmax1, qmax2);
+}
+
+TEST_HARNESS_MAIN
--
2.15.2
On 2021-08-22 20:29, Ran Xiaokai wrote:
> create mode 100644 tools/testing/selftests/mqueue_sysctl/Makefile
> create mode 100644 tools/testing/selftests/mqueue_sysctl/config
> create mode 100644
> tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c
It would be better to use the already existing mqueue directory, instead
of
creating a new one just for sysctl stuff. Also, while nit, perhaps
mq_sysctl_tests.c (plural) to go with the other naming of the tests...
Thanks,
Davidlohr
On Sun, Aug 22, 2021 at 08:29:09PM -0700, Ran Xiaokai wrote:
> when a ipc namespace is created in a user namespace, the mqueue sysctl
> files should be writalbe to the owner of the user namespace. Even the
> owner is not a global privileged user.
>
> Signed-off-by: Ran Xiaokai <[email protected]>
> ---
Ran,
Sorry for not replying to your other mail earlier. I read it but I
simply did not have time to respond in any meaningful way to it.
Assuming the test-run works and David is happy with the test too I can
pick it up together with the main patch.
> tools/testing/selftests/Makefile | 1 +
> tools/testing/selftests/mqueue_sysctl/Makefile | 7 +
> tools/testing/selftests/mqueue_sysctl/config | 2 +
> .../selftests/mqueue_sysctl/mq_sysctl_test.c | 172 +++++++++++++++++++++
I think you need to add the binary to .gitignore btw.
> 4 files changed, 182 insertions(+)
> create mode 100644 tools/testing/selftests/mqueue_sysctl/Makefile
> create mode 100644 tools/testing/selftests/mqueue_sysctl/config
> create mode 100644 tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c
>
> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
> index bc3299a..2031591 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -36,6 +36,7 @@ TARGETS += mincore
> TARGETS += mount
> TARGETS += mount_setattr
> TARGETS += mqueue
> +TARGETS += mqueue_sysctl
> TARGETS += nci
> TARGETS += net
> TARGETS += net/forwarding
> diff --git a/tools/testing/selftests/mqueue_sysctl/Makefile b/tools/testing/selftests/mqueue_sysctl/Makefile
> new file mode 100644
> index 0000000..44b6633
> --- /dev/null
> +++ b/tools/testing/selftests/mqueue_sysctl/Makefile
> @@ -0,0 +1,7 @@
> +# SPDX-License-Identifier: GPL-2.0
> +CFLAGS += -O2
> +LDLIBS = -lrt
> +
> +TEST_GEN_PROGS := mq_sysctl_test
> +
> +include ../lib.mk
> diff --git a/tools/testing/selftests/mqueue_sysctl/config b/tools/testing/selftests/mqueue_sysctl/config
> new file mode 100644
> index 0000000..68b2794
> --- /dev/null
> +++ b/tools/testing/selftests/mqueue_sysctl/config
> @@ -0,0 +1,2 @@
> +CONFIG_USER_NS=y
> +CONFIG_POSIX_MQUEUE_SYSCTL=y
> \ No newline at end of file
> diff --git a/tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c b/tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c
> new file mode 100644
> index 0000000..48023d5
> --- /dev/null
> +++ b/tools/testing/selftests/mqueue_sysctl/mq_sysctl_test.c
> @@ -0,0 +1,172 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <sched.h>
> +#include <sys/mount.h>
> +#include <mqueue.h>
> +#include <sys/wait.h>
> +#include <string.h>
> +
> +#include "../kselftest_harness.h"
> +
> +static ssize_t write_nointr(int fd, const void *buf, size_t count)
> +{
> + ssize_t ret;
> +
> + do {
> + ret = write(fd, buf, count);
> + } while (ret < 0 && errno == EINTR);
> +
> + return ret;
> +}
> +
> +static int write_file(const char *path, const void *buf, size_t count)
> +{
> + int fd;
> + ssize_t ret;
> +
> + fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW);
> + if (fd < 0)
> + return -1;
> +
> + ret = write_nointr(fd, buf, count);
> + close(fd);
> + if (ret < 0 || (size_t)ret != count)
> + return -1;
> +
> + return 0;
> +}
> +
> +static int create_and_enter_userns(void)
> +{
> + uid_t uid;
> + gid_t gid;
> + char map[100];
> +
> + uid = getuid();
> + gid = getgid();
> +
> + if (unshare(CLONE_NEWUSER))
> + return -1;
> +
> + if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) &&
> + errno != ENOENT)
> + return -1;
> +
> + snprintf(map, sizeof(map), "0 %d 1", uid);
> + if (write_file("/proc/self/uid_map", map, strlen(map)))
> + return -1;
> +
> + snprintf(map, sizeof(map), "0 %d 1", gid);
> + if (write_file("/proc/self/gid_map", map, strlen(map)))
> + return -1;
> +
> + if (setgid(0))
> + return -1;
> +
> + if (setuid(0))
> + return -1;
> +
> + return 0;
> +}
> +
> +static int prepare_unpriv_mountns(void)
> +{
> + if (create_and_enter_userns())
> + return -1;
> +
> + if (unshare(CLONE_NEWNS))
> + return -1;
> +
> + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
> + return -1;
> +
> + if (unshare(CLONE_NEWIPC))
> + return -1;
> +
> + return 0;
> +}
> +
> +static int wait_for_pid(pid_t pid)
> +{
> + int status, ret;
> +
> +again:
> + ret = waitpid(pid, &status, 0);
> + if (ret == -1) {
> + if (errno == EINTR)
> + goto again;
> +
> + return -1;
> + }
> +
> + if (!WIFEXITED(status))
> + return -1;
> +
> + return WEXITSTATUS(status);
> +}
> +
> +int get_mq_queues_max(void)
can be made static
> +{
> + int fd;
> + char buf[16];
> + int val = -1;
> +
> + fd = open("/proc/sys/fs/mqueue/queues_max", O_RDONLY);
> + if (fd >= 0) {
> + if (read(fd, buf, sizeof(buf)) > 0)
> + val = atoi(buf);
> +
> + close(fd);
> + return val;
> + }
> + return val;
> +}
> +
> +TEST(mqueue_sysctl)
> +{
> + pid_t pid;
> + int qmax1, qmax2;
> +
> + chdir(getenv("HOME"));
Hm, don't do that. I would suggest using a temporary directory. Sm like:
char template[] = P_tmpdir "/mqueue_sysctl_XXXXXX";
if (!mkdtemp(template))
// handle error
int fd = openat(-1, template, O_CLOEXEC | O_DIRECTORY);
mkdirat(fd, "mqueue", ...);
and then at exit:
unlinkat(fd, "mqueue", AT_REMOVEDIR);
remove(template);
etc.
Christian
> + /* read and stash the original sysctl value */
> + qmax1 = get_mq_queues_max();
> + ASSERT_GE(qmax1, 0);
> +
> + pid = fork();
> + ASSERT_GE(pid, 0);
> +
> + if (pid == 0) {
> + ASSERT_EQ(prepare_unpriv_mountns(), 0);
> +
> + if (mkdir("./mqueue", 755) && errno != EEXIST)
> + SKIP(return, "mkdir /dev/mqueue failed");
> +
> + ASSERT_EQ(mount("none", "./mqueue", "mqueue", MS_NOATIME, NULL), 0);
> +
> + /* modify the sysctl value in new ipc namesapce */
> + ASSERT_EQ(write_file("/proc/sys/fs/mqueue/queues_max", "1", 1), 0);
> +
> + ASSERT_GE(mq_open("/new_ns1", O_RDWR | O_CREAT, 0644, NULL), 0);
> +
> + /* mq_open() should fail as exceeding of queues_max */
> + ASSERT_EQ(mq_open("/new_ns2", O_RDWR | O_CREAT, 0644, NULL), -1);
> +
> + ASSERT_EQ(mq_unlink("/new_ns1"), 0);
> + ASSERT_EQ(umount("./mqueue"), 0);
> +
> + exit(0);
> + }
> +
> + ASSERT_EQ(wait_for_pid(pid), 0);
> +
> + qmax2 = get_mq_queues_max();
> + ASSERT_EQ(qmax1, qmax2);
> +}
> +
> +TEST_HARNESS_MAIN
> --
> 2.15.2
>
From: Ran Xiaokai <[email protected]>
when a ipc namespace is created in a user namespace, the mqueue sysctl
files should be writalbe to the owner of the user namespace. Even the
owner is not a global privileged user.
v2
- add .gitignore change
- move this test codes to the existing mqueue directory
- rename test file name
- use mkdtemp() creating mountpoint
v1
- initial patch
Signed-off-by: Ran Xiaokai <[email protected]>
---
tools/testing/selftests/mqueue/.gitignore | 1 +
tools/testing/selftests/mqueue/Makefile | 2 +-
tools/testing/selftests/mqueue/mq_sysctl_tests.c | 175 +++++++++++++++++++++++
3 files changed, 177 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/mqueue/mq_sysctl_tests.c
diff --git a/tools/testing/selftests/mqueue/.gitignore b/tools/testing/selftests/mqueue/.gitignore
index 72ad8ca..226fe58 100644
--- a/tools/testing/selftests/mqueue/.gitignore
+++ b/tools/testing/selftests/mqueue/.gitignore
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
mq_open_tests
mq_perf_tests
+mq_sysctl_tests
diff --git a/tools/testing/selftests/mqueue/Makefile b/tools/testing/selftests/mqueue/Makefile
index 8a58055..31becad 100644
--- a/tools/testing/selftests/mqueue/Makefile
+++ b/tools/testing/selftests/mqueue/Makefile
@@ -2,6 +2,6 @@
CFLAGS += -O2
LDLIBS = -lrt -lpthread -lpopt
-TEST_GEN_PROGS := mq_open_tests mq_perf_tests
+TEST_GEN_PROGS := mq_open_tests mq_perf_tests mq_sysctl_tests
include ../lib.mk
diff --git a/tools/testing/selftests/mqueue/mq_sysctl_tests.c b/tools/testing/selftests/mqueue/mq_sysctl_tests.c
new file mode 100644
index 0000000..931a915
--- /dev/null
+++ b/tools/testing/selftests/mqueue/mq_sysctl_tests.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sched.h>
+#include <sys/mount.h>
+#include <mqueue.h>
+#include <sys/wait.h>
+#include <string.h>
+
+#include "../kselftest_harness.h"
+
+static ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+ ssize_t ret;
+
+ do {
+ ret = write(fd, buf, count);
+ } while (ret < 0 && errno == EINTR);
+
+ return ret;
+}
+
+static int write_file(const char *path, const void *buf, size_t count)
+{
+ int fd;
+ ssize_t ret;
+
+ fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW);
+ if (fd < 0)
+ return -1;
+
+ ret = write_nointr(fd, buf, count);
+ close(fd);
+ if (ret < 0 || (size_t)ret != count)
+ return -1;
+
+ return 0;
+}
+
+static int create_and_enter_userns(void)
+{
+ uid_t uid;
+ gid_t gid;
+ char map[100];
+
+ uid = getuid();
+ gid = getgid();
+
+ if (unshare(CLONE_NEWUSER))
+ return -1;
+
+ if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) &&
+ errno != ENOENT)
+ return -1;
+
+ snprintf(map, sizeof(map), "0 %d 1", uid);
+ if (write_file("/proc/self/uid_map", map, strlen(map)))
+ return -1;
+
+ snprintf(map, sizeof(map), "0 %d 1", gid);
+ if (write_file("/proc/self/gid_map", map, strlen(map)))
+ return -1;
+
+ if (setgid(0))
+ return -1;
+
+ if (setuid(0))
+ return -1;
+
+ return 0;
+}
+
+static int prepare_unpriv_mountns(void)
+{
+ if (create_and_enter_userns())
+ return -1;
+
+ if (unshare(CLONE_NEWNS))
+ return -1;
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+ return -1;
+
+ if (unshare(CLONE_NEWIPC))
+ return -1;
+
+ return 0;
+}
+
+static int wait_for_pid(pid_t pid)
+{
+ int status, ret;
+
+again:
+ ret = waitpid(pid, &status, 0);
+ if (ret == -1) {
+ if (errno == EINTR)
+ goto again;
+
+ return -1;
+ }
+
+ if (!WIFEXITED(status))
+ return -1;
+
+ return WEXITSTATUS(status);
+}
+
+static int get_mq_queues_max(void)
+{
+ int fd;
+ char buf[16];
+ int val = -1;
+
+ fd = open("/proc/sys/fs/mqueue/queues_max", O_RDONLY);
+ if (fd >= 0) {
+ if (read(fd, buf, sizeof(buf)) > 0)
+ val = atoi(buf);
+
+ close(fd);
+ return val;
+ }
+ return val;
+}
+
+TEST(mqueue_sysctl)
+{
+ pid_t pid;
+ int qmax1, qmax2;
+ int dirfd;
+ char tmpdir[] = "/mqueue_sysctl_XXXXXX";
+
+ if (!mkdtemp(tmpdir))
+ SKIP(return, "create temp dir failed");
+
+ /* read and stash the original sysctl value */
+ qmax1 = get_mq_queues_max();
+ ASSERT_GE(qmax1, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ ASSERT_EQ(prepare_unpriv_mountns(), 0);
+
+ ASSERT_EQ(mount("none", tmpdir, "mqueue", MS_NOATIME, NULL), 0);
+
+ /* modify the sysctl value in new ipc namesapce */
+ ASSERT_EQ(write_file("/proc/sys/fs/mqueue/queues_max", "1", 1), 0);
+
+ ASSERT_GE(mq_open("/new_ns1", O_RDWR | O_CREAT, 0644, NULL), 0);
+
+ /* mq_open() should fail as exceeding of queues_max */
+ ASSERT_EQ(mq_open("/new_ns2", O_RDWR | O_CREAT, 0644, NULL), -1);
+
+ ASSERT_EQ(mq_unlink("/new_ns1"), 0);
+ ASSERT_EQ(umount(tmpdir), 0);
+
+ exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+
+ qmax2 = get_mq_queues_max();
+ ASSERT_EQ(qmax1, qmax2);
+
+ remove(tmpdir);
+}
+
+TEST_HARNESS_MAIN
--
2.15.2
From: Ran Xiaokai <[email protected]>
When a non-root user process creates a user namespace and ipc namespace
with command "unshare -Ur -i", and map the root user inside
the user namesapce to the global owner of user namespace.
The newly created user namespace OWNS the ipc namespace,
So the root user inside the user namespace should have full access
rights to the ipc namespace resources and should be writable to
the ipc mqueue sysctls.
v2:
- update commit msg.
- fix the coding style issue.
Signed-off-by: Ran Xiaokai <[email protected]>
---
include/linux/ipc_namespace.h | 14 +++++
ipc/mq_sysctl.c | 118 ++++++++++++++++++++++++++++++++++++------
ipc/mqueue.c | 10 +++-
ipc/namespace.c | 2 +
4 files changed, 126 insertions(+), 18 deletions(-)
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 05e2277..3e8e340 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -10,6 +10,7 @@
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
+#include <linux/sysctl.h>
struct user_namespace;
@@ -67,6 +68,11 @@ struct ipc_namespace {
struct user_namespace *user_ns;
struct ucounts *ucounts;
+#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
+ struct ctl_table_set mq_set;
+ struct ctl_table_header *sysctls;
+#endif
+
struct llist_node mnt_llist;
struct ns_common ns;
@@ -155,7 +161,10 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
struct ctl_table_header;
+extern struct ctl_table_header *mq_sysctl_table;
extern struct ctl_table_header *mq_register_sysctl_table(void);
+bool setup_mq_sysctls(struct ipc_namespace *ns);
+void retire_mq_sysctls(struct ipc_namespace *ns);
#else /* CONFIG_POSIX_MQUEUE_SYSCTL */
@@ -163,6 +172,11 @@ static inline struct ctl_table_header *mq_register_sysctl_table(void)
{
return NULL;
}
+static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
+{
+ return true;
+}
+static inline void retire_mq_sysctls(struct ipc_namespace *ns) { }
#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
#endif
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 72a92a0..8d6b8ff 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -8,6 +8,11 @@
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/capability.h>
+#include <linux/cred.h>
+#include <linux/stat.h>
#ifdef CONFIG_PROC_SYSCTL
static void *get_mq(struct ctl_table *table)
@@ -96,25 +101,106 @@ static struct ctl_table mq_sysctls[] = {
{}
};
-static struct ctl_table mq_sysctl_dir[] = {
- {
- .procname = "mqueue",
- .mode = 0555,
- .child = mq_sysctls,
- },
- {}
-};
+static int set_is_seen(struct ctl_table_set *set)
+{
+ return ¤t->nsproxy->ipc_ns->mq_set == set;
+}
-static struct ctl_table mq_sysctl_root[] = {
- {
- .procname = "fs",
- .mode = 0555,
- .child = mq_sysctl_dir,
- },
- {}
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+ return ¤t->nsproxy->ipc_ns->mq_set;
+}
+
+static int set_permissions(struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ struct ipc_namespace *ipc_ns =
+ container_of(head->set, struct ipc_namespace, mq_set);
+ struct user_namespace *user_ns = ipc_ns->user_ns;
+ int mode;
+
+ /* Allow users with CAP_SYS_RESOURCE unrestrained access */
+ if (ns_capable(user_ns, CAP_SYS_RESOURCE))
+ mode = (table->mode & S_IRWXU) >> 6;
+ else {
+ /* Allow all others at most read-only access */
+ mode = table->mode & S_IROTH;
+ }
+
+ return (mode << 6) | (mode << 3) | mode;
+}
+
+static void set_ownership(struct ctl_table_header *head,
+ struct ctl_table *table,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct ipc_namespace *ipc_ns =
+ container_of(head->set, struct ipc_namespace, mq_set);
+ struct user_namespace *user_ns = ipc_ns->user_ns;
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ns_root_uid = make_kuid(user_ns, 0);
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ ns_root_gid = make_kgid(user_ns, 0);
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+}
+
+static struct ctl_table_root mq_sysctl_root = {
+ .lookup = set_lookup,
+ .permissions = set_permissions,
+ .set_ownership = set_ownership,
};
+bool setup_mq_sysctls(struct ipc_namespace *ns)
+{
+ struct ctl_table *tbl;
+
+ if (!mq_sysctl_table)
+ return false;
+
+ setup_sysctl_set(&ns->mq_set, &mq_sysctl_root, set_is_seen);
+ tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
+ if (!tbl)
+ goto out;
+
+ ns->sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
+ if (!ns->sysctls)
+ goto out1;
+
+ return true;
+
+out1:
+ kfree(tbl);
+ retire_sysctl_set(&ns->mq_set);
+out:
+ return false;
+}
+
+void retire_mq_sysctls(struct ipc_namespace *ns)
+{
+ struct ctl_table *tbl;
+
+ if (!ns->sysctls)
+ return;
+
+ tbl = ns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(ns->sysctls);
+ retire_sysctl_set(&ns->mq_set);
+ kfree(tbl);
+}
+
struct ctl_table_header *mq_register_sysctl_table(void)
{
- return register_sysctl_table(mq_sysctl_root);
+ static struct ctl_table empty[1];
+
+ /*
+ * Register the fs/mqueue directory in the default set so that
+ * registrations in the child sets work properly.
+ */
+ return register_sysctl("fs/mqueue", empty);
}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 4e4e611..3b68564 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -163,7 +163,7 @@ static void remove_notification(struct mqueue_inode_info *info);
static struct kmem_cache *mqueue_inode_cachep;
-static struct ctl_table_header *mq_sysctl_table;
+struct ctl_table_header *mq_sysctl_table;
static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
{
@@ -1713,6 +1713,10 @@ static int __init init_mqueue_fs(void)
/* ignore failures - they are not fatal */
mq_sysctl_table = mq_register_sysctl_table();
+ if (mq_sysctl_table && !setup_mq_sysctls(&init_ipc_ns)) {
+ unregister_sysctl_table(mq_sysctl_table);
+ mq_sysctl_table = NULL;
+ }
error = register_filesystem(&mqueue_fs_type);
if (error)
@@ -1729,8 +1733,10 @@ static int __init init_mqueue_fs(void)
out_filesystem:
unregister_filesystem(&mqueue_fs_type);
out_sysctl:
- if (mq_sysctl_table)
+ if (mq_sysctl_table) {
+ retire_mq_sysctls(&init_ipc_ns);
unregister_sysctl_table(mq_sysctl_table);
+ }
kmem_cache_destroy(mqueue_inode_cachep);
return error;
}
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 7bd0766..c891cc1 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -58,6 +58,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
err = mq_init_ns(ns);
if (err)
goto fail_put;
+ setup_mq_sysctls(ns);
sem_init_ns(ns);
msg_init_ns(ns);
@@ -121,6 +122,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
* uses synchronize_rcu().
*/
mq_put_mnt(ns);
+ retire_mq_sysctls(ns);
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
--
2.15.2
On Fri, Aug 27, 2021 at 03:12:06AM -0700, CGEL wrote:
> From: Ran Xiaokai <[email protected]>
>
> When a non-root user process creates a user namespace and ipc namespace
> with command "unshare -Ur -i", and map the root user inside
> the user namesapce to the global owner of user namespace.
> The newly created user namespace OWNS the ipc namespace,
> So the root user inside the user namespace should have full access
> rights to the ipc namespace resources and should be writable to
> the ipc mqueue sysctls.
>
> v2:
> - update commit msg.
> - fix the coding style issue.
> Signed-off-by: Ran Xiaokai <[email protected]>
> ---
David,
are you happy with this too? If so I'd pick this up.
> include/linux/ipc_namespace.h | 14 +++++
> ipc/mq_sysctl.c | 118 ++++++++++++++++++++++++++++++++++++------
> ipc/mqueue.c | 10 +++-
> ipc/namespace.c | 2 +
> 4 files changed, 126 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index 05e2277..3e8e340 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -10,6 +10,7 @@
> #include <linux/ns_common.h>
> #include <linux/refcount.h>
> #include <linux/rhashtable-types.h>
> +#include <linux/sysctl.h>
>
> struct user_namespace;
>
> @@ -67,6 +68,11 @@ struct ipc_namespace {
> struct user_namespace *user_ns;
> struct ucounts *ucounts;
>
> +#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
> + struct ctl_table_set mq_set;
> + struct ctl_table_header *sysctls;
> +#endif
> +
> struct llist_node mnt_llist;
>
> struct ns_common ns;
> @@ -155,7 +161,10 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
> #ifdef CONFIG_POSIX_MQUEUE_SYSCTL
>
> struct ctl_table_header;
> +extern struct ctl_table_header *mq_sysctl_table;
> extern struct ctl_table_header *mq_register_sysctl_table(void);
> +bool setup_mq_sysctls(struct ipc_namespace *ns);
> +void retire_mq_sysctls(struct ipc_namespace *ns);
>
> #else /* CONFIG_POSIX_MQUEUE_SYSCTL */
>
> @@ -163,6 +172,11 @@ static inline struct ctl_table_header *mq_register_sysctl_table(void)
> {
> return NULL;
> }
> +static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
> +{
> + return true;
> +}
> +static inline void retire_mq_sysctls(struct ipc_namespace *ns) { }
>
> #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
> #endif
> diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
> index 72a92a0..8d6b8ff 100644
> --- a/ipc/mq_sysctl.c
> +++ b/ipc/mq_sysctl.c
> @@ -8,6 +8,11 @@
> #include <linux/nsproxy.h>
> #include <linux/ipc_namespace.h>
> #include <linux/sysctl.h>
> +#include <linux/slab.h>
> +#include <linux/user_namespace.h>
> +#include <linux/capability.h>
> +#include <linux/cred.h>
> +#include <linux/stat.h>
>
> #ifdef CONFIG_PROC_SYSCTL
> static void *get_mq(struct ctl_table *table)
> @@ -96,25 +101,106 @@ static struct ctl_table mq_sysctls[] = {
> {}
> };
>
> -static struct ctl_table mq_sysctl_dir[] = {
> - {
> - .procname = "mqueue",
> - .mode = 0555,
> - .child = mq_sysctls,
> - },
> - {}
> -};
> +static int set_is_seen(struct ctl_table_set *set)
> +{
> + return ¤t->nsproxy->ipc_ns->mq_set == set;
> +}
>
> -static struct ctl_table mq_sysctl_root[] = {
> - {
> - .procname = "fs",
> - .mode = 0555,
> - .child = mq_sysctl_dir,
> - },
> - {}
> +static struct ctl_table_set *
> +set_lookup(struct ctl_table_root *root)
> +{
> + return ¤t->nsproxy->ipc_ns->mq_set;
> +}
> +
> +static int set_permissions(struct ctl_table_header *head,
> + struct ctl_table *table)
> +{
> + struct ipc_namespace *ipc_ns =
> + container_of(head->set, struct ipc_namespace, mq_set);
> + struct user_namespace *user_ns = ipc_ns->user_ns;
> + int mode;
> +
> + /* Allow users with CAP_SYS_RESOURCE unrestrained access */
> + if (ns_capable(user_ns, CAP_SYS_RESOURCE))
> + mode = (table->mode & S_IRWXU) >> 6;
> + else {
> + /* Allow all others at most read-only access */
> + mode = table->mode & S_IROTH;
> + }
> +
> + return (mode << 6) | (mode << 3) | mode;
> +}
> +
> +static void set_ownership(struct ctl_table_header *head,
> + struct ctl_table *table,
> + kuid_t *uid, kgid_t *gid)
> +{
> + struct ipc_namespace *ipc_ns =
> + container_of(head->set, struct ipc_namespace, mq_set);
> + struct user_namespace *user_ns = ipc_ns->user_ns;
> + kuid_t ns_root_uid;
> + kgid_t ns_root_gid;
> +
> + ns_root_uid = make_kuid(user_ns, 0);
> + if (uid_valid(ns_root_uid))
> + *uid = ns_root_uid;
> +
> + ns_root_gid = make_kgid(user_ns, 0);
> + if (gid_valid(ns_root_gid))
> + *gid = ns_root_gid;
> +}
> +
> +static struct ctl_table_root mq_sysctl_root = {
> + .lookup = set_lookup,
> + .permissions = set_permissions,
> + .set_ownership = set_ownership,
> };
>
> +bool setup_mq_sysctls(struct ipc_namespace *ns)
> +{
> + struct ctl_table *tbl;
> +
> + if (!mq_sysctl_table)
> + return false;
> +
> + setup_sysctl_set(&ns->mq_set, &mq_sysctl_root, set_is_seen);
> + tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
> + if (!tbl)
> + goto out;
> +
> + ns->sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
> + if (!ns->sysctls)
> + goto out1;
> +
> + return true;
> +
> +out1:
> + kfree(tbl);
> + retire_sysctl_set(&ns->mq_set);
> +out:
> + return false;
> +}
> +
> +void retire_mq_sysctls(struct ipc_namespace *ns)
> +{
> + struct ctl_table *tbl;
> +
> + if (!ns->sysctls)
> + return;
> +
> + tbl = ns->sysctls->ctl_table_arg;
> + unregister_sysctl_table(ns->sysctls);
> + retire_sysctl_set(&ns->mq_set);
> + kfree(tbl);
> +}
> +
> struct ctl_table_header *mq_register_sysctl_table(void)
> {
> - return register_sysctl_table(mq_sysctl_root);
> + static struct ctl_table empty[1];
> +
> + /*
> + * Register the fs/mqueue directory in the default set so that
> + * registrations in the child sets work properly.
> + */
> + return register_sysctl("fs/mqueue", empty);
> }
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index 4e4e611..3b68564 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -163,7 +163,7 @@ static void remove_notification(struct mqueue_inode_info *info);
>
> static struct kmem_cache *mqueue_inode_cachep;
>
> -static struct ctl_table_header *mq_sysctl_table;
> +struct ctl_table_header *mq_sysctl_table;
>
> static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
> {
> @@ -1713,6 +1713,10 @@ static int __init init_mqueue_fs(void)
>
> /* ignore failures - they are not fatal */
> mq_sysctl_table = mq_register_sysctl_table();
> + if (mq_sysctl_table && !setup_mq_sysctls(&init_ipc_ns)) {
> + unregister_sysctl_table(mq_sysctl_table);
> + mq_sysctl_table = NULL;
> + }
>
> error = register_filesystem(&mqueue_fs_type);
> if (error)
> @@ -1729,8 +1733,10 @@ static int __init init_mqueue_fs(void)
> out_filesystem:
> unregister_filesystem(&mqueue_fs_type);
> out_sysctl:
> - if (mq_sysctl_table)
> + if (mq_sysctl_table) {
> + retire_mq_sysctls(&init_ipc_ns);
> unregister_sysctl_table(mq_sysctl_table);
> + }
> kmem_cache_destroy(mqueue_inode_cachep);
> return error;
> }
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index 7bd0766..c891cc1 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -58,6 +58,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
> err = mq_init_ns(ns);
> if (err)
> goto fail_put;
> + setup_mq_sysctls(ns);
>
> sem_init_ns(ns);
> msg_init_ns(ns);
> @@ -121,6 +122,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
> * uses synchronize_rcu().
> */
> mq_put_mnt(ns);
> + retire_mq_sysctls(ns);
> sem_exit_ns(ns);
> msg_exit_ns(ns);
> shm_exit_ns(ns);
> --
> 2.15.2
>
On 2021-09-13 07:40, Christian Brauner wrote:
> On Fri, Aug 27, 2021 at 03:12:06AM -0700, CGEL wrote:
>> From: Ran Xiaokai <[email protected]>
>>
>> When a non-root user process creates a user namespace and ipc
>> namespace
>> with command "unshare -Ur -i", and map the root user inside
>> the user namesapce to the global owner of user namespace.
>> The newly created user namespace OWNS the ipc namespace,
>> So the root user inside the user namespace should have full access
>> rights to the ipc namespace resources and should be writable to
>> the ipc mqueue sysctls.
>>
>> v2:
>> - update commit msg.
>> - fix the coding style issue.
>> Signed-off-by: Ran Xiaokai <[email protected]>
>> ---
>
> David,
>
> are you happy with this too? If so I'd pick this up.
LGTM:
Acked-by: Davidlohr Bueso <[email protected]>
esOn Mon, Sep 13, 2021 at 04:40:47PM +0200, Christian Brauner wrote:
> On Fri, Aug 27, 2021 at 03:12:06AM -0700, CGEL wrote:
> > From: Ran Xiaokai <[email protected]>
> >
> > When a non-root user process creates a user namespace and ipc namespace
> > with command "unshare -Ur -i", and map the root user inside
> > the user namesapce to the global owner of user namespace.
> > The newly created user namespace OWNS the ipc namespace,
> > So the root user inside the user namespace should have full access
> > rights to the ipc namespace resources and should be writable to
> > the ipc mqueue sysctls.
> >
> > v2:
> > - update commit msg.
> > - fix the coding style issue.
> > Signed-off-by: Ran Xiaokai <[email protected]>
> > ---
>
> David,
>
> are you happy with this too? If so I'd pick this up.
>
Hi Christian,
Is there a xx-next branch for this kind patch?
We will try to fixes other issues like this, so we could tag the follow-up
patches with the branch name.
> > include/linux/ipc_namespace.h | 14 +++++
> > ipc/mq_sysctl.c | 118 ++++++++++++++++++++++++++++++++++++------
> > ipc/mqueue.c | 10 +++-
> > ipc/namespace.c | 2 +
> > 4 files changed, 126 insertions(+), 18 deletions(-)
> >
> > diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> > index 05e2277..3e8e340 100644
> > --- a/include/linux/ipc_namespace.h
> > +++ b/include/linux/ipc_namespace.h
> > @@ -10,6 +10,7 @@
> > #include <linux/ns_common.h>
> > #include <linux/refcount.h>
> > #include <linux/rhashtable-types.h>
> > +#include <linux/sysctl.h>
> >
> > struct user_namespace;
> >
> > @@ -67,6 +68,11 @@ struct ipc_namespace {
> > struct user_namespace *user_ns;
> > struct ucounts *ucounts;
> >
> > +#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
> > + struct ctl_table_set mq_set;
> > + struct ctl_table_header *sysctls;
> > +#endif
> > +
> > struct llist_node mnt_llist;
> >
> > struct ns_common ns;
> > @@ -155,7 +161,10 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
> > #ifdef CONFIG_POSIX_MQUEUE_SYSCTL
> >
> > struct ctl_table_header;
> > +extern struct ctl_table_header *mq_sysctl_table;
> > extern struct ctl_table_header *mq_register_sysctl_table(void);
> > +bool setup_mq_sysctls(struct ipc_namespace *ns);
> > +void retire_mq_sysctls(struct ipc_namespace *ns);
> >
> > #else /* CONFIG_POSIX_MQUEUE_SYSCTL */
> >
> > @@ -163,6 +172,11 @@ static inline struct ctl_table_header *mq_register_sysctl_table(void)
> > {
> > return NULL;
> > }
> > +static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
> > +{
> > + return true;
> > +}
> > +static inline void retire_mq_sysctls(struct ipc_namespace *ns) { }
> >
> > #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
> > #endif
> > diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
> > index 72a92a0..8d6b8ff 100644
> > --- a/ipc/mq_sysctl.c
> > +++ b/ipc/mq_sysctl.c
> > @@ -8,6 +8,11 @@
> > #include <linux/nsproxy.h>
> > #include <linux/ipc_namespace.h>
> > #include <linux/sysctl.h>
> > +#include <linux/slab.h>
> > +#include <linux/user_namespace.h>
> > +#include <linux/capability.h>
> > +#include <linux/cred.h>
> > +#include <linux/stat.h>
> >
> > #ifdef CONFIG_PROC_SYSCTL
> > static void *get_mq(struct ctl_table *table)
> > @@ -96,25 +101,106 @@ static struct ctl_table mq_sysctls[] = {
> > {}
> > };
> >
> > -static struct ctl_table mq_sysctl_dir[] = {
> > - {
> > - .procname = "mqueue",
> > - .mode = 0555,
> > - .child = mq_sysctls,
> > - },
> > - {}
> > -};
> > +static int set_is_seen(struct ctl_table_set *set)
> > +{
> > + return ¤t->nsproxy->ipc_ns->mq_set == set;
> > +}
> >
> > -static struct ctl_table mq_sysctl_root[] = {
> > - {
> > - .procname = "fs",
> > - .mode = 0555,
> > - .child = mq_sysctl_dir,
> > - },
> > - {}
> > +static struct ctl_table_set *
> > +set_lookup(struct ctl_table_root *root)
> > +{
> > + return ¤t->nsproxy->ipc_ns->mq_set;
> > +}
> > +
> > +static int set_permissions(struct ctl_table_header *head,
> > + struct ctl_table *table)
> > +{
> > + struct ipc_namespace *ipc_ns =
> > + container_of(head->set, struct ipc_namespace, mq_set);
> > + struct user_namespace *user_ns = ipc_ns->user_ns;
> > + int mode;
> > +
> > + /* Allow users with CAP_SYS_RESOURCE unrestrained access */
> > + if (ns_capable(user_ns, CAP_SYS_RESOURCE))
> > + mode = (table->mode & S_IRWXU) >> 6;
> > + else {
> > + /* Allow all others at most read-only access */
> > + mode = table->mode & S_IROTH;
> > + }
> > +
> > + return (mode << 6) | (mode << 3) | mode;
> > +}
> > +
> > +static void set_ownership(struct ctl_table_header *head,
> > + struct ctl_table *table,
> > + kuid_t *uid, kgid_t *gid)
> > +{
> > + struct ipc_namespace *ipc_ns =
> > + container_of(head->set, struct ipc_namespace, mq_set);
> > + struct user_namespace *user_ns = ipc_ns->user_ns;
> > + kuid_t ns_root_uid;
> > + kgid_t ns_root_gid;
> > +
> > + ns_root_uid = make_kuid(user_ns, 0);
> > + if (uid_valid(ns_root_uid))
> > + *uid = ns_root_uid;
> > +
> > + ns_root_gid = make_kgid(user_ns, 0);
> > + if (gid_valid(ns_root_gid))
> > + *gid = ns_root_gid;
> > +}
> > +
> > +static struct ctl_table_root mq_sysctl_root = {
> > + .lookup = set_lookup,
> > + .permissions = set_permissions,
> > + .set_ownership = set_ownership,
> > };
> >
> > +bool setup_mq_sysctls(struct ipc_namespace *ns)
> > +{
> > + struct ctl_table *tbl;
> > +
> > + if (!mq_sysctl_table)
> > + return false;
> > +
> > + setup_sysctl_set(&ns->mq_set, &mq_sysctl_root, set_is_seen);
> > + tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
> > + if (!tbl)
> > + goto out;
> > +
> > + ns->sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
> > + if (!ns->sysctls)
> > + goto out1;
> > +
> > + return true;
> > +
> > +out1:
> > + kfree(tbl);
> > + retire_sysctl_set(&ns->mq_set);
> > +out:
> > + return false;
> > +}
> > +
> > +void retire_mq_sysctls(struct ipc_namespace *ns)
> > +{
> > + struct ctl_table *tbl;
> > +
> > + if (!ns->sysctls)
> > + return;
> > +
> > + tbl = ns->sysctls->ctl_table_arg;
> > + unregister_sysctl_table(ns->sysctls);
> > + retire_sysctl_set(&ns->mq_set);
> > + kfree(tbl);
> > +}
> > +
> > struct ctl_table_header *mq_register_sysctl_table(void)
> > {
> > - return register_sysctl_table(mq_sysctl_root);
> > + static struct ctl_table empty[1];
> > +
> > + /*
> > + * Register the fs/mqueue directory in the default set so that
> > + * registrations in the child sets work properly.
> > + */
> > + return register_sysctl("fs/mqueue", empty);
> > }
> > diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> > index 4e4e611..3b68564 100644
> > --- a/ipc/mqueue.c
> > +++ b/ipc/mqueue.c
> > @@ -163,7 +163,7 @@ static void remove_notification(struct mqueue_inode_info *info);
> >
> > static struct kmem_cache *mqueue_inode_cachep;
> >
> > -static struct ctl_table_header *mq_sysctl_table;
> > +struct ctl_table_header *mq_sysctl_table;
> >
> > static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
> > {
> > @@ -1713,6 +1713,10 @@ static int __init init_mqueue_fs(void)
> >
> > /* ignore failures - they are not fatal */
> > mq_sysctl_table = mq_register_sysctl_table();
> > + if (mq_sysctl_table && !setup_mq_sysctls(&init_ipc_ns)) {
> > + unregister_sysctl_table(mq_sysctl_table);
> > + mq_sysctl_table = NULL;
> > + }
> >
> > error = register_filesystem(&mqueue_fs_type);
> > if (error)
> > @@ -1729,8 +1733,10 @@ static int __init init_mqueue_fs(void)
> > out_filesystem:
> > unregister_filesystem(&mqueue_fs_type);
> > out_sysctl:
> > - if (mq_sysctl_table)
> > + if (mq_sysctl_table) {
> > + retire_mq_sysctls(&init_ipc_ns);
> > unregister_sysctl_table(mq_sysctl_table);
> > + }
> > kmem_cache_destroy(mqueue_inode_cachep);
> > return error;
> > }
> > diff --git a/ipc/namespace.c b/ipc/namespace.c
> > index 7bd0766..c891cc1 100644
> > --- a/ipc/namespace.c
> > +++ b/ipc/namespace.c
> > @@ -58,6 +58,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
> > err = mq_init_ns(ns);
> > if (err)
> > goto fail_put;
> > + setup_mq_sysctls(ns);
> >
> > sem_init_ns(ns);
> > msg_init_ns(ns);
> > @@ -121,6 +122,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
> > * uses synchronize_rcu().
> > */
> > mq_put_mnt(ns);
> > + retire_mq_sysctls(ns);
> > sem_exit_ns(ns);
> > msg_exit_ns(ns);
> > shm_exit_ns(ns);
> > --
> > 2.15.2
> >
On Thu, Sep 16, 2021 at 01:49:31AM +0000, CGEL wrote:
> esOn Mon, Sep 13, 2021 at 04:40:47PM +0200, Christian Brauner wrote:
> > On Fri, Aug 27, 2021 at 03:12:06AM -0700, CGEL wrote:
> > > From: Ran Xiaokai <[email protected]>
> > >
> > > When a non-root user process creates a user namespace and ipc namespace
> > > with command "unshare -Ur -i", and map the root user inside
> > > the user namesapce to the global owner of user namespace.
> > > The newly created user namespace OWNS the ipc namespace,
> > > So the root user inside the user namespace should have full access
> > > rights to the ipc namespace resources and should be writable to
> > > the ipc mqueue sysctls.
> > >
> > > v2:
> > > - update commit msg.
> > > - fix the coding style issue.
> > > Signed-off-by: Ran Xiaokai <[email protected]>
> > > ---
> >
> > David,
> >
> > are you happy with this too? If so I'd pick this up.
> >
>
> Hi Christian,
>
> Is there a xx-next branch for this kind patch?
> We will try to fixes other issues like this, so we could tag the follow-up
> patches with the branch name.
Hm, sorry that message slipped through the pre-mid-and post-conference
cracks. I'll added the patches now for testing. See:
https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=kernel.fixes
Christian
On Mon, Oct 04, 2021 at 12:53:13PM +0200, Christian Brauner wrote:
> On Thu, Sep 16, 2021 at 01:49:31AM +0000, CGEL wrote:
> > esOn Mon, Sep 13, 2021 at 04:40:47PM +0200, Christian Brauner wrote:
> > > On Fri, Aug 27, 2021 at 03:12:06AM -0700, CGEL wrote:
> > > > From: Ran Xiaokai <[email protected]>
> > > >
> > > > When a non-root user process creates a user namespace and ipc namespace
> > > > with command "unshare -Ur -i", and map the root user inside
> > > > the user namesapce to the global owner of user namespace.
> > > > The newly created user namespace OWNS the ipc namespace,
> > > > So the root user inside the user namespace should have full access
> > > > rights to the ipc namespace resources and should be writable to
> > > > the ipc mqueue sysctls.
> > > >
> > > > v2:
> > > > - update commit msg.
> > > > - fix the coding style issue.
> > > > Signed-off-by: Ran Xiaokai <[email protected]>
> > > > ---
> > >
> > > David,
> > >
> > > are you happy with this too? If so I'd pick this up.
> > >
> >
> > Hi Christian,
> >
> > Is there a xx-next branch for this kind patch?
> > We will try to fixes other issues like this, so we could tag the follow-up
> > patches with the branch name.
>
> Hm, sorry that message slipped through the pre-mid-and post-conference
> cracks. I'll added the patches now for testing. See:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=kernel.fixes
>
> Christian
Hi Christian,
How the the testing goes on?
On Wed, Dec 01, 2021 at 07:14:50AM +0000, CGEL wrote:
> On Mon, Oct 04, 2021 at 12:53:13PM +0200, Christian Brauner wrote:
> > On Thu, Sep 16, 2021 at 01:49:31AM +0000, CGEL wrote:
> > > esOn Mon, Sep 13, 2021 at 04:40:47PM +0200, Christian Brauner wrote:
> > > > On Fri, Aug 27, 2021 at 03:12:06AM -0700, CGEL wrote:
> > > > > From: Ran Xiaokai <[email protected]>
> > > > >
> > > > > When a non-root user process creates a user namespace and ipc namespace
> > > > > with command "unshare -Ur -i", and map the root user inside
> > > > > the user namesapce to the global owner of user namespace.
> > > > > The newly created user namespace OWNS the ipc namespace,
> > > > > So the root user inside the user namespace should have full access
> > > > > rights to the ipc namespace resources and should be writable to
> > > > > the ipc mqueue sysctls.
> > > > >
> > > > > v2:
> > > > > - update commit msg.
> > > > > - fix the coding style issue.
> > > > > Signed-off-by: Ran Xiaokai <[email protected]>
> > > > > ---
> > > >
> > > > David,
> > > >
> > > > are you happy with this too? If so I'd pick this up.
> > > >
> > >
> > > Hi Christian,
> > >
> > > Is there a xx-next branch for this kind patch?
> > > We will try to fixes other issues like this, so we could tag the follow-up
> > > patches with the branch name.
> >
> > Hm, sorry that message slipped through the pre-mid-and post-conference
> > cracks. I'll added the patches now for testing. See:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=kernel.fixes
> >
> > Christian
>
> Hi Christian,
>
> How the the testing goes on?
I'll plan to send this for the next merge window.
Thanks!
Christian
Hi, Christian.
This is a polite question:
how's this patch? It seems that it hasn't been merged into linux-next so far