Preface
-------
These patches are for binding the rlimits to a user in the user namespace.
This patch set can be applied on top of:
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v5.8-2-g43e210d68200
Problem
-------
Some rlimits are set per user: RLIMIT_NPROC, RLIMIT_MEMLOCK, RLIMIT_SIGPENDING,
RLIMIT_MSGQUEUE. When several containers are created by the same user, the
processes inside those containers share these counters and influence each other.
Eric W. Biederman mentioned this issue [1][2][3].
Introduced changes
------------------
To fix this problem, the counters of the specified rlimits can be bound to the
user within a user namespace. By default, to preserve backward compatibility,
only the initial user namespace is used. This patch set adds a new prctl
option that changes the user namespace to which the counters are bound.
This will not cause the user to take more resources than allowed in the parent
user namespace because it only virtualizes the rlimit counter. Limits in all
parent user namespaces are taken into account.
For example, this allows us to run multiple containers as the same user and
set RLIMIT_NPROC to 1 inside each of them.
ToDo
----
* RLIMIT_MEMLOCK, RLIMIT_SIGPENDING and RLIMIT_MSGQUEUE are not implemented.
* No documentation.
* No tests.
[1] https://lore.kernel.org/containers/[email protected]/
[2] https://lists.linuxfoundation.org/pipermail/containers/2020-August/042096.html
[3] https://lists.linuxfoundation.org/pipermail/containers/2020-October/042524.html
Changelog
---------
v1:
* After discussion with Eric W. Biederman, I increased the size of ucounts to
atomic_long_t.
* Added ucount_max to avoid the fork bomb.
--
Alexey Gladkov (4):
Increase size of ucounts to atomic_long_t
Move the user's process counter to ucounts
Do not allow fork if RLIMIT_NPROC is exceeded in the user namespace tree
Allow to change the user namespace in which user rlimits are counted
fs/exec.c | 13 ++++++---
fs/io-wq.c | 25 +++++++++++++-----
fs/io-wq.h | 1 +
fs/io_uring.c | 1 +
include/linux/cred.h | 8 ++++++
include/linux/sched.h | 3 +++
include/linux/sched/user.h | 1 -
include/linux/user_namespace.h | 12 +++++++--
include/uapi/linux/prctl.h | 5 ++++
kernel/cred.c | 44 ++++++++++++++++++++++++-------
kernel/exit.c | 2 +-
kernel/fork.c | 13 ++++++---
kernel/sys.c | 26 ++++++++++++++++--
kernel/ucount.c | 48 +++++++++++++++++++++++++++++-----
kernel/user.c | 3 ++-
kernel/user_namespace.c | 3 +++
16 files changed, 171 insertions(+), 37 deletions(-)
--
2.25.4
Since RLIMIT_NPROC is counted per user namespace, the existing over-limit
check in the current user namespace is not sufficient. We must also check
whether this limit is exceeded in any of the parent user namespaces.
Signed-off-by: Alexey Gladkov <[email protected]>
---
fs/exec.c | 6 ++++++
fs/io-wq.c | 12 ++++++++----
include/linux/sched.h | 3 +++
kernel/cred.c | 17 ++++++++++-------
kernel/fork.c | 6 +++++-
5 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 3f2071f7b9c7..c45dfc716394 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1831,6 +1831,12 @@ static int __do_execve_file(int fd, struct filename *filename,
if (IS_ERR(filename))
return PTR_ERR(filename);
+ if (current->flags & PF_NPROC_UNS_EXCEEDED) {
+ current->flags &= ~PF_NPROC_UNS_EXCEEDED;
+ retval = -EAGAIN;
+ goto out_ret;
+ }
+
processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
/*
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 6170aee986db..c3b0843abc9b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -352,10 +352,11 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
} else {
+ if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
+ return;
worker->flags &= ~IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
- inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
}
io_wqe_inc_running(wqe, worker);
}
@@ -660,6 +661,12 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
return false;
}
+ if (index == IO_WQ_ACCT_UNBOUND &&
+ !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
+ kfree(worker);
+ return false;
+ }
+
spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list);
@@ -671,9 +678,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
acct->nr_workers++;
spin_unlock_irq(&wqe->lock);
- if (index == IO_WQ_ACCT_UNBOUND)
- inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC);
-
wake_up_process(worker->task);
return true;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 683372943093..c3cf034b4aa7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1506,6 +1506,9 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
+#define PF_NPROC_UNS_EXCEEDED 0x01000000 /* It means that we have reached the RLIMIT_NPROC
+ * in the current user namespace or in one of
+ * the parent's and we can't fork */
#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
diff --git a/kernel/cred.c b/kernel/cred.c
index b6694700e760..748704db1f6b 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -345,13 +345,14 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
#endif
clone_flags & CLONE_THREAD
) {
+ if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
+ return -EACCES;
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
alter_cred_subscribers(p->cred, 2);
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
- inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
return 0;
}
@@ -384,7 +385,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif
- inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC);
+ if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
+ return -EACCES;
p->cred = p->real_cred = get_cred(new);
alter_cred_subscribers(new, 2);
validate_creds(new);
@@ -480,13 +482,14 @@ int commit_creds(struct cred *new)
if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(new);
- /* do it
- * RLIMIT_NPROC limits on user->processes have already been checked
- * in set_user().
+ /*
+ * The RLIMIT_NPROC limits have already been checked in set_user(), but
+ * perhaps this limit is exceeded in the parent user namespace.
*/
alter_cred_subscribers(new, 2);
- if (new->user != old->user)
- inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC);
+ if (new->user != old->user &&
+ !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
+ task->flags |= PF_NPROC_UNS_EXCEEDED;
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2bc8bd45179f..d2b28634dc8f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1958,9 +1958,13 @@ static __latent_entropy struct task_struct *copy_process(
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ retval = -EAGAIN;
+ if (current->flags & PF_NPROC_UNS_EXCEEDED) {
+ current->flags &= ~PF_NPROC_UNS_EXCEEDED;
+ goto bad_fork_free;
+ }
processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
UCOUNT_RLIMIT_NPROC);
- retval = -EAGAIN;
if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
--
2.25.4
To count the number of user processes, use the counter bound to the user
in the user namespace.
Signed-off-by: Alexey Gladkov <[email protected]>
---
fs/exec.c | 7 ++++---
fs/io-wq.c | 14 +++++++++-----
include/linux/sched/user.h | 1 -
include/linux/user_namespace.h | 8 ++++++++
kernel/cred.c | 8 ++++----
kernel/exit.c | 2 +-
kernel/fork.c | 9 +++++----
kernel/sys.c | 6 ++++--
kernel/ucount.c | 34 ++++++++++++++++++++++++++++++++++
kernel/user.c | 3 ++-
10 files changed, 71 insertions(+), 21 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a70327..3f2071f7b9c7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1826,19 +1826,20 @@ static int __do_execve_file(int fd, struct filename *filename,
char *pathbuf = NULL;
struct linux_binprm *bprm;
struct files_struct *displaced;
- int retval;
+ int retval, processes;
if (IS_ERR(filename))
return PTR_ERR(filename);
+ processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
+
/*
* We move the actual failure in case of RLIMIT_NPROC excess from
* set*uid() to execve() because too many poorly written programs
* don't check setuid() return code. Here we additionally recheck
* whether NPROC limit is still exceeded.
*/
- if ((current->flags & PF_NPROC_EXCEEDED) &&
- atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
+ if ((current->flags & PF_NPROC_EXCEEDED) && processes > rlimit(RLIMIT_NPROC)) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 47c5f3aeb460..6170aee986db 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,7 @@
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
#include <linux/task_work.h>
+#include <linux/user_namespace.h>
#include "io-wq.h"
@@ -216,7 +217,7 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_RUNNING)
atomic_dec(&acct->nr_running);
if (!(worker->flags & IO_WORKER_F_BOUND))
- atomic_dec(&wqe->wq->user->processes);
+ dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
worker->flags = 0;
preempt_enable();
@@ -349,12 +350,12 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
worker->flags |= IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
- atomic_dec(&wqe->wq->user->processes);
+ dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
} else {
worker->flags &= ~IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
- atomic_inc(&wqe->wq->user->processes);
+ inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
}
io_wqe_inc_running(wqe, worker);
}
@@ -671,7 +672,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
spin_unlock_irq(&wqe->lock);
if (index == IO_WQ_ACCT_UNBOUND)
- atomic_inc(&wq->user->processes);
+ inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC);
wake_up_process(worker->task);
return true;
@@ -754,6 +755,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
struct io_wq_work *work)
{
bool free_worker;
+ int processes;
if (!(work->flags & IO_WQ_WORK_UNBOUND))
return true;
@@ -766,7 +768,9 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
if (free_worker)
return true;
- if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
+ processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+
+ if (processes >= acct->max_workers &&
!(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
return false;
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 917d88edb7b9..38e122bc3d07 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -12,7 +12,6 @@
*/
struct user_struct {
refcount_t __count; /* reference count */
- atomic_t processes; /* How many processes does this user have? */
atomic_t sigpending; /* How many pending signals does this user have? */
#ifdef CONFIG_FANOTIFY
atomic_t fanotify_listeners;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index fc75af812d73..6d9d180b2c9d 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -50,9 +50,13 @@ enum ucount_type {
UCOUNT_INOTIFY_INSTANCES,
UCOUNT_INOTIFY_WATCHES,
#endif
+ UCOUNT_RLIMIT_NPROC,
UCOUNT_COUNTS,
};
+#define UCOUNT_MIN_RLIMIT UCOUNT_RLIMIT_NPROC
+#define UCOUNT_MAX_RLIMIT UCOUNT_RLIMIT_NPROC
+
struct user_namespace {
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
@@ -104,6 +108,10 @@ void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+long get_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+struct ucounts *inc_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+void dec_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+
#ifdef CONFIG_USER_NS
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
diff --git a/kernel/cred.c b/kernel/cred.c
index 421b1149c651..b6694700e760 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -351,7 +351,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
- atomic_inc(&p->cred->user->processes);
+ inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
return 0;
}
@@ -384,7 +384,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif
- atomic_inc(&new->user->processes);
+ inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC);
p->cred = p->real_cred = get_cred(new);
alter_cred_subscribers(new, 2);
validate_creds(new);
@@ -486,11 +486,11 @@ int commit_creds(struct cred *new)
*/
alter_cred_subscribers(new, 2);
if (new->user != old->user)
- atomic_inc(&new->user->processes);
+ inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
- atomic_dec(&old->user->processes);
+ dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
alter_cred_subscribers(old, -2);
/* send notifications */
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..5a0d7dd1ad64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
- atomic_dec(&__task_cred(p)->user->processes);
+ dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
rcu_read_unlock();
cgroup_release(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index efc5493203ae..2bc8bd45179f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1844,7 +1844,7 @@ static __latent_entropy struct task_struct *copy_process(
int node,
struct kernel_clone_args *args)
{
- int pidfd = -1, retval;
+ int pidfd = -1, retval, processes;
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
@@ -1958,9 +1958,10 @@ static __latent_entropy struct task_struct *copy_process(
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
+ UCOUNT_RLIMIT_NPROC);
retval = -EAGAIN;
- if (atomic_read(&p->real_cred->user->processes) >=
- task_rlimit(p, RLIMIT_NPROC)) {
+ if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
@@ -2361,7 +2362,7 @@ static __latent_entropy struct task_struct *copy_process(
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- atomic_dec(&p->cred->user->processes);
+ dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
exit_creds(p);
bad_fork_free:
p->state = TASK_DEAD;
diff --git a/kernel/sys.c b/kernel/sys.c
index 00a96746e28a..db780ec32d86 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -461,11 +461,14 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
static int set_user(struct cred *new)
{
struct user_struct *new_user;
+ int processes;
new_user = alloc_uid(new->uid);
if (!new_user)
return -EAGAIN;
+ processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
+
/*
* We don't fail in case of NPROC limit excess here because too many
* poorly written programs don't check set*uid() return code, assuming
@@ -473,8 +476,7 @@ static int set_user(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage.
*/
- if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
- new_user != INIT_USER)
+ if (processes >= rlimit(RLIMIT_NPROC) && new_user != INIT_USER)
current->flags |= PF_NPROC_EXCEEDED;
else
current->flags &= ~PF_NPROC_EXCEEDED;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 7b2bca8582ef..e00d644e4ca5 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -74,6 +74,7 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
#endif
+ { },
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -222,6 +223,39 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
put_ucounts(ucounts);
}
+long get_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type)
+{
+ long v;
+ struct ucounts *ucounts = get_ucounts(ns, uid);
+ if (!ucounts)
+ return LONG_MAX;
+ v = atomic_long_read(&ucounts->ucount[type]);
+ put_ucounts(ucounts);
+ return v;
+}
+
+struct ucounts *inc_rlimit_counter(struct user_namespace *ns, kuid_t uid,
+ enum ucount_type type)
+{
+ if (type < UCOUNT_MIN_RLIMIT || type > UCOUNT_MAX_RLIMIT)
+ return NULL;
+
+ return inc_ucount(ns, uid, type);
+}
+
+void dec_rlimit_counter(struct user_namespace *ns, kuid_t uid, enum ucount_type type)
+{
+ struct ucounts *ucounts;
+
+ if (type < UCOUNT_MIN_RLIMIT || type > UCOUNT_MAX_RLIMIT)
+ return;
+
+ ucounts = get_ucounts(ns, uid);
+
+ if (ucounts)
+ dec_ucount(ucounts, type);
+}
+
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
diff --git a/kernel/user.c b/kernel/user.c
index b1635d94a1f2..5bb75ebdef4f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
.__count = REFCOUNT_INIT(1),
- .processes = ATOMIC_INIT(1),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
@@ -224,6 +223,8 @@ static int __init uid_cache_init(void)
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
spin_unlock_irq(&uidhash_lock);
+ inc_rlimit_counter(&init_user_ns, GLOBAL_ROOT_UID, UCOUNT_RLIMIT_NPROC);
+
return 0;
}
subsys_initcall(uid_cache_init);
--
2.25.4
Add a new prctl to change the user namespace in which the process
counter is located. A pointer to that user namespace is stored in struct
cred so that it is inherited by all child processes.
Signed-off-by: Alexey Gladkov <[email protected]>
---
fs/exec.c | 2 +-
fs/io-wq.c | 13 ++++++++-----
fs/io-wq.h | 1 +
fs/io_uring.c | 1 +
include/linux/cred.h | 8 ++++++++
include/uapi/linux/prctl.h | 5 +++++
kernel/cred.c | 35 +++++++++++++++++++++++++++++------
kernel/exit.c | 2 +-
kernel/fork.c | 4 ++--
kernel/sys.c | 22 +++++++++++++++++++++-
kernel/user_namespace.c | 3 +++
11 files changed, 80 insertions(+), 16 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index c45dfc716394..574b1381276c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1837,7 +1837,7 @@ static int __do_execve_file(int fd, struct filename *filename,
goto out_ret;
}
- processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
+ processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC);
/*
* We move the actual failure in case of RLIMIT_NPROC excess from
diff --git a/fs/io-wq.c b/fs/io-wq.c
index c3b0843abc9b..19e43ec115cb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -116,6 +116,7 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
+ const struct cred *creds;
refcount_t refs;
struct completion done;
@@ -217,7 +218,7 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_RUNNING)
atomic_dec(&acct->nr_running);
if (!(worker->flags & IO_WORKER_F_BOUND))
- dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+ dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
worker->flags = 0;
preempt_enable();
@@ -350,9 +351,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
worker->flags |= IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
- dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+ dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
} else {
- if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
+ if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
return;
worker->flags &= ~IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
@@ -662,7 +663,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
}
if (index == IO_WQ_ACCT_UNBOUND &&
- !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
+ !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
kfree(worker);
return false;
}
@@ -772,7 +773,8 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
if (free_worker)
return true;
- processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+ processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid,
+ UCOUNT_RLIMIT_NPROC);
if (processes >= acct->max_workers &&
!(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
@@ -1049,6 +1051,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
/* caller must already hold a reference to this */
wq->user = data->user;
+ wq->creds = data->creds;
for_each_node(node) {
struct io_wqe *wqe;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 071f1a997800..6acc3a04c38f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -105,6 +105,7 @@ typedef void (io_wq_work_fn)(struct io_wq_work **);
struct io_wq_data {
struct user_struct *user;
+ const struct cred *creds;
io_wq_work_fn *do_work;
free_work_fn *free_work;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 493e5047e67c..e419923968b3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6933,6 +6933,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
int ret = 0;
data.user = ctx->user;
+ data.creds = ctx->creds;
data.free_work = io_free_work;
data.do_work = io_wq_submit_work;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 18639c069263..43aee68d117f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -144,6 +144,7 @@ struct cred {
#endif
struct user_struct *user; /* real user ID subscription */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
+ struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */
struct group_info *group_info; /* supplementary groups for euid/fsgid */
/* RCU deletion */
union {
@@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
+extern int set_rlimit_ns(struct user_namespace *ns);
/*
* check for validity of credentials
@@ -370,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
#define task_uid(task) (task_cred_xxx((task), uid))
#define task_euid(task) (task_cred_xxx((task), euid))
+#define task_rlimit_ns(task) (task_cred_xxx((task), rlimit_ns))
#define current_cred_xxx(xxx) \
({ \
@@ -390,11 +393,16 @@ static inline void put_cred(const struct cred *_cred)
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
#define current_user_ns() (current_cred_xxx(user_ns))
+#define current_rlimit_ns() (current_cred_xxx(rlimit_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
return &init_user_ns;
}
+static inline struct user_namespace *current_rlimit_ns(void)
+{
+ return &init_user_ns;
+}
#endif
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 07b4f8131e36..4f853f903415 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -238,4 +238,9 @@ struct prctl_mm_map {
#define PR_SET_IO_FLUSHER 57
#define PR_GET_IO_FLUSHER 58
+#define PR_SET_RLIMIT_USER_NAMESPACE 59
+#define PR_GET_RLIMIT_USER_NAMESPACE 60
+# define PR_RLIMIT_BIND_GLOBAL_USERNS (1UL << 0)
+# define PR_RLIMIT_BIND_CURRENT_USERNS (1UL << 1)
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index 748704db1f6b..7b90e1ef9c9a 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -59,6 +59,7 @@ struct cred init_cred = {
.cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns,
+ .rlimit_ns = &init_user_ns,
.group_info = &init_groups,
};
@@ -120,6 +121,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
put_group_info(cred->group_info);
free_uid(cred->user);
put_user_ns(cred->user_ns);
+ put_user_ns(cred->rlimit_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -270,6 +272,7 @@ struct cred *prepare_creds(void)
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
+ get_user_ns(new->rlimit_ns);
#ifdef CONFIG_KEYS
key_get(new->session_keyring);
@@ -345,7 +348,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
#endif
clone_flags & CLONE_THREAD
) {
- if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
+ if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC))
return -EACCES;
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
@@ -385,7 +388,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif
- if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
+ if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
return -EACCES;
p->cred = p->real_cred = get_cred(new);
alter_cred_subscribers(new, 2);
@@ -487,13 +490,13 @@ int commit_creds(struct cred *new)
* perhaps this limit is exceeded in the parent user namespace.
*/
alter_cred_subscribers(new, 2);
- if (new->user != old->user &&
- !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
+ if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) &&
+ !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
task->flags |= PF_NPROC_UNS_EXCEEDED;
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
- if (new->user != old->user)
- dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
+ if (new->user != old->user || new->rlimit_ns != old->rlimit_ns)
+ dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC);
alter_cred_subscribers(old, -2);
/* send notifications */
@@ -789,6 +792,26 @@ int set_create_files_as(struct cred *new, struct inode *inode)
}
EXPORT_SYMBOL(set_create_files_as);
+/*
+ * Change the rlimit user namespace of the current task, replacing the existing
+ * one. If the given namespace is NULL, then initial user namespace will be
+ * used.
+ *
+ * Return: 0 on success; -errno on failure.
+ */
+int set_rlimit_ns(struct user_namespace *ns)
+{
+ struct cred *new;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ new->rlimit_ns = ns ? ns : &init_user_ns;
+
+ return commit_creds(new);
+}
+
#ifdef CONFIG_DEBUG_CREDENTIALS
bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/exit.c b/kernel/exit.c
index 5a0d7dd1ad64..998436d32373 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
- dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
+ dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
rcu_read_unlock();
cgroup_release(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index d2b28634dc8f..43f3c54fe4c6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1963,7 +1963,7 @@ static __latent_entropy struct task_struct *copy_process(
current->flags &= ~PF_NPROC_UNS_EXCEEDED;
goto bad_fork_free;
}
- processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
+ processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p),
UCOUNT_RLIMIT_NPROC);
if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER &&
@@ -2366,7 +2366,7 @@ static __latent_entropy struct task_struct *copy_process(
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
+ dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
exit_creds(p);
bad_fork_free:
p->state = TASK_DEAD;
diff --git a/kernel/sys.c b/kernel/sys.c
index db780ec32d86..917cbd7fc674 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -467,7 +467,7 @@ static int set_user(struct cred *new)
if (!new_user)
return -EAGAIN;
- processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
+ processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
/*
* We don't fail in case of NPROC limit excess here because too many
@@ -2529,6 +2529,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
break;
+ case PR_SET_RLIMIT_USER_NAMESPACE:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ switch (arg2) {
+ case PR_RLIMIT_BIND_GLOBAL_USERNS:
+ error = set_rlimit_ns(&init_user_ns);
+ break;
+ case PR_RLIMIT_BIND_CURRENT_USERNS:
+ error = set_rlimit_ns(current_user_ns());
+ break;
+ default:
+ error = -EINVAL;
+ }
+ break;
+ case PR_GET_RLIMIT_USER_NAMESPACE:
+ error = current_rlimit_ns() == &init_user_ns
+ ? PR_RLIMIT_BIND_GLOBAL_USERNS
+ : PR_RLIMIT_BIND_CURRENT_USERNS;
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 87804e0371fe..346df35ceba9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -56,6 +56,8 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
#endif
/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
cred->user_ns = user_ns;
+
+ cred->rlimit_ns = &init_user_ns;
}
/*
@@ -121,6 +123,7 @@ int create_user_ns(struct cred *new)
for (i = 0; i < UCOUNT_COUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
+ ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
--
2.25.4
In order to be able to use ucounts for rlimits, the size of the counters must
be increased. For example, user_struct.mq_bytes (RLIMIT_MSGQUEUE) is unsigned long.
Signed-off-by: Alexey Gladkov <[email protected]>
---
include/linux/user_namespace.h | 4 ++--
kernel/ucount.c | 14 +++++++-------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 6ef1c7109fc4..fc75af812d73 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -86,7 +86,7 @@ struct user_namespace {
struct ctl_table_header *sysctls;
#endif
struct ucounts *ucounts;
- int ucount_max[UCOUNT_COUNTS];
+ long ucount_max[UCOUNT_COUNTS];
} __randomize_layout;
struct ucounts {
@@ -94,7 +94,7 @@ struct ucounts {
struct user_namespace *ns;
kuid_t uid;
int count;
- atomic_t ucount[UCOUNT_COUNTS];
+ atomic_long_t ucount[UCOUNT_COUNTS];
};
extern struct user_namespace init_user_ns;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 11b1596e2542..7b2bca8582ef 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -175,14 +175,14 @@ static void put_ucounts(struct ucounts *ucounts)
kfree(ucounts);
}
-static inline bool atomic_inc_below(atomic_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
{
- int c, old;
- c = atomic_read(v);
+ long c, old;
+ c = atomic_long_read(v);
for (;;) {
if (unlikely(c >= u))
return false;
- old = atomic_cmpxchg(v, c, c+1);
+ old = atomic_long_cmpxchg(v, c, c+1);
if (likely(old == c))
return true;
c = old;
@@ -199,14 +199,14 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
int max;
tns = iter->ns;
max = READ_ONCE(tns->ucount_max[type]);
- if (!atomic_inc_below(&iter->ucount[type], max))
+ if (!atomic_long_inc_below(&iter->ucount[type], max))
goto fail;
}
return ucounts;
fail:
bad = iter;
for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
- atomic_dec(&iter->ucount[type]);
+ atomic_long_dec(&iter->ucount[type]);
put_ucounts(ucounts);
return NULL;
@@ -216,7 +216,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{
struct ucounts *iter;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- int dec = atomic_dec_if_positive(&iter->ucount[type]);
+ int dec = atomic_long_dec_if_positive(&iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
}
put_ucounts(ucounts);
--
2.25.4
On Mon, Nov 2, 2020 at 5:52 PM Alexey Gladkov <[email protected]> wrote:
> Add a new prctl to change the user namespace in which the process
> counter is located. A pointer to the user namespace is in cred struct
> to be inherited by all child processes.
[...]
> + case PR_SET_RLIMIT_USER_NAMESPACE:
> + if (!capable(CAP_SYS_RESOURCE))
> + return -EPERM;
> +
> + switch (arg2) {
> + case PR_RLIMIT_BIND_GLOBAL_USERNS:
> + error = set_rlimit_ns(&init_user_ns);
> + break;
> + case PR_RLIMIT_BIND_CURRENT_USERNS:
> + error = set_rlimit_ns(current_user_ns());
> + break;
> + default:
> + error = -EINVAL;
> + }
> + break;
I don't see how this can work. capable() requires that
current_user_ns()==&init_user_ns, so you can't use this API to bind
rlimits to any other user namespace.
Fundamentally, if it requires CAP_SYS_RESOURCE, this probably can't be
done as an API that a process uses to change its own rlimit scope. In
that case I would implement this as part of clone3() instead of
prctl(). (Then init_user_ns can set it if the caller has
CAP_SYS_RESOURCE. If you want to have support for doing the same thing
with nested namespaces, you'd also need a flag that the first-level
clone3() can set on the namespace to say "further rlimit splitting
should be allowed".)
Or alternatively, we could say that CAP_SYS_RESOURCE doesn't matter,
and instead you're allowed to move the rlimit scope if your current
hard rlimit is INFINITY. That might make more sense? Maybe?
On Mon, Nov 02, 2020 at 06:10:06PM +0100, Jann Horn wrote:
> On Mon, Nov 2, 2020 at 5:52 PM Alexey Gladkov <[email protected]> wrote:
> > Add a new prctl to change the user namespace in which the process
> > counter is located. A pointer to the user namespace is in cred struct
> > to be inherited by all child processes.
> [...]
> > + case PR_SET_RLIMIT_USER_NAMESPACE:
> > + if (!capable(CAP_SYS_RESOURCE))
> > + return -EPERM;
> > +
> > + switch (arg2) {
> > + case PR_RLIMIT_BIND_GLOBAL_USERNS:
> > + error = set_rlimit_ns(&init_user_ns);
> > + break;
> > + case PR_RLIMIT_BIND_CURRENT_USERNS:
> > + error = set_rlimit_ns(current_user_ns());
> > + break;
> > + default:
> > + error = -EINVAL;
> > + }
> > + break;
>
> I don't see how this can work. capable() requires that
> current_user_ns()==&init_user_ns, so you can't use this API to bind
> rlimits to any other user namespace.
>
> Fundamentally, if it requires CAP_SYS_RESOURCE, this probably can't be
> done as an API that a process uses to change its own rlimit scope. In
> that case I would implement this as part of clone3() instead of
> prctl(). (Then init_user_ns can set it if the caller has
> CAP_SYS_RESOURCE. If you want to have support for doing the same thing
> with nested namespaces, you'd also need a flag that the first-level
> clone3() can set on the namespace to say "further rlimit splitting
> should be allowed".)
>
> Or alternatively, we could say that CAP_SYS_RESOURCE doesn't matter,
> and instead you're allowed to move the rlimit scope if your current
> hard rlimit is INFINITY. That might make more sense? Maybe?
I think you are right. CAP_SYS_RESOURCE is not needed here since you still
cannot exceed the rlimit in the parent user namespace.
--
Rgrds, legion
On Mon, Nov 02, 2020 at 05:50:29PM +0100, Alexey Gladkov wrote:
> Preface
> -------
> These patches are for binding the rlimits to a user in the user namespace.
> This patch set can be applied on top of:
>
> git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v5.8-2-g43e210d68200
>
> Problem
> -------
> Some rlimits are set per user: RLIMIT_NPROC, RLIMIT_MEMLOCK, RLIMIT_SIGPENDING,
> RLIMIT_MSGQUEUE. When several containers are created from one user then
> the processes inside the containers influence each other.
>
> Eric W. Biederman mentioned this issue [1][2][3].
>
> Introduced changes
> ------------------
> To fix this problem, you can bind the counter of the specified rlimits to the
> user within the user namespace. By default, to preserve backward compatibility,
> only the initial user namespace is used. This patch adds one more prctl
> parameter to change the binding to the user namespace.
>
> This will not cause the user to take more resources than allowed in the parent
> user namespace because it only virtualizes the rlimit counter. Limits in all
> parent user namespaces are taken into account.
>
> For example, this allows us to run multiple containers by the same user and
> set the RLIMIT_NPROC to 1 inside.
Thanks for picking this up and working on it. This would definitely fix
many issues for folks running unprivileged containers using a single id
map which is the default behavior for LXC/LXD and so very valuable to
us.
Christian
>
> ToDo
> ----
> * RLIMIT_MEMLOCK, RLIMIT_SIGPENDING and RLIMIT_MSGQUEUE are not implemented.
> * No documentation.
> * No tests.
>
> [1] https://lore.kernel.org/containers/[email protected]/
> [2] https://lists.linuxfoundation.org/pipermail/containers/2020-August/042096.html
> [3] https://lists.linuxfoundation.org/pipermail/containers/2020-October/042524.html
>
> Changelog
> ---------
> v1:
> * After discussion with Eric W. Biederman, I increased the size of ucounts to
> atomic_long_t.
> * Added ucount_max to avoid the fork bomb.
>
> --
>
> Alexey Gladkov (4):
> Increase size of ucounts to atomic_long_t
> Move the user's process counter to ucounts
> Do not allow fork if RLIMIT_NPROC is exceeded in the user namespace
> tree
> Allow to change the user namespace in which user rlimits are counted
>
> fs/exec.c | 13 ++++++---
> fs/io-wq.c | 25 +++++++++++++-----
> fs/io-wq.h | 1 +
> fs/io_uring.c | 1 +
> include/linux/cred.h | 8 ++++++
> include/linux/sched.h | 3 +++
> include/linux/sched/user.h | 1 -
> include/linux/user_namespace.h | 12 +++++++--
> include/uapi/linux/prctl.h | 5 ++++
> kernel/cred.c | 44 ++++++++++++++++++++++++-------
> kernel/exit.c | 2 +-
> kernel/fork.c | 13 ++++++---
> kernel/sys.c | 26 ++++++++++++++++--
> kernel/ucount.c | 48 +++++++++++++++++++++++++++++-----
> kernel/user.c | 3 ++-
> kernel/user_namespace.c | 3 +++
> 16 files changed, 171 insertions(+), 37 deletions(-)
>
> --
> 2.25.4
>
On Mon, Nov 02, 2020 at 05:50:30PM +0100, Alexey Gladkov wrote:
> In order to be able to use ucounts for rlimits, the size must be increased.
> For example user_struct.mq_bytes (RLIMIT_MSGQUEUE) is unsigned long.
I don't have any issues with this change; I just wonder what the exact
reason is. It's not immediately obvious to me.
>
> Signed-off-by: Alexey Gladkov <[email protected]>
> ---
> include/linux/user_namespace.h | 4 ++--
> kernel/ucount.c | 14 +++++++-------
> 2 files changed, 9 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 6ef1c7109fc4..fc75af812d73 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -86,7 +86,7 @@ struct user_namespace {
> struct ctl_table_header *sysctls;
> #endif
> struct ucounts *ucounts;
> - int ucount_max[UCOUNT_COUNTS];
> + long ucount_max[UCOUNT_COUNTS];
> } __randomize_layout;
>
> struct ucounts {
> @@ -94,7 +94,7 @@ struct ucounts {
> struct user_namespace *ns;
> kuid_t uid;
> int count;
> - atomic_t ucount[UCOUNT_COUNTS];
> + atomic_long_t ucount[UCOUNT_COUNTS];
> };
>
> extern struct user_namespace init_user_ns;
> diff --git a/kernel/ucount.c b/kernel/ucount.c
> index 11b1596e2542..7b2bca8582ef 100644
> --- a/kernel/ucount.c
> +++ b/kernel/ucount.c
> @@ -175,14 +175,14 @@ static void put_ucounts(struct ucounts *ucounts)
> kfree(ucounts);
> }
>
> -static inline bool atomic_inc_below(atomic_t *v, int u)
> +static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
> {
> - int c, old;
> - c = atomic_read(v);
> + long c, old;
> + c = atomic_long_read(v);
> for (;;) {
> if (unlikely(c >= u))
> return false;
> - old = atomic_cmpxchg(v, c, c+1);
> + old = atomic_long_cmpxchg(v, c, c+1);
> if (likely(old == c))
> return true;
> c = old;
> @@ -199,14 +199,14 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
> int max;
> tns = iter->ns;
> max = READ_ONCE(tns->ucount_max[type]);
> - if (!atomic_inc_below(&iter->ucount[type], max))
> + if (!atomic_long_inc_below(&iter->ucount[type], max))
> goto fail;
> }
> return ucounts;
> fail:
> bad = iter;
> for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
> - atomic_dec(&iter->ucount[type]);
> + atomic_long_dec(&iter->ucount[type]);
>
> put_ucounts(ucounts);
> return NULL;
> @@ -216,7 +216,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
> {
> struct ucounts *iter;
> for (iter = ucounts; iter; iter = iter->ns->ucounts) {
> - int dec = atomic_dec_if_positive(&iter->ucount[type]);
> + int dec = atomic_long_dec_if_positive(&iter->ucount[type]);
> WARN_ON_ONCE(dec < 0);
> }
> put_ucounts(ucounts);
> --
> 2.25.4
>
On Mon, Nov 02, 2020 at 07:03:01PM +0100, Christian Brauner wrote:
> On Mon, Nov 02, 2020 at 05:50:30PM +0100, Alexey Gladkov wrote:
> > In order to be able to use ucounts for rlimits, the size must be increased.
> > For example user_struct.mq_bytes (RLIMIT_MSGQUEUE) is unsigned long.
>
> I don't have any issues with this change I just wonder what the exact
> reason is. It's not immediately obvious to me.
Right now user_struct.mq_bytes, which is currently used for checking
RLIMIT_MSGQUEUE, is unsigned long, but ucounts is a signed int. The rlimit is
also unsigned long. If I migrate RLIMIT_MSGQUEUE to ucounts as it is, I will
reduce the range of the counter and possibly break backward compatibility.
Technically, it can be violated anyway.
linux/ipc/mqueue.c:376:
mq_bytes += mq_treesize;
spin_lock(&mq_lock);
if (u->mq_bytes + mq_bytes < u->mq_bytes ||
u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
spin_unlock(&mq_lock);
/* mqueue_evict_inode() releases info->messages */
ret = -EMFILE;
goto out_inode;
}
u->mq_bytes += mq_bytes;
spin_unlock(&mq_lock);
> >
> > Signed-off-by: Alexey Gladkov <[email protected]>
> > ---
> > include/linux/user_namespace.h | 4 ++--
> > kernel/ucount.c | 14 +++++++-------
> > 2 files changed, 9 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> > index 6ef1c7109fc4..fc75af812d73 100644
> > --- a/include/linux/user_namespace.h
> > +++ b/include/linux/user_namespace.h
> > @@ -86,7 +86,7 @@ struct user_namespace {
> > struct ctl_table_header *sysctls;
> > #endif
> > struct ucounts *ucounts;
> > - int ucount_max[UCOUNT_COUNTS];
> > + long ucount_max[UCOUNT_COUNTS];
> > } __randomize_layout;
> >
> > struct ucounts {
> > @@ -94,7 +94,7 @@ struct ucounts {
> > struct user_namespace *ns;
> > kuid_t uid;
> > int count;
> > - atomic_t ucount[UCOUNT_COUNTS];
> > + atomic_long_t ucount[UCOUNT_COUNTS];
> > };
> >
> > extern struct user_namespace init_user_ns;
> > diff --git a/kernel/ucount.c b/kernel/ucount.c
> > index 11b1596e2542..7b2bca8582ef 100644
> > --- a/kernel/ucount.c
> > +++ b/kernel/ucount.c
> > @@ -175,14 +175,14 @@ static void put_ucounts(struct ucounts *ucounts)
> > kfree(ucounts);
> > }
> >
> > -static inline bool atomic_inc_below(atomic_t *v, int u)
> > +static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
> > {
> > - int c, old;
> > - c = atomic_read(v);
> > + long c, old;
> > + c = atomic_long_read(v);
> > for (;;) {
> > if (unlikely(c >= u))
> > return false;
> > - old = atomic_cmpxchg(v, c, c+1);
> > + old = atomic_long_cmpxchg(v, c, c+1);
> > if (likely(old == c))
> > return true;
> > c = old;
> > @@ -199,14 +199,14 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
> > int max;
> > tns = iter->ns;
> > max = READ_ONCE(tns->ucount_max[type]);
> > - if (!atomic_inc_below(&iter->ucount[type], max))
> > + if (!atomic_long_inc_below(&iter->ucount[type], max))
> > goto fail;
> > }
> > return ucounts;
> > fail:
> > bad = iter;
> > for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
> > - atomic_dec(&iter->ucount[type]);
> > + atomic_long_dec(&iter->ucount[type]);
> >
> > put_ucounts(ucounts);
> > return NULL;
> > @@ -216,7 +216,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
> > {
> > struct ucounts *iter;
> > for (iter = ucounts; iter; iter = iter->ns->ucounts) {
> > - int dec = atomic_dec_if_positive(&iter->ucount[type]);
> > + int dec = atomic_long_dec_if_positive(&iter->ucount[type]);
> > WARN_ON_ONCE(dec < 0);
> > }
> > put_ucounts(ucounts);
> > --
> > 2.25.4
> >
>
--
Rgrds, legion
On Mon, Nov 02, 2020 at 05:50:33PM +0100, Alexey Gladkov wrote:
> Add a new prctl to change the user namespace in which the process
> counter is located. A pointer to the user namespace is in cred struct
> to be inherited by all child processes.
>
> Signed-off-by: Alexey Gladkov <[email protected]>
> ---
> fs/exec.c | 2 +-
> fs/io-wq.c | 13 ++++++++-----
> fs/io-wq.h | 1 +
> fs/io_uring.c | 1 +
> include/linux/cred.h | 8 ++++++++
> include/uapi/linux/prctl.h | 5 +++++
> kernel/cred.c | 35 +++++++++++++++++++++++++++++------
> kernel/exit.c | 2 +-
> kernel/fork.c | 4 ++--
> kernel/sys.c | 22 +++++++++++++++++++++-
> kernel/user_namespace.c | 3 +++
> 11 files changed, 80 insertions(+), 16 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index c45dfc716394..574b1381276c 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1837,7 +1837,7 @@ static int __do_execve_file(int fd, struct filename *filename,
> goto out_ret;
> }
>
> - processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
> + processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC);
>
> /*
> * We move the actual failure in case of RLIMIT_NPROC excess from
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index c3b0843abc9b..19e43ec115cb 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -116,6 +116,7 @@ struct io_wq {
>
> struct task_struct *manager;
> struct user_struct *user;
It seems like user would be unused here, and you could use creds->user?
> + const struct cred *creds;
> refcount_t refs;
> struct completion done;
>
> @@ -217,7 +218,7 @@ static void io_worker_exit(struct io_worker *worker)
> if (worker->flags & IO_WORKER_F_RUNNING)
> atomic_dec(&acct->nr_running);
> if (!(worker->flags & IO_WORKER_F_BOUND))
> - dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> + dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> worker->flags = 0;
> preempt_enable();
>
> @@ -350,9 +351,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
> worker->flags |= IO_WORKER_F_BOUND;
> wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
> wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
> - dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> + dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> } else {
> - if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> + if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> return;
> worker->flags &= ~IO_WORKER_F_BOUND;
> wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
> @@ -662,7 +663,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
> }
>
> if (index == IO_WQ_ACCT_UNBOUND &&
> - !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> + !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> kfree(worker);
> return false;
> }
> @@ -772,7 +773,8 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
> if (free_worker)
> return true;
>
> - processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> + processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid,
> + UCOUNT_RLIMIT_NPROC);
>
> if (processes >= acct->max_workers &&
> !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
> @@ -1049,6 +1051,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
>
> /* caller must already hold a reference to this */
> wq->user = data->user;
> + wq->creds = data->creds;
>
> for_each_node(node) {
> struct io_wqe *wqe;
> diff --git a/fs/io-wq.h b/fs/io-wq.h
> index 071f1a997800..6acc3a04c38f 100644
> --- a/fs/io-wq.h
> +++ b/fs/io-wq.h
> @@ -105,6 +105,7 @@ typedef void (io_wq_work_fn)(struct io_wq_work **);
>
> struct io_wq_data {
> struct user_struct *user;
> + const struct cred *creds;
>
> io_wq_work_fn *do_work;
> free_work_fn *free_work;
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 493e5047e67c..e419923968b3 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -6933,6 +6933,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
> int ret = 0;
>
> data.user = ctx->user;
> + data.creds = ctx->creds;
> data.free_work = io_free_work;
> data.do_work = io_wq_submit_work;
>
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 18639c069263..43aee68d117f 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -144,6 +144,7 @@ struct cred {
> #endif
> struct user_struct *user; /* real user ID subscription */
> struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
> + struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */
> struct group_info *group_info; /* supplementary groups for euid/fsgid */
> /* RCU deletion */
> union {
> @@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
> extern int set_create_files_as(struct cred *, struct inode *);
> extern int cred_fscmp(const struct cred *, const struct cred *);
> extern void __init cred_init(void);
> +extern int set_rlimit_ns(struct user_namespace *ns);
>
> /*
> * check for validity of credentials
> @@ -370,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
>
> #define task_uid(task) (task_cred_xxx((task), uid))
> #define task_euid(task) (task_cred_xxx((task), euid))
> +#define task_rlimit_ns(task) (task_cred_xxx((task), rlimit_ns))
>
> #define current_cred_xxx(xxx) \
> ({ \
> @@ -390,11 +393,16 @@ static inline void put_cred(const struct cred *_cred)
> extern struct user_namespace init_user_ns;
> #ifdef CONFIG_USER_NS
> #define current_user_ns() (current_cred_xxx(user_ns))
> +#define current_rlimit_ns() (current_cred_xxx(rlimit_ns))
> #else
> static inline struct user_namespace *current_user_ns(void)
> {
> return &init_user_ns;
> }
> +static inline struct user_namespace *current_rlimit_ns(void)
> +{
> + return &init_user_ns;
> +}
> #endif
>
>
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 07b4f8131e36..4f853f903415 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -238,4 +238,9 @@ struct prctl_mm_map {
> #define PR_SET_IO_FLUSHER 57
> #define PR_GET_IO_FLUSHER 58
>
> +#define PR_SET_RLIMIT_USER_NAMESPACE 59
> +#define PR_GET_RLIMIT_USER_NAMESPACE 60
> +# define PR_RLIMIT_BIND_GLOBAL_USERNS (1UL << 0)
> +# define PR_RLIMIT_BIND_CURRENT_USERNS (1UL << 1)
> +
> #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 748704db1f6b..7b90e1ef9c9a 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -59,6 +59,7 @@ struct cred init_cred = {
> .cap_bset = CAP_FULL_SET,
> .user = INIT_USER,
> .user_ns = &init_user_ns,
> + .rlimit_ns = &init_user_ns,
> .group_info = &init_groups,
> };
>
> @@ -120,6 +121,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
> put_group_info(cred->group_info);
> free_uid(cred->user);
> put_user_ns(cred->user_ns);
> + put_user_ns(cred->rlimit_ns);
> kmem_cache_free(cred_jar, cred);
> }
>
> @@ -270,6 +272,7 @@ struct cred *prepare_creds(void)
> get_group_info(new->group_info);
> get_uid(new->user);
> get_user_ns(new->user_ns);
> + get_user_ns(new->rlimit_ns);
>
> #ifdef CONFIG_KEYS
> key_get(new->session_keyring);
> @@ -345,7 +348,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> #endif
> clone_flags & CLONE_THREAD
> ) {
> - if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
> + if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC))
> return -EACCES;
> p->real_cred = get_cred(p->cred);
> get_cred(p->cred);
> @@ -385,7 +388,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> }
> #endif
>
> - if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> + if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> return -EACCES;
> p->cred = p->real_cred = get_cred(new);
> alter_cred_subscribers(new, 2);
> @@ -487,13 +490,13 @@ int commit_creds(struct cred *new)
> * perhaps this limit is exceeded in the parent user namespace.
> */
> alter_cred_subscribers(new, 2);
> - if (new->user != old->user &&
> - !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> + if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) &&
> + !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> task->flags |= PF_NPROC_UNS_EXCEEDED;
> rcu_assign_pointer(task->real_cred, new);
> rcu_assign_pointer(task->cred, new);
> - if (new->user != old->user)
> - dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> + if (new->user != old->user || new->rlimit_ns != old->rlimit_ns)
> + dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> alter_cred_subscribers(old, -2);
>
> /* send notifications */
> @@ -789,6 +792,26 @@ int set_create_files_as(struct cred *new, struct inode *inode)
> }
> EXPORT_SYMBOL(set_create_files_as);
>
> +/*
> + * Change the rlimit user namespace of the current task, replacing the existing
> + * one. If the given namespace is NULL, then initial user namespace will be
> + * used.
> + *
> + * Return: 0 on success; -errno on failure.
> + */
> +int set_rlimit_ns(struct user_namespace *ns)
> +{
> + struct cred *new;
> +
> + new = prepare_creds();
> + if (!new)
> + return -ENOMEM;
> +
> + new->rlimit_ns = ns ? ns : &init_user_ns;
> +
> + return commit_creds(new);
> +}
> +
> #ifdef CONFIG_DEBUG_CREDENTIALS
>
> bool creds_are_invalid(const struct cred *cred)
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 5a0d7dd1ad64..998436d32373 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
> /* don't need to get the RCU readlock here - the process is dead and
> * can't be modifying its own credentials. But shut RCU-lockdep up */
> rcu_read_lock();
> - dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
> + dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
> rcu_read_unlock();
>
> cgroup_release(p);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d2b28634dc8f..43f3c54fe4c6 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1963,7 +1963,7 @@ static __latent_entropy struct task_struct *copy_process(
> current->flags &= ~PF_NPROC_UNS_EXCEEDED;
> goto bad_fork_free;
> }
> - processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
> + processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p),
> UCOUNT_RLIMIT_NPROC);
> if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
> if (p->real_cred->user != INIT_USER &&
> @@ -2366,7 +2366,7 @@ static __latent_entropy struct task_struct *copy_process(
> #endif
> delayacct_tsk_free(p);
> bad_fork_cleanup_count:
> - dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
> + dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
> exit_creds(p);
> bad_fork_free:
> p->state = TASK_DEAD;
> diff --git a/kernel/sys.c b/kernel/sys.c
> index db780ec32d86..917cbd7fc674 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -467,7 +467,7 @@ static int set_user(struct cred *new)
> if (!new_user)
> return -EAGAIN;
>
> - processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
> + processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
>
> /*
> * We don't fail in case of NPROC limit excess here because too many
> @@ -2529,6 +2529,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>
> error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
> break;
> + case PR_SET_RLIMIT_USER_NAMESPACE:
> + if (!capable(CAP_SYS_RESOURCE))
> + return -EPERM;
Can you have CAP_SYS_RESOURCE in the init user ns while you're in a non-init
user ns? Shouldn't that only matter if you want to set yourself to
PR_RLIMIT_BIND_GLOBAL_USERNS anyway, whereas for PR_RLIMIT_BIND_CURRENT_USERNS
you just want to make sure the caller has CAP_SYS_RESOURCE in the current
namespace?
> +
> + switch (arg2) {
> + case PR_RLIMIT_BIND_GLOBAL_USERNS:
> + error = set_rlimit_ns(&init_user_ns);
> + break;
> + case PR_RLIMIT_BIND_CURRENT_USERNS:
> + error = set_rlimit_ns(current_user_ns());
> + break;
To some degree, this isn't so much "per user namespace rlimits" as it
is hierarchical rlimits. I'm not going to nitpick about names, but it might be
worth exploring. Especially because the way the patch is written, it would be
easy to introduce a third user namespace for rlimits with different mappings.
The downside I see with a sysctl is that changing it midway through the user
namespace's lifetime could be confusing, and difficult to get right.
> + default:
> + error = -EINVAL;
> + }
> + break;
> + case PR_GET_RLIMIT_USER_NAMESPACE:
> + error = current_rlimit_ns() == &init_user_ns
> + ? PR_RLIMIT_BIND_GLOBAL_USERNS
> + : PR_RLIMIT_BIND_CURRENT_USERNS;
> + break;
It would be nice to have this be a sysctl, so everyone who does
setns() would get the behaviour, and if a process wants to
> default:
> error = -EINVAL;
> break;
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 87804e0371fe..346df35ceba9 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -56,6 +56,8 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
> #endif
> /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
> cred->user_ns = user_ns;
> +
> + cred->rlimit_ns = &init_user_ns;
> }
>
> /*
> @@ -121,6 +123,7 @@ int create_user_ns(struct cred *new)
> for (i = 0; i < UCOUNT_COUNTS; i++) {
> ns->ucount_max[i] = INT_MAX;
> }
> + ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
> ns->ucounts = ucounts;
>
> /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
> --
> 2.25.4
>
On Wed, Nov 04, 2020 at 10:03:05AM +0000, Sargun Dhillon wrote:
> On Mon, Nov 02, 2020 at 05:50:33PM +0100, Alexey Gladkov wrote:
> > Add a new prctl to change the user namespace in which the process
> > counter is located. A pointer to the user namespace is in cred struct
> > to be inherited by all child processes.
> >
> > Signed-off-by: Alexey Gladkov <[email protected]>
> > ---
> > fs/exec.c | 2 +-
> > fs/io-wq.c | 13 ++++++++-----
> > fs/io-wq.h | 1 +
> > fs/io_uring.c | 1 +
> > include/linux/cred.h | 8 ++++++++
> > include/uapi/linux/prctl.h | 5 +++++
> > kernel/cred.c | 35 +++++++++++++++++++++++++++++------
> > kernel/exit.c | 2 +-
> > kernel/fork.c | 4 ++--
> > kernel/sys.c | 22 +++++++++++++++++++++-
> > kernel/user_namespace.c | 3 +++
> > 11 files changed, 80 insertions(+), 16 deletions(-)
> >
> > diff --git a/fs/exec.c b/fs/exec.c
> > index c45dfc716394..574b1381276c 100644
> > --- a/fs/exec.c
> > +++ b/fs/exec.c
> > @@ -1837,7 +1837,7 @@ static int __do_execve_file(int fd, struct filename *filename,
> > goto out_ret;
> > }
> >
> > - processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
> > + processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC);
> >
> > /*
> > * We move the actual failure in case of RLIMIT_NPROC excess from
> > diff --git a/fs/io-wq.c b/fs/io-wq.c
> > index c3b0843abc9b..19e43ec115cb 100644
> > --- a/fs/io-wq.c
> > +++ b/fs/io-wq.c
> > @@ -116,6 +116,7 @@ struct io_wq {
> >
> > struct task_struct *manager;
> > struct user_struct *user;
> It seems like user would be unused here, and you could use creds->user?
Yep. I think so.
> > + const struct cred *creds;
> > refcount_t refs;
> > struct completion done;
> >
> > @@ -217,7 +218,7 @@ static void io_worker_exit(struct io_worker *worker)
> > if (worker->flags & IO_WORKER_F_RUNNING)
> > atomic_dec(&acct->nr_running);
> > if (!(worker->flags & IO_WORKER_F_BOUND))
> > - dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > + dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > worker->flags = 0;
> > preempt_enable();
> >
> > @@ -350,9 +351,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
> > worker->flags |= IO_WORKER_F_BOUND;
> > wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
> > wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
> > - dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > + dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > } else {
> > - if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> > + if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> > return;
> > worker->flags &= ~IO_WORKER_F_BOUND;
> > wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
> > @@ -662,7 +663,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
> > }
> >
> > if (index == IO_WQ_ACCT_UNBOUND &&
> > - !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> > + !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> > kfree(worker);
> > return false;
> > }
> > @@ -772,7 +773,8 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
> > if (free_worker)
> > return true;
> >
> > - processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > + processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid,
> > + UCOUNT_RLIMIT_NPROC);
> >
> > if (processes >= acct->max_workers &&
> > !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
> > @@ -1049,6 +1051,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
> >
> > /* caller must already hold a reference to this */
> > wq->user = data->user;
> > + wq->creds = data->creds;
> >
> > for_each_node(node) {
> > struct io_wqe *wqe;
> > diff --git a/fs/io-wq.h b/fs/io-wq.h
> > index 071f1a997800..6acc3a04c38f 100644
> > --- a/fs/io-wq.h
> > +++ b/fs/io-wq.h
> > @@ -105,6 +105,7 @@ typedef void (io_wq_work_fn)(struct io_wq_work **);
> >
> > struct io_wq_data {
> > struct user_struct *user;
> > + const struct cred *creds;
> >
> > io_wq_work_fn *do_work;
> > free_work_fn *free_work;
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 493e5047e67c..e419923968b3 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -6933,6 +6933,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
> > int ret = 0;
> >
> > data.user = ctx->user;
> > + data.creds = ctx->creds;
> > data.free_work = io_free_work;
> > data.do_work = io_wq_submit_work;
> >
> > diff --git a/include/linux/cred.h b/include/linux/cred.h
> > index 18639c069263..43aee68d117f 100644
> > --- a/include/linux/cred.h
> > +++ b/include/linux/cred.h
> > @@ -144,6 +144,7 @@ struct cred {
> > #endif
> > struct user_struct *user; /* real user ID subscription */
> > struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
> > + struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */
> > struct group_info *group_info; /* supplementary groups for euid/fsgid */
> > /* RCU deletion */
> > union {
> > @@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
> > extern int set_create_files_as(struct cred *, struct inode *);
> > extern int cred_fscmp(const struct cred *, const struct cred *);
> > extern void __init cred_init(void);
> > +extern int set_rlimit_ns(struct user_namespace *ns);
> >
> > /*
> > * check for validity of credentials
> > @@ -370,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
> >
> > #define task_uid(task) (task_cred_xxx((task), uid))
> > #define task_euid(task) (task_cred_xxx((task), euid))
> > +#define task_rlimit_ns(task) (task_cred_xxx((task), rlimit_ns))
> >
> > #define current_cred_xxx(xxx) \
> > ({ \
> > @@ -390,11 +393,16 @@ static inline void put_cred(const struct cred *_cred)
> > extern struct user_namespace init_user_ns;
> > #ifdef CONFIG_USER_NS
> > #define current_user_ns() (current_cred_xxx(user_ns))
> > +#define current_rlimit_ns() (current_cred_xxx(rlimit_ns))
> > #else
> > static inline struct user_namespace *current_user_ns(void)
> > {
> > return &init_user_ns;
> > }
> > +static inline struct user_namespace *current_rlimit_ns(void)
> > +{
> > + return &init_user_ns;
> > +}
> > #endif
> >
> >
> > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> > index 07b4f8131e36..4f853f903415 100644
> > --- a/include/uapi/linux/prctl.h
> > +++ b/include/uapi/linux/prctl.h
> > @@ -238,4 +238,9 @@ struct prctl_mm_map {
> > #define PR_SET_IO_FLUSHER 57
> > #define PR_GET_IO_FLUSHER 58
> >
> > +#define PR_SET_RLIMIT_USER_NAMESPACE 59
> > +#define PR_GET_RLIMIT_USER_NAMESPACE 60
> > +# define PR_RLIMIT_BIND_GLOBAL_USERNS (1UL << 0)
> > +# define PR_RLIMIT_BIND_CURRENT_USERNS (1UL << 1)
> > +
> > #endif /* _LINUX_PRCTL_H */
> > diff --git a/kernel/cred.c b/kernel/cred.c
> > index 748704db1f6b..7b90e1ef9c9a 100644
> > --- a/kernel/cred.c
> > +++ b/kernel/cred.c
> > @@ -59,6 +59,7 @@ struct cred init_cred = {
> > .cap_bset = CAP_FULL_SET,
> > .user = INIT_USER,
> > .user_ns = &init_user_ns,
> > + .rlimit_ns = &init_user_ns,
> > .group_info = &init_groups,
> > };
> >
> > @@ -120,6 +121,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
> > put_group_info(cred->group_info);
> > free_uid(cred->user);
> > put_user_ns(cred->user_ns);
> > + put_user_ns(cred->rlimit_ns);
> > kmem_cache_free(cred_jar, cred);
> > }
> >
> > @@ -270,6 +272,7 @@ struct cred *prepare_creds(void)
> > get_group_info(new->group_info);
> > get_uid(new->user);
> > get_user_ns(new->user_ns);
> > + get_user_ns(new->rlimit_ns);
> >
> > #ifdef CONFIG_KEYS
> > key_get(new->session_keyring);
> > @@ -345,7 +348,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> > #endif
> > clone_flags & CLONE_THREAD
> > ) {
> > - if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
> > + if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC))
> > return -EACCES;
> > p->real_cred = get_cred(p->cred);
> > get_cred(p->cred);
> > @@ -385,7 +388,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> > }
> > #endif
> >
> > - if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> > + if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> > return -EACCES;
> > p->cred = p->real_cred = get_cred(new);
> > alter_cred_subscribers(new, 2);
> > @@ -487,13 +490,13 @@ int commit_creds(struct cred *new)
> > * perhaps this limit is exceeded in the parent user namespace.
> > */
> > alter_cred_subscribers(new, 2);
> > - if (new->user != old->user &&
> > - !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> > + if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) &&
> > + !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> > task->flags |= PF_NPROC_UNS_EXCEEDED;
> > rcu_assign_pointer(task->real_cred, new);
> > rcu_assign_pointer(task->cred, new);
> > - if (new->user != old->user)
> > - dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> > + if (new->user != old->user || new->rlimit_ns != old->rlimit_ns)
> > + dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> > alter_cred_subscribers(old, -2);
> >
> > /* send notifications */
> > @@ -789,6 +792,26 @@ int set_create_files_as(struct cred *new, struct inode *inode)
> > }
> > EXPORT_SYMBOL(set_create_files_as);
> >
> > +/*
> > + * Change the rlimit user namespace of the current task, replacing the existing
> > + * one. If the given namespace is NULL, then initial user namespace will be
> > + * used.
> > + *
> > + * Return: 0 on success; -errno on failure.
> > + */
> > +int set_rlimit_ns(struct user_namespace *ns)
> > +{
> > + struct cred *new;
> > +
> > + new = prepare_creds();
> > + if (!new)
> > + return -ENOMEM;
> > +
> > + new->rlimit_ns = ns ? ns : &init_user_ns;
> > +
> > + return commit_creds(new);
> > +}
> > +
> > #ifdef CONFIG_DEBUG_CREDENTIALS
> >
> > bool creds_are_invalid(const struct cred *cred)
> > diff --git a/kernel/exit.c b/kernel/exit.c
> > index 5a0d7dd1ad64..998436d32373 100644
> > --- a/kernel/exit.c
> > +++ b/kernel/exit.c
> > @@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
> > /* don't need to get the RCU readlock here - the process is dead and
> > * can't be modifying its own credentials. But shut RCU-lockdep up */
> > rcu_read_lock();
> > - dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
> > + dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
> > rcu_read_unlock();
> >
> > cgroup_release(p);
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index d2b28634dc8f..43f3c54fe4c6 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1963,7 +1963,7 @@ static __latent_entropy struct task_struct *copy_process(
> > current->flags &= ~PF_NPROC_UNS_EXCEEDED;
> > goto bad_fork_free;
> > }
> > - processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
> > + processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p),
> > UCOUNT_RLIMIT_NPROC);
> > if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
> > if (p->real_cred->user != INIT_USER &&
> > @@ -2366,7 +2366,7 @@ static __latent_entropy struct task_struct *copy_process(
> > #endif
> > delayacct_tsk_free(p);
> > bad_fork_cleanup_count:
> > - dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
> > + dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
> > exit_creds(p);
> > bad_fork_free:
> > p->state = TASK_DEAD;
> > diff --git a/kernel/sys.c b/kernel/sys.c
> > index db780ec32d86..917cbd7fc674 100644
> > --- a/kernel/sys.c
> > +++ b/kernel/sys.c
> > @@ -467,7 +467,7 @@ static int set_user(struct cred *new)
> > if (!new_user)
> > return -EAGAIN;
> >
> > - processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
> > + processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
> >
> > /*
> > * We don't fail in case of NPROC limit excess here because too many
> > @@ -2529,6 +2529,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> >
> > error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
> > break;
> > + case PR_SET_RLIMIT_USER_NAMESPACE:
> > + if (!capable(CAP_SYS_RESOURCE))
> > + return -EPERM;
> Can you have CAP_SYS_RESOURCE in the init user ns while you're in a non-init
> user ns? Shouldn't that only matter if you want to set yourself to
> PR_RLIMIT_BIND_GLOBAL_USERNS anyways and you just want to make sure they have
> CAP_SYS_RESOURCE in the current namespace for PR_RLIMIT_BIND_CURRENT_USERNS?
I think I was wrong to add the CAP_SYS_RESOURCE check, because the user
still won't be able to exceed their limit. Since setrlimit does not
require CAP_SYS_RESOURCE, it is not needed here either.
> > +
> > + switch (arg2) {
> > + case PR_RLIMIT_BIND_GLOBAL_USERNS:
> > + error = set_rlimit_ns(&init_user_ns);
> > + break;
> > + case PR_RLIMIT_BIND_CURRENT_USERNS:
> > + error = set_rlimit_ns(current_user_ns());
> > + break;
> To some degree, this isn't so much "per user namespace rlimits", as much as it
> is hierarchical rlimits. I'm not going to nitpick about names, but it might be
> worth exploring. Especially because the way the patch is written, it would be
> easy to introduce a third user namespace for rlimits with different mappings.
I agree that the naming is a bit confusing, but I couldn't think of a
better one. This is probably because the discussion started with the topic
"per userns rlimits".
This change is more about the rlimit counter than about the rlimit itself.
> The downside I see with a sysctl is that changing it midway through the user
> namespaces lifetime could be confusing, and difficult to get right.
A sysctl requires procfs. I used prctl to avoid that dependency.
> > + default:
> > + error = -EINVAL;
> > + }
> > + break;
> > + case PR_GET_RLIMIT_USER_NAMESPACE:
> > + error = current_rlimit_ns() == &init_user_ns
> > + ? PR_RLIMIT_BIND_GLOBAL_USERNS
> > + : PR_RLIMIT_BIND_CURRENT_USERNS;
> > + break;
> It would be nice to have this be a sysctl, so everyone who does
> setns() would get the behaviour, and if a process wants to
What's so tricky about using prctl?
> > default:
> > error = -EINVAL;
> > break;
> > diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> > index 87804e0371fe..346df35ceba9 100644
> > --- a/kernel/user_namespace.c
> > +++ b/kernel/user_namespace.c
> > @@ -56,6 +56,8 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
> > #endif
> > /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
> > cred->user_ns = user_ns;
> > +
> > + cred->rlimit_ns = &init_user_ns;
> > }
> >
> > /*
> > @@ -121,6 +123,7 @@ int create_user_ns(struct cred *new)
> > for (i = 0; i < UCOUNT_COUNTS; i++) {
> > ns->ucount_max[i] = INT_MAX;
> > }
> > + ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
> > ns->ucounts = ucounts;
> >
> > /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
> > --
> > 2.25.4
> >
>
--
Rgrds, legion