LinuxLists.cc - [RFC PATCH v2 0/8] Count rlimits in each user namespace

2021-01-10 17:45:48

Subject: [RFC PATCH v2 0/8] Count rlimits in each user namespace

Preface
-------
These patches are for binding the rlimit counters to a user in user namespace.
This patch set can be applied on top of:

git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v5.11-rc2

Problem
-------
Some rlimits are set per user: RLIMIT_NPROC, RLIMIT_MEMLOCK, RLIMIT_SIGPENDING,
RLIMIT_MSGQUEUE. When several containers are created from one user then
the processes inside the containers influence each other.

Eric W. Biederman mentioned this issue [1][2][3].

For example, there are two containers (A and B) created by one user. The
container A sets RLIMIT_NPROC=1 and starts one process. Everything is fine, but
when container B tries to do the same it will fail because the number of
processes is counted globally for each user and user has one process already.

On the other hand, we cannot simply calculate the rlimits for each container
separately. This will lead to the fact that the user creating a new user
namespace can create a fork bomb.

Introduced changes
------------------
To address the problem, we bind rlimit counters to each user namespace. The
result is a tree of rlimit counters with the biggest value at the root (aka
init_user_ns). The rlimit counter increment/decrement occurs in the current and
all parent user namespaces.

ToDo
----
* No documentation.
* No tests.

[1] https://lore.kernel.org/containers/[email protected]/
[2] https://lists.linuxfoundation.org/pipermail/containers/2020-August/042096.html
[3] https://lists.linuxfoundation.org/pipermail/containers/2020-October/042524.html

Changelog
---------
v2:
* RLIMIT_MEMLOCK, RLIMIT_SIGPENDING and RLIMIT_MSGQUEUE are migrated to ucounts.
* Added ucounts for pair uid and user namespace into cred.
* Added the ability to increase ucount by more than 1.

v1:
* After discussion with Eric W. Biederman, I increased the size of ucounts to
atomic_long_t.
* Added ucount_max to avoid the fork bomb.

--

Alexey Gladkov (8):
Use atomic type for ucounts reference counting
Add a reference to ucounts for each user
Increase size of ucounts to atomic_long_t
Move RLIMIT_NPROC counter to ucounts
Move RLIMIT_MSGQUEUE counter to ucounts
Move RLIMIT_SIGPENDING counter to ucounts
Move RLIMIT_MEMLOCK counter to ucounts
Move RLIMIT_NPROC check to the place where we increment the counter

fs/exec.c | 2 +-
fs/hugetlbfs/inode.c | 17 +++---
fs/io-wq.c | 22 ++++----
fs/io-wq.h | 2 +-
fs/io_uring.c | 2 +-
fs/proc/array.c | 2 +-
include/linux/cred.h | 3 ++
include/linux/hugetlb.h | 3 +-
include/linux/mm.h | 4 +-
include/linux/sched/user.h | 6 ---
include/linux/shmem_fs.h | 2 +-
include/linux/signal_types.h | 4 +-
include/linux/user_namespace.h | 31 +++++++++--
ipc/mqueue.c | 29 +++++-----
ipc/shm.c | 31 ++++++-----
kernel/cred.c | 43 +++++++++++----
kernel/exit.c | 2 +-
kernel/fork.c | 12 +++--
kernel/signal.c | 53 ++++++++----------
kernel/sys.c | 13 -----
kernel/ucount.c | 99 +++++++++++++++++++++++++++++-----
kernel/user.c | 2 -
kernel/user_namespace.c | 7 ++-
mm/memfd.c | 4 +-
mm/mlock.c | 35 +++++-------
mm/mmap.c | 3 +-
mm/shmem.c | 8 +--
27 files changed, 268 insertions(+), 173 deletions(-)

--
2.29.2

2021-01-10 17:46:01

by Alexey Gladkov

[permalink] [raw]

Subject: [RFC PATCH v2 5/8] Move RLIMIT_MSGQUEUE counter to ucounts

Signed-off-by: Alexey Gladkov <[email protected]>
---
include/linux/sched/user.h | 4 ----
include/linux/user_namespace.h | 8 ++++++++
ipc/mqueue.c | 29 +++++++++++++++--------------
kernel/fork.c | 1 +
kernel/ucount.c | 1 +
kernel/user_namespace.c | 1 +
6 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index d33d867ad6c1..8a34446681aa 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -18,10 +18,6 @@ struct user_struct {
#endif
#ifdef CONFIG_EPOLL
atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
-#endif
-#ifdef CONFIG_POSIX_MQUEUE
- /* protected by mq_lock */
- unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
#endif
unsigned long locked_shm; /* How many pages of mlocked shm ? */
unsigned long unix_inflight; /* How many files in flight in unix sockets */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 43c83ed2fc61..2980afed6fb7 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -51,6 +51,7 @@ enum ucount_type {
UCOUNT_INOTIFY_WATCHES,
#endif
UCOUNT_RLIMIT_NPROC,
+ UCOUNT_RLIMIT_MSGQUEUE,
UCOUNT_COUNTS,
};

@@ -113,6 +114,13 @@ static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type t
return atomic_long_read(&ucounts->ucount[type]);
}

+static inline struct ucounts *get_ucount(struct ucounts *ucounts)
+{
+ if (ucounts)
+ atomic_inc(&ucounts->count);
+ return ucounts;
+}
+
bool inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
bool inc_rlimit_ucounts_and_test(struct ucounts *ucounts, enum ucount_type type, long v, long max);
void dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index beff0cfcd1e8..67088f1aa6b0 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -144,7 +144,7 @@ struct mqueue_inode_info {
struct pid *notify_owner;
u32 notify_self_exec_id;
struct user_namespace *notify_user_ns;
- struct user_struct *user; /* user who created, for accounting */
+ struct ucounts *ucounts; /* user who created, for accounting */
struct sock *notify_sock;
struct sk_buff *notify_cookie;

@@ -292,7 +292,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
struct ipc_namespace *ipc_ns, umode_t mode,
struct mq_attr *attr)
{
- struct user_struct *u = current_user();
struct inode *inode;
int ret = -ENOMEM;

@@ -309,6 +308,8 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
if (S_ISREG(mode)) {
struct mqueue_inode_info *info;
unsigned long mq_bytes, mq_treesize;
+ struct ucounts *ucounts;
+ bool overlimit;

inode->i_fop = &mqueue_file_operations;
inode->i_size = FILENT_SIZE;
@@ -321,7 +322,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
info->notify_owner = NULL;
info->notify_user_ns = NULL;
info->qsize = 0;
- info->user = NULL; /* set when all is ok */
+ info->ucounts = NULL; /* set when all is ok */
info->msg_tree = RB_ROOT;
info->msg_tree_rightmost = NULL;
info->node_cache = NULL;
@@ -371,19 +372,19 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
if (mq_bytes + mq_treesize < mq_bytes)
goto out_inode;
mq_bytes += mq_treesize;
+ ucounts = current_ucounts();
spin_lock(&mq_lock);
- if (u->mq_bytes + mq_bytes < u->mq_bytes ||
- u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
+ overlimit = inc_rlimit_ucounts_and_test(ucounts, UCOUNT_RLIMIT_MSGQUEUE,
+ mq_bytes, rlimit(RLIMIT_MSGQUEUE));
+ if (overlimit) {
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
spin_unlock(&mq_lock);
/* mqueue_evict_inode() releases info->messages */
ret = -EMFILE;
goto out_inode;
}
- u->mq_bytes += mq_bytes;
spin_unlock(&mq_lock);
-
- /* all is ok */
- info->user = get_uid(u);
+ info->ucounts = get_ucount(ucounts);
} else if (S_ISDIR(mode)) {
inc_nlink(inode);
/* Some things misbehave if size == 0 on a directory */
@@ -497,7 +498,7 @@ static void mqueue_free_inode(struct inode *inode)
static void mqueue_evict_inode(struct inode *inode)
{
struct mqueue_inode_info *info;
- struct user_struct *user;
+ struct ucounts *ucounts;
struct ipc_namespace *ipc_ns;
struct msg_msg *msg, *nmsg;
LIST_HEAD(tmp_msg);
@@ -520,8 +521,8 @@ static void mqueue_evict_inode(struct inode *inode)
free_msg(msg);
}

- user = info->user;
- if (user) {
+ ucounts = info->ucounts;
+ if (ucounts) {
unsigned long mq_bytes, mq_treesize;

/* Total amount of bytes accounted for the mqueue */
@@ -533,7 +534,7 @@ static void mqueue_evict_inode(struct inode *inode)
info->attr.mq_msgsize);

spin_lock(&mq_lock);
- user->mq_bytes -= mq_bytes;
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
/*
* get_ns_from_inode() ensures that the
* (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
@@ -543,7 +544,7 @@ static void mqueue_evict_inode(struct inode *inode)
if (ipc_ns)
ipc_ns->mq_queues_count--;
spin_unlock(&mq_lock);
- free_uid(user);
+ put_ucounts(ucounts);
}
if (ipc_ns)
put_ipc_ns(ipc_ns);
diff --git a/kernel/fork.c b/kernel/fork.c
index ef7936daeeda..f61a5a3dc02f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -824,6 +824,7 @@ void __init fork_init(void)
}

init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
+ init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE);

#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 4222470ca0d1..865fd66c48c4 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -75,6 +75,7 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
#endif
+ { },
{ },
{ }
};
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 974f10da072c..9ace2a45a25d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -122,6 +122,7 @@ int create_user_ns(struct cred *new)
ns->ucount_max[i] = INT_MAX;
}
ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
+ ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE);
ns->ucounts = ucounts;

/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
--
2.29.2

2021-01-10 17:46:14

by Alexey Gladkov

[permalink] [raw]

Subject: [RFC PATCH v2 8/8] Move RLIMIT_NPROC check to the place where we increment the counter

After calling set_user(), we always have to call commit_creds() to apply
new credentials upon the current task. There is no need to separate
limit check and counter incrementing.

Signed-off-by: Alexey Gladkov <[email protected]>
---
kernel/cred.c | 22 +++++++++++++++++-----
kernel/sys.c | 13 -------------
2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/kernel/cred.c b/kernel/cred.c
index 89a945571533..770447b4f4de 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -488,14 +488,26 @@ int commit_creds(struct cred *new)
if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(new);

- /* do it
- * RLIMIT_NPROC limits on user->processes have already been checked
- * in set_user().
- */
alter_cred_subscribers(new, 2);
if (new->user != old->user || new->user_ns != old->user_ns) {
+ bool overlimit;
+
set_cred_ucounts(new, new->user_ns, new->euid);
- inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+
+ overlimit = inc_rlimit_ucounts_and_test(new->ucounts, UCOUNT_RLIMIT_NPROC,
+ 1, rlimit(RLIMIT_NPROC));
+
+ /*
+ * We don't fail in case of NPROC limit excess here because too many
+ * poorly written programs don't check set*uid() return code, assuming
+ * it never fails if called by root. We may still enforce NPROC limit
+ * for programs doing set*uid()+execve() by harmlessly deferring the
+ * failure to the execve() stage.
+ */
+ if (overlimit && new->user != INIT_USER)
+ current->flags |= PF_NPROC_EXCEEDED;
+ else
+ current->flags &= ~PF_NPROC_EXCEEDED;
}
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
diff --git a/kernel/sys.c b/kernel/sys.c
index c2734ab9474e..180c4e06064f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -467,19 +467,6 @@ static int set_user(struct cred *new)
if (!new_user)
return -EAGAIN;

- /*
- * We don't fail in case of NPROC limit excess here because too many
- * poorly written programs don't check set*uid() return code, assuming
- * it never fails if called by root. We may still enforce NPROC limit
- * for programs doing set*uid()+execve() by harmlessly deferring the
- * failure to the execve() stage.
- */
- if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
- new_user != INIT_USER)
- current->flags |= PF_NPROC_EXCEEDED;
- else
- current->flags &= ~PF_NPROC_EXCEEDED;
-
free_uid(new->user);
new->user = new_user;
return 0;
--
2.29.2

2021-01-10 17:46:39

by Alexey Gladkov

[permalink] [raw]

Subject: [RFC PATCH v2 3/8] Increase size of ucounts to atomic_long_t

This commit is preparation for migrating rlimits counters to ucounts.

Signed-off-by: Alexey Gladkov <[email protected]>
---
include/linux/user_namespace.h | 4 ++--
kernel/ucount.c | 14 +++++++-------
2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 483568a56f7f..24b850c7b70e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -85,7 +85,7 @@ struct user_namespace {
struct ctl_table_header *sysctls;
#endif
struct ucounts *ucounts;
- int ucount_max[UCOUNT_COUNTS];
+ long ucount_max[UCOUNT_COUNTS];
} __randomize_layout;

struct ucounts {
@@ -93,7 +93,7 @@ struct ucounts {
struct user_namespace *ns;
kuid_t uid;
atomic_t count;
- atomic_t ucount[UCOUNT_COUNTS];
+ atomic_long_t ucount[UCOUNT_COUNTS];
};

extern struct user_namespace init_user_ns;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 80a39073bcef..4c5825ffd2e9 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -185,14 +185,14 @@ void set_cred_ucounts(const struct cred *cred, struct user_namespace *ns, kuid_t
((struct cred *) cred)->ucounts = get_ucounts(ns, uid);
}

-static inline bool atomic_inc_below(atomic_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
{
- int c, old;
- c = atomic_read(v);
+ long c, old;
+ c = atomic_long_read(v);
for (;;) {
if (unlikely(c >= u))
return false;
- old = atomic_cmpxchg(v, c, c+1);
+ old = atomic_long_cmpxchg(v, c, c+1);
if (likely(old == c))
return true;
c = old;
@@ -209,14 +209,14 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
int max;
tns = iter->ns;
max = READ_ONCE(tns->ucount_max[type]);
- if (!atomic_inc_below(&iter->ucount[type], max))
+ if (!atomic_long_inc_below(&iter->ucount[type], max))
goto fail;
}
return ucounts;
fail:
bad = iter;
for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
- atomic_dec(&iter->ucount[type]);
+ atomic_long_dec(&iter->ucount[type]);

put_ucounts(ucounts);
return NULL;
@@ -226,7 +226,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{
struct ucounts *iter;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- int dec = atomic_dec_if_positive(&iter->ucount[type]);
+ int dec = atomic_long_dec_if_positive(&iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
}
put_ucounts(ucounts);
--
2.29.2

2021-01-10 17:46:40

by Alexey Gladkov

[permalink] [raw]

Subject: [RFC PATCH v2 6/8] Move RLIMIT_SIGPENDING counter to ucounts

Signed-off-by: Alexey Gladkov <[email protected]>
---
fs/proc/array.c | 2 +-
include/linux/sched/user.h | 1 -
include/linux/signal_types.h | 4 ++-
include/linux/user_namespace.h | 1 +
kernel/fork.c | 1 +
kernel/signal.c | 53 ++++++++++++++--------------------
kernel/ucount.c | 1 +
kernel/user.c | 1 -
kernel/user_namespace.c | 1 +
9 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index bb87e4d89cd8..74b0ea4b7e38 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = atomic_read(&__task_cred(p)->user->sigpending);
+ qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 8a34446681aa..8ba9cec4fb99 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -12,7 +12,6 @@
*/
struct user_struct {
refcount_t __count; /* reference count */
- atomic_t sigpending; /* How many pending signals does this user have? */
#ifdef CONFIG_FANOTIFY
atomic_t fanotify_listeners;
#endif
diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
index 68e06c75c5b2..34cb28b8f16c 100644
--- a/include/linux/signal_types.h
+++ b/include/linux/signal_types.h
@@ -13,6 +13,8 @@ typedef struct kernel_siginfo {
__SIGINFO;
} kernel_siginfo_t;

+struct ucounts;
+
/*
* Real Time signals may be queued.
*/
@@ -21,7 +23,7 @@ struct sigqueue {
struct list_head list;
int flags;
kernel_siginfo_t info;
- struct user_struct *user;
+ struct ucounts *ucounts;
};

/* flags values. */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 2980afed6fb7..7719e2163b72 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -52,6 +52,7 @@ enum ucount_type {
#endif
UCOUNT_RLIMIT_NPROC,
UCOUNT_RLIMIT_MSGQUEUE,
+ UCOUNT_RLIMIT_SIGPENDING,
UCOUNT_COUNTS,
};

diff --git a/kernel/fork.c b/kernel/fork.c
index f61a5a3dc02f..a7be5790392e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -825,6 +825,7 @@ void __init fork_init(void)

init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE);
+ init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING);

#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
diff --git a/kernel/signal.c b/kernel/signal.c
index 5736c55aaa1a..877020481212 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -412,49 +412,40 @@ void task_join_group_stop(struct task_struct *task)
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
{
- struct sigqueue *q = NULL;
- struct user_struct *user;
- int sigpending;
+ struct sigqueue *q = kmem_cache_alloc(sigqueue_cachep, flags);

- /*
- * Protect access to @t credentials. This can go away when all
- * callers hold rcu read lock.
- *
- * NOTE! A pending signal will hold on to the user refcount,
- * and we get/put the refcount only when the sigpending count
- * changes from/to zero.
- */
- rcu_read_lock();
- user = __task_cred(t)->user;
- sigpending = atomic_inc_return(&user->sigpending);
- if (sigpending == 1)
- get_uid(user);
- rcu_read_unlock();
+ if (likely(q != NULL)) {
+ bool overlimit;

- if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
- q = kmem_cache_alloc(sigqueue_cachep, flags);
- } else {
- print_dropped_signal(sig);
- }
-
- if (unlikely(q == NULL)) {
- if (atomic_dec_and_test(&user->sigpending))
- free_uid(user);
- } else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
- q->user = user;
+
+ /*
+ * Protect access to @t credentials. This can go away when all
+ * callers hold rcu read lock.
+ */
+ rcu_read_lock();
+ q->ucounts = get_ucount(task_ucounts(t));
+ overlimit = inc_rlimit_ucounts_and_test(q->ucounts, UCOUNT_RLIMIT_SIGPENDING,
+ 1, task_rlimit(t, RLIMIT_SIGPENDING));
+
+ if (override_rlimit || likely(!overlimit)) {
+ rcu_read_unlock();
+ return q;
+ }
+ rcu_read_unlock();
}

- return q;
+ print_dropped_signal(sig);
+ return NULL;
}

static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- if (atomic_dec_and_test(&q->user->sigpending))
- free_uid(q->user);
+ dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
+ put_ucounts(q->ucounts);
kmem_cache_free(sigqueue_cachep, q);
}

diff --git a/kernel/ucount.c b/kernel/ucount.c
index 865fd66c48c4..c79a7155a9b7 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -75,6 +75,7 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
#endif
+ { },
{ },
{ },
{ }
diff --git a/kernel/user.c b/kernel/user.c
index 7f5ff498207a..6737327f83be 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
.__count = REFCOUNT_INIT(1),
- .sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
.ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9ace2a45a25d..eeff7f6d81c0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -123,6 +123,7 @@ int create_user_ns(struct cred *new)
}
ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE);
+ ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING);
ns->ucounts = ucounts;

/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
--
2.29.2

2021-01-10 17:46:55

by Alexey Gladkov

[permalink] [raw]

Subject: [RFC PATCH v2 1/8] Use atomic type for ucounts reference counting

Signed-off-by: Alexey Gladkov <[email protected]>
---
include/linux/user_namespace.h | 2 +-
kernel/ucount.c | 10 +++++-----
2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..84fefa9247c4 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -92,7 +92,7 @@ struct ucounts {
struct hlist_node node;
struct user_namespace *ns;
kuid_t uid;
- int count;
+ atomic_t count;
atomic_t ucount[UCOUNT_COUNTS];
};

diff --git a/kernel/ucount.c b/kernel/ucount.c
index 11b1596e2542..0f2c7c11df19 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -141,7 +141,8 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)

new->ns = ns;
new->uid = uid;
- new->count = 0;
+
+ atomic_set(&new->count, 0);

spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -152,10 +153,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
ucounts = new;
}
}
- if (ucounts->count == INT_MAX)
+ if (atomic_read(&ucounts->count) == INT_MAX)
ucounts = NULL;
else
- ucounts->count += 1;
+ atomic_inc(&ucounts->count);
spin_unlock_irq(&ucounts_lock);
return ucounts;
}
@@ -165,8 +166,7 @@ static void put_ucounts(struct ucounts *ucounts)
unsigned long flags;

spin_lock_irqsave(&ucounts_lock, flags);
- ucounts->count -= 1;
- if (!ucounts->count)
+ if (atomic_dec_and_test(&ucounts->count))
hlist_del_init(&ucounts->node);
else
ucounts = NULL;
--
2.29.2

2021-01-10 17:48:12

by Alexey Gladkov

[permalink] [raw]

Subject: [RFC PATCH v2 7/8] Move RLIMIT_MEMLOCK counter to ucounts

Signed-off-by: Alexey Gladkov <[email protected]>
---
fs/hugetlbfs/inode.c | 17 ++++++++---------
include/linux/hugetlb.h | 3 +--
include/linux/mm.h | 4 ++--
include/linux/shmem_fs.h | 2 +-
include/linux/user_namespace.h | 1 +
ipc/shm.c | 31 ++++++++++++++++--------------
kernel/fork.c | 1 +
kernel/ucount.c | 1 +
kernel/user_namespace.c | 1 +
mm/memfd.c | 4 +---
mm/mlock.c | 35 +++++++++++++---------------------
mm/mmap.c | 3 +--
mm/shmem.c | 8 ++++----
13 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b5c109703daa..82298412f020 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1451,34 +1451,35 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag, struct user_struct **user,
+ vm_flags_t acctflag,
int creat_flags, int page_size_log)
{
struct inode *inode;
struct vfsmount *mnt;
int hstate_idx;
struct file *file;
+ const struct cred *cred;

hstate_idx = get_hstate_idx(page_size_log);
if (hstate_idx < 0)
return ERR_PTR(-ENODEV);

- *user = NULL;
mnt = hugetlbfs_vfsmount[hstate_idx];
if (!mnt)
return ERR_PTR(-ENOENT);

if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
- *user = current_user();
- if (user_shm_lock(size, *user)) {
+ cred = current_cred();
+ if (user_shm_lock(size, cred)) {
task_lock(current);
pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
current->comm, current->pid);
task_unlock(current);
} else {
- *user = NULL;
return ERR_PTR(-EPERM);
}
+ } else {
+ cred = NULL;
}

file = ERR_PTR(-ENOSPC);
@@ -1503,10 +1504,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,

iput(inode);
out:
- if (*user) {
- user_shm_unlock(size, *user);
- *user = NULL;
- }
+ if (cred)
+ user_shm_unlock(size, cred);
return file;
}

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ebca2ef02212..fbd36c452648 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -434,8 +434,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
extern const struct file_operations hugetlbfs_file_operations;
extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
- struct user_struct **user, int creat_flags,
- int page_size_log);
+ int creat_flags, int page_size_log);

static inline bool is_file_hugepages(struct file *file)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8cd6ae..30a37aef1ab9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1628,8 +1628,8 @@ extern bool can_do_mlock(void);
#else
static inline bool can_do_mlock(void) { return false; }
#endif
-extern int user_shm_lock(size_t, struct user_struct *);
-extern void user_shm_unlock(size_t, struct user_struct *);
+extern int user_shm_lock(size_t, const struct cred *);
+extern void user_shm_unlock(size_t, const struct cred *);

/*
* Parameter block passed down to zap_pte_range in exceptional cases.
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d82b6f396588..10f50b1c4e0e 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -65,7 +65,7 @@ extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
extern int shmem_zero_setup(struct vm_area_struct *);
extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
-extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern int shmem_lock(struct file *file, int lock, const struct cred *cred);
#ifdef CONFIG_SHMEM
extern const struct address_space_operations shmem_aops;
static inline bool shmem_mapping(struct address_space *mapping)
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 7719e2163b72..320275e44524 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -53,6 +53,7 @@ enum ucount_type {
UCOUNT_RLIMIT_NPROC,
UCOUNT_RLIMIT_MSGQUEUE,
UCOUNT_RLIMIT_SIGPENDING,
+ UCOUNT_RLIMIT_MEMLOCK,
UCOUNT_COUNTS,
};

diff --git a/ipc/shm.c b/ipc/shm.c
index febd88daba8c..40c566cd6f7a 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -60,7 +60,7 @@ struct shmid_kernel /* private to the kernel */
time64_t shm_ctim;
struct pid *shm_cprid;
struct pid *shm_lprid;
- struct user_struct *mlock_user;
+ const struct cred *mlock_cred;

/* The task created the shm object. NULL if the task is dead. */
struct task_struct *shm_creator;
@@ -286,10 +286,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
shm_rmid(ns, shp);
shm_unlock(shp);
if (!is_file_hugepages(shm_file))
- shmem_lock(shm_file, 0, shp->mlock_user);
- else if (shp->mlock_user)
+ shmem_lock(shm_file, 0, shp->mlock_cred);
+ else if (shp->mlock_cred)
user_shm_unlock(i_size_read(file_inode(shm_file)),
- shp->mlock_user);
+ shp->mlock_cred);
fput(shm_file);
ipc_update_pid(&shp->shm_cprid, NULL);
ipc_update_pid(&shp->shm_lprid, NULL);
@@ -625,7 +625,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)

shp->shm_perm.key = key;
shp->shm_perm.mode = (shmflg & S_IRWXUGO);
- shp->mlock_user = NULL;
+ shp->mlock_cred = NULL;

shp->shm_perm.security = NULL;
error = security_shm_alloc(&shp->shm_perm);
@@ -650,8 +650,9 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
if (shmflg & SHM_NORESERVE)
acctflag = VM_NORESERVE;
file = hugetlb_file_setup(name, hugesize, acctflag,
- &shp->mlock_user, HUGETLB_SHMFS_INODE,
+ HUGETLB_SHMFS_INODE,
(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
+ shp->mlock_cred = current_cred();
} else {
/*
* Do not allow no accounting for OVERCOMMIT_NEVER, even
@@ -663,8 +664,10 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
file = shmem_kernel_file_setup(name, size, acctflag);
}
error = PTR_ERR(file);
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ shp->mlock_cred = NULL;
goto no_file;
+ }

shp->shm_cprid = get_pid(task_tgid(current));
shp->shm_lprid = NULL;
@@ -698,8 +701,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
no_id:
ipc_update_pid(&shp->shm_cprid, NULL);
ipc_update_pid(&shp->shm_lprid, NULL);
- if (is_file_hugepages(file) && shp->mlock_user)
- user_shm_unlock(size, shp->mlock_user);
+ if (is_file_hugepages(file) && shp->mlock_cred)
+ user_shm_unlock(size, shp->mlock_cred);
fput(file);
ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
return error;
@@ -1105,12 +1108,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
goto out_unlock0;

if (cmd == SHM_LOCK) {
- struct user_struct *user = current_user();
+ const struct cred *cred = current_cred();

- err = shmem_lock(shm_file, 1, user);
+ err = shmem_lock(shm_file, 1, cred);
if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
shp->shm_perm.mode |= SHM_LOCKED;
- shp->mlock_user = user;
+ shp->mlock_cred = cred;
}
goto out_unlock0;
}
@@ -1118,9 +1121,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
/* SHM_UNLOCK */
if (!(shp->shm_perm.mode & SHM_LOCKED))
goto out_unlock0;
- shmem_lock(shm_file, 0, shp->mlock_user);
+ shmem_lock(shm_file, 0, shp->mlock_cred);
shp->shm_perm.mode &= ~SHM_LOCKED;
- shp->mlock_user = NULL;
+ shp->mlock_cred = NULL;
get_file(shm_file);
ipc_unlock_object(&shp->shm_perm);
rcu_read_unlock();
diff --git a/kernel/fork.c b/kernel/fork.c
index a7be5790392e..8104870f67c0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -826,6 +826,7 @@ void __init fork_init(void)
init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE);
init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING);
+ init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK);

#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
diff --git a/kernel/ucount.c b/kernel/ucount.c
index c79a7155a9b7..efecf34f49fb 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -78,6 +78,7 @@ static struct ctl_table user_table[] = {
{ },
{ },
{ },
+ { },
{ }
};
#endif /* CONFIG_SYSCTL */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index eeff7f6d81c0..a634ce74988c 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -124,6 +124,7 @@ int create_user_ns(struct cred *new)
ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE);
ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING);
+ ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK);
ns->ucounts = ucounts;

/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
diff --git a/mm/memfd.c b/mm/memfd.c
index 2647c898990c..9f80f162791a 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -297,9 +297,7 @@ SYSCALL_DEFINE2(memfd_create,
}

if (flags & MFD_HUGETLB) {
- struct user_struct *user = NULL;
-
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
diff --git a/mm/mlock.c b/mm/mlock.c
index 55b3b3672977..2d49d1afd7e0 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -812,15 +812,10 @@ SYSCALL_DEFINE0(munlockall)
return ret;
}

-/*
- * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
- * shm segments) get accounted against the user_struct instead.
- */
-static DEFINE_SPINLOCK(shmlock_user_lock);
-
-int user_shm_lock(size_t size, struct user_struct *user)
+int user_shm_lock(size_t size, const struct cred *cred)
{
unsigned long lock_limit, locked;
+ bool overlimit;
int allowed = 0;

locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -828,22 +823,18 @@ int user_shm_lock(size_t size, struct user_struct *user)
if (lock_limit == RLIM_INFINITY)
allowed = 1;
lock_limit >>= PAGE_SHIFT;
- spin_lock(&shmlock_user_lock);
- if (!allowed &&
- locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
- goto out;
- get_uid(user);
- user->locked_shm += locked;
- allowed = 1;
-out:
- spin_unlock(&shmlock_user_lock);
- return allowed;
+
+ overlimit = inc_rlimit_ucounts_and_test(cred->ucounts, UCOUNT_RLIMIT_MEMLOCK,
+ locked, lock_limit);
+
+ if (!allowed && overlimit && !capable(CAP_IPC_LOCK)) {
+ dec_rlimit_ucounts(cred->ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+ return 0;
+ }
+ return 1;
}

-void user_shm_unlock(size_t size, struct user_struct *user)
+void user_shm_unlock(size_t size, const struct cred *cred)
{
- spin_lock(&shmlock_user_lock);
- user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- spin_unlock(&shmlock_user_lock);
- free_uid(user);
+ dec_rlimit_ucounts(cred->ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index dc7206032387..e7980e2c18e8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1607,7 +1607,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
struct hstate *hs;

hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1623,7 +1622,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE,
+ HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c6b6d8f6c39..de9bf6866f51 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2225,7 +2225,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
}
#endif

-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, const struct cred *cred)
{
struct inode *inode = file_inode(file);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2237,13 +2237,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
* no serialization needed when called from shm_destroy().
*/
if (lock && !(info->flags & VM_LOCKED)) {
- if (!user_shm_lock(inode->i_size, user))
+ if (!user_shm_lock(inode->i_size, cred))
goto out_nomem;
info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
- if (!lock && (info->flags & VM_LOCKED) && user) {
- user_shm_unlock(inode->i_size, user);
+ if (!lock && (info->flags & VM_LOCKED) && cred) {
+ user_shm_unlock(inode->i_size, cred);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
}
--
2.29.2

2021-01-10 18:48:36

by Linus Torvalds

[permalink] [raw]

Subject: Re: [RFC PATCH v2 0/8] Count rlimits in each user namespace

On Sun, Jan 10, 2021 at 9:34 AM Alexey Gladkov <[email protected]> wrote:
>
> To address the problem, we bind rlimit counters to each user namespace. The
> result is a tree of rlimit counters with the biggest value at the root (aka
> init_user_ns). The rlimit counter increment/decrement occurs in the current and
> all parent user namespaces.

I'm not seeing why this is necessary.

Maybe it's the right approach, but none of the patches (or this cover
letter email) really explain it to me.

I understand why you might want the _limits_ themselves would form a
tree like this - with the "master limit" limiting the limits in the
user namespaces under it.

But I don't understand why the _counts_ should do that. The 'struct
user_struct' should be shared across even user namespaces for the same
user.

IOW, the very example of the problem you quote seems to argue against this:

> For example, there are two containers (A and B) created by one user. The
> container A sets RLIMIT_NPROC=1 and starts one process. Everything is fine, but
> when container B tries to do the same it will fail because the number of
> processes is counted globally for each user and user has one process already.

Note how the problem was _not_ that the _count_ was global. That part
was fine and all good.

No, the problem was that the _limit_ in container A also ended up
affecting container B.

So to me, that says that it would make sense to continue to use the
resource counts in 'struct user_struct' (because if user A has a hard
limit of X, then creating a new namespace shouldn't expand that
limit), but then have the ability to make per-container changes to the
resource limits (as long as they are within the bounds of the parent
user namespace resource limit).

Maybe there is some reason for this ucounts approach, but if so, I
feel it was not explained at all.

Linus

2021-01-11 20:21:16

by Eric W. Biederman

[permalink] [raw]

Subject: Re: [RFC PATCH v2 0/8] Count rlimits in each user namespace

Linus Torvalds <[email protected]> writes:

> On Sun, Jan 10, 2021 at 9:34 AM Alexey Gladkov <[email protected]> wrote:
>>
>> To address the problem, we bind rlimit counters to each user namespace. The
>> result is a tree of rlimit counters with the biggest value at the root (aka
>> init_user_ns). The rlimit counter increment/decrement occurs in the current and
>> all parent user namespaces.
>
> I'm not seeing why this is necessary.
>
> Maybe it's the right approach, but none of the patches (or this cover
> letter email) really explain it to me.
>
> I understand why you might want the _limits_ themselves would form a
> tree like this - with the "master limit" limiting the limits in the
> user namespaces under it.
>
> But I don't understand why the _counts_ should do that. The 'struct
> user_struct' should be shared across even user namespaces for the same
> user.
>
> IOW, the very example of the problem you quote seems to argue against this:
>
>> For example, there are two containers (A and B) created by one user. The
>> container A sets RLIMIT_NPROC=1 and starts one process. Everything is fine, but
>> when container B tries to do the same it will fail because the number of
>> processes is counted globally for each user and user has one process already.
>
> Note how the problem was _not_ that the _count_ was global. That part
> was fine and all good.

The problem is fundamentally that the per process RLIMIT_NPROC was
compared against the user_struct->processes.

I have only heard the problem described but I believe it is either the
RLIMIT_NPROC test in fork or at the beginning of do_execveat_common that
is failing.

From fs/exec.c line 1866:
> /*
> * We move the actual failure in case of RLIMIT_NPROC excess from
> * set*uid() to execve() because too many poorly written programs
> * don't check setuid() return code. Here we additionally recheck
> * whether NPROC limit is still exceeded.
> */
> if ((current->flags & PF_NPROC_EXCEEDED) &&
> atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
> retval = -EAGAIN;
> goto out_ret;
> }

From fs/fork.c line 1966:
> retval = -EAGAIN;
> if (atomic_read(&p->real_cred->user->processes) >=
> task_rlimit(p, RLIMIT_NPROC)) {
> if (p->real_cred->user != INIT_USER &&
> !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
> goto bad_fork_free;
> }
> current->flags &= ~PF_NPROC_EXCEEDED;

In both the cases the RLIMIT_NPROC value comes from
task->signal->rlim[RLIMIT_NPROC] and the count of processes
comes from task->cred->user->processes.

> No, the problem was that the _limit_ in container A also ended up
> affecting container B.

The description I have is that both containers run the same service
that set it's RLIMIT_NPROC to 1 in both containers.

> So to me, that says that it would make sense to continue to use the
> resource counts in 'struct user_struct' (because if user A has a hard
> limit of X, then creating a new namespace shouldn't expand that
> limit), but then have the ability to make per-container changes to the
> resource limits (as long as they are within the bounds of the parent
> user namespace resource limit).

I agree that needs to work as well.

> Maybe there is some reason for this ucounts approach, but if so, I
> feel it was not explained at all.

Let me see if I can starte the example a litle more clearly.

Suppose there is a service never_fork that sets RLIMIT_NPROC runs as
never_fork_user and sets RLIMIT_NPROC to 1 in it's systemd service file.

Further suppose there is a user bob who has two containers he wants to
run: container1 and container2. Both containers start the never_fork
service.

Bob first starts container1 and inside it the never_fork service starts.
Bob starts container2 and the never_fork service fails to start.

Does that make it clear that it is the count of the processes that would
exceed 1 if both instances of the never_fork service starts that would
be the problem?

Eric

2021-01-13 16:35:28

by Eric W. Biederman

[permalink] [raw]

Subject: Re: [RFC PATCH v2 1/8] Use atomic type for ucounts reference counting

Alexey Gladkov <[email protected]> writes:

We might want to use refcount_t instead of atomic_t. Not a big deal
either way.

> Signed-off-by: Alexey Gladkov <[email protected]>
> ---
> include/linux/user_namespace.h | 2 +-
> kernel/ucount.c | 10 +++++-----
> 2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 64cf8ebdc4ec..84fefa9247c4 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -92,7 +92,7 @@ struct ucounts {
> struct hlist_node node;
> struct user_namespace *ns;
> kuid_t uid;
> - int count;
> + atomic_t count;
> atomic_t ucount[UCOUNT_COUNTS];
> };
>
> diff --git a/kernel/ucount.c b/kernel/ucount.c
> index 11b1596e2542..0f2c7c11df19 100644
> --- a/kernel/ucount.c
> +++ b/kernel/ucount.c
> @@ -141,7 +141,8 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
>
> new->ns = ns;
> new->uid = uid;
> - new->count = 0;
> +
> + atomic_set(&new->count, 0);
>
> spin_lock_irq(&ucounts_lock);
> ucounts = find_ucounts(ns, uid, hashent);
> @@ -152,10 +153,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
> ucounts = new;
> }
> }
> - if (ucounts->count == INT_MAX)
> + if (atomic_read(&ucounts->count) == INT_MAX)
> ucounts = NULL;
> else
> - ucounts->count += 1;
> + atomic_inc(&ucounts->count);
> spin_unlock_irq(&ucounts_lock);
> return ucounts;
> }
> @@ -165,8 +166,7 @@ static void put_ucounts(struct ucounts *ucounts)
> unsigned long flags;
>
> spin_lock_irqsave(&ucounts_lock, flags);
> - ucounts->count -= 1;
> - if (!ucounts->count)
> + if (atomic_dec_and_test(&ucounts->count))
> hlist_del_init(&ucounts->node);
> else
> ucounts = NULL;

This can become:
static void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;

if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
hlist_del_init(&ucounts->node);
spin_unlock_irqrestore(&ucounts_lock);
kfree(ucounts);
}
}

2021-01-13 18:05:09

by Kees Cook

[permalink] [raw]

Subject: Re: [RFC PATCH v2 1/8] Use atomic type for ucounts reference counting

On Wed, Jan 13, 2021 at 10:31:40AM -0600, Eric W. Biederman wrote:
> Alexey Gladkov <[email protected]> writes:
>
> We might want to use refcount_t instead of atomic_t. Not a big deal
> either way.

Yes, please use refcount_t, and don't use _read() since that introduces
races.

-Kees

>
> > Signed-off-by: Alexey Gladkov <[email protected]>
> > ---
> > include/linux/user_namespace.h | 2 +-
> > kernel/ucount.c | 10 +++++-----
> > 2 files changed, 6 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> > index 64cf8ebdc4ec..84fefa9247c4 100644
> > --- a/include/linux/user_namespace.h
> > +++ b/include/linux/user_namespace.h
> > @@ -92,7 +92,7 @@ struct ucounts {
> > struct hlist_node node;
> > struct user_namespace *ns;
> > kuid_t uid;
> > - int count;
> > + atomic_t count;
> > atomic_t ucount[UCOUNT_COUNTS];
> > };
> >
> > diff --git a/kernel/ucount.c b/kernel/ucount.c
> > index 11b1596e2542..0f2c7c11df19 100644
> > --- a/kernel/ucount.c
> > +++ b/kernel/ucount.c
> > @@ -141,7 +141,8 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
> >
> > new->ns = ns;
> > new->uid = uid;
> > - new->count = 0;
> > +
> > + atomic_set(&new->count, 0);
> >
> > spin_lock_irq(&ucounts_lock);
> > ucounts = find_ucounts(ns, uid, hashent);
> > @@ -152,10 +153,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
> > ucounts = new;
> > }
> > }
> > - if (ucounts->count == INT_MAX)
> > + if (atomic_read(&ucounts->count) == INT_MAX)
> > ucounts = NULL;
> > else
> > - ucounts->count += 1;
> > + atomic_inc(&ucounts->count);
> > spin_unlock_irq(&ucounts_lock);
> > return ucounts;
> > }
> > @@ -165,8 +166,7 @@ static void put_ucounts(struct ucounts *ucounts)
> > unsigned long flags;
> >
> > spin_lock_irqsave(&ucounts_lock, flags);
> > - ucounts->count -= 1;
> > - if (!ucounts->count)
> > + if (atomic_dec_and_test(&ucounts->count))
> > hlist_del_init(&ucounts->node);
> > else
> > ucounts = NULL;
>
>
> This can become:
> static void put_ucounts(struct ucounts *ucounts)
> {
> unsigned long flags;
>
> if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
> hlist_del_init(&ucounts->node);
> spin_unlock_irqrestore(&ucounts_lock);
> kfree(ucounts);
> }
> }
>

--
Kees Cook