It was reported that v5.14 behaves differently when enforcing
RLIMIT_NPROC limit, namely, it allows one more task than previously.
This is consequence of the commit 21d1c5e386bc ("Reimplement
RLIMIT_NPROC on top of ucounts") that missed the sharpness of equality
in the forking path.
In order to accommodate other existing checks of the RLIMIT_NPROC, the
fix comprises of extending the result domain of ucount vs limit
comparison. Forks or setting uid of a saturated user are denied.
(Other RLIMIT_ per-user limits have correct comparison sharpness.)
Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts")
Reported-by: TBD
Signed-off-by: Michal Koutný <[email protected]>
---
fs/exec.c | 2 +-
include/linux/user_namespace.h | 2 +-
kernel/fork.c | 2 +-
kernel/sys.c | 2 +-
kernel/ucount.c | 11 +++++++----
5 files changed, 11 insertions(+), 8 deletions(-)
This change breaks tools/testing/selftests/rlimits/rlimits-per-userns.c between
v5.14..v5.15-rc1~172^2.
The commit 2863643fb8b9 ("set_user: add capability check when
rlimit(RLIMIT_NPROC) exceeds") is an inadvertent "fix".
diff --git a/fs/exec.c b/fs/exec.c
index 79f2c9483302..fc598c2652b2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1881,7 +1881,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
- is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+ ucounts_limit_cmp(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) > 0) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 33a4240e6a6f..9ccc336196f7 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -129,7 +129,7 @@ long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type);
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type);
-bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
+long ucounts_limit_cmp(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
static inline void set_rlimit_ucount_max(struct user_namespace *ns,
enum ucount_type type, unsigned long max)
diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528f7b21..7cb21a70737d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2022,7 +2022,7 @@ static __latent_entropy struct task_struct *copy_process(
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
- if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+ if (ucounts_limit_cmp(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) >= 0) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
diff --git a/kernel/sys.c b/kernel/sys.c
index ecc4cf019242..8ea20912103a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -479,7 +479,7 @@ static int set_user(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage.
*/
- if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
+ if (ucounts_limit_cmp(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) >= 0 &&
new_user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
current->flags |= PF_NPROC_EXCEEDED;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 65b597431c86..53ccd96387dd 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -343,18 +343,21 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
return 0;
}
-bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit)
+long ucounts_limit_cmp(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit)
{
struct ucounts *iter;
long max = rlimit;
+ long excess = LONG_MIN;
if (rlimit > LONG_MAX)
max = LONG_MAX;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- if (get_ucounts_value(iter, type) > max)
- return true;
+ /* we already WARN_ON negative ucounts, the subtraction result fits */
+ excess = max_t(long, excess, get_ucounts_value(iter, type) - max);
+ if (excess > 0)
+ return excess;
max = READ_ONCE(iter->ns->ucount_max[type]);
}
- return false;
+ return excess;
}
static __init int user_namespace_sysctl_init(void)
--
2.34.1