2022-02-08 06:53:08

by Michal Koutný

[permalink] [raw]
Subject: [RFC PATCH 0/6] RLIMIT_NPROC in ucounts fixups

This series is a result of looking deeper into breakage of
tools/testing/selftests/rlimits/rlimits-per-userns.c after
https://lore.kernel.org/r/[email protected]/
is applied.

The description of the original problem that led to the RLIMIT_NPROC et al.
ucounts rewrite could be ambiguously interpreted as supporting either
the case of:
- never-fork service or
- fork (RLIMIT_NPROC-1) times service.

The scenario is weird anyway, given the existence of the pids controller.

The realization of that scenario relies not only on tracking number of
processes per user_ns but also newly allows the root to override limit through
set*uid. The commit message didn't mention that, so it's unclear if it
was the intention too.

I also noticed that the RLIMIT_NPROC enforcement in fork seems subject to a
TOCTOU race (check(nr_tasks),...,nr_tasks++), so the limit is rather advisory
(but that's not a new thing related to the ucounts rewrite).

This series is an RFC to discuss the relevance of the subtle changes that the
RLIMIT_NPROC-to-ucounts rewrite introduced.

Michal Koutný (6):
set_user: Perform RLIMIT_NPROC capability check against new user
credentials
set*uid: Check RLIMIT_NPROC against new credentials
cred: Count tasks by their real uid into RLIMIT_NPROC
ucounts: Allow root to override RLIMIT_NPROC
selftests: Challenge RLIMIT_NPROC in user namespaces
selftests: Test RLIMIT_NPROC in clone-created user namespaces

fs/exec.c | 2 +-
include/linux/cred.h | 2 +-
kernel/cred.c | 29 ++-
kernel/fork.c | 2 +-
kernel/sys.c | 20 +-
kernel/ucount.c | 3 +
kernel/user_namespace.c | 2 +-
.../selftests/rlimits/rlimits-per-userns.c | 233 +++++++++++++++---
8 files changed, 229 insertions(+), 64 deletions(-)

--
2.34.1



2022-02-08 14:14:10

by Michal Koutný

[permalink] [raw]
Subject: [RFC PATCH 5/6] selftests: Challenge RLIMIT_NPROC in user namespaces

The services are started in descendant user namespaces; each of them
should honor the RLIMIT_NPROC that's passed during user namespace
creation.

main [user_ns_0]
` service [user_ns_1]
` worker 1
` worker 2
...
` worker k
...
` service [user_ns_n]
` worker 1
` worker 2
...
` worker k

The test uses explicit synchronization to make sure the original parent's
limit does not interfere with descendants.

Signed-off-by: Michal Koutný <[email protected]>
---
.../selftests/rlimits/rlimits-per-userns.c | 154 ++++++++++++++----
1 file changed, 125 insertions(+), 29 deletions(-)

diff --git a/tools/testing/selftests/rlimits/rlimits-per-userns.c b/tools/testing/selftests/rlimits/rlimits-per-userns.c
index 26dc949e93ea..54c1b345e42b 100644
--- a/tools/testing/selftests/rlimits/rlimits-per-userns.c
+++ b/tools/testing/selftests/rlimits/rlimits-per-userns.c
@@ -9,7 +9,9 @@
#include <sys/resource.h>
#include <sys/prctl.h>
#include <sys/stat.h>
+#include <sys/socket.h>

+#include <assert.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
@@ -21,38 +23,74 @@
#include <errno.h>
#include <err.h>

-#define NR_CHILDS 2
+#define THE_LIMIT 4
+#define NR_CHILDREN 5
+
+static_assert(NR_CHILDREN >= THE_LIMIT-1, "Need slots for limit-1 children.");

static char *service_prog;
static uid_t user = 60000;
static uid_t group = 60000;
+static struct rlimit saved_limit;
+
+/* Two uses: main and service */
+static pid_t child[NR_CHILDREN];
+static pid_t pid;

static void setrlimit_nproc(rlim_t n)
{
- pid_t pid = getpid();
struct rlimit limit = {
.rlim_cur = n,
.rlim_max = n
};
-
- warnx("(pid=%d): Setting RLIMIT_NPROC=%ld", pid, n);
+ if (getrlimit(RLIMIT_NPROC, &saved_limit) < 0)
+ err(EXIT_FAILURE, "(pid=%d): getrlimit(RLIMIT_NPROC)", pid);

if (setrlimit(RLIMIT_NPROC, &limit) < 0)
err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC)", pid);
+
+ warnx("(pid=%d): Set RLIMIT_NPROC=%ld", pid, n);
+}
+
+static void restore_rlimit_nproc(void)
+{
+ if (setrlimit(RLIMIT_NPROC, &saved_limit) < 0)
+ err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC, saved)", pid);
+ warnx("(pid=%d) Restored RLIMIT_NPROC", pid);
}

-static pid_t fork_child(void)
+enum msg_sync {
+ UNSHARE,
+ RLIMIT_RESTORE,
+};
+
+static void sync_notify(int fd, enum msg_sync m)
{
- pid_t pid = fork();
+ char tmp = m;
+
+ if (write(fd, &tmp, 1) < 0)
+ warnx("(pid=%d): failed sync-write", pid);
+}

- if (pid < 0)
+static void sync_wait(int fd, enum msg_sync m)
+{
+ char tmp;
+
+ if (read(fd, &tmp, 1) < 0)
+ warnx("(pid=%d): failed sync-read", pid);
+}
+
+static pid_t fork_child(int control_fd)
+{
+ pid_t new_pid = fork();
+
+ if (new_pid < 0)
err(EXIT_FAILURE, "fork");

- if (pid > 0)
- return pid;
+ if (new_pid > 0)
+ return new_pid;

pid = getpid();
-
warnx("(pid=%d): New process starting ...", pid);

if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
@@ -73,6 +111,9 @@ static pid_t fork_child(void)
if (unshare(CLONE_NEWUSER) < 0)
err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");

+ sync_notify(control_fd, UNSHARE);
+ sync_wait(control_fd, RLIMIT_RESTORE);
+
char *const argv[] = { "service", NULL };
char *const envp[] = { "I_AM_SERVICE=1", NULL };

@@ -82,37 +123,92 @@ static pid_t fork_child(void)
err(EXIT_FAILURE, "(pid=%d): execve", pid);
}

+static void run_service(void)
+{
+ size_t i;
+ int ret = EXIT_SUCCESS;
+ struct rlimit limit;
+ char user_ns[PATH_MAX];
+
+ if (getrlimit(RLIMIT_NPROC, &limit) < 0)
+ err(EXIT_FAILURE, "(pid=%d) failed getrlimit", pid);
+ if (readlink("/proc/self/ns/user", user_ns, PATH_MAX) < 0)
+ err(EXIT_FAILURE, "(pid=%d) failed readlink", pid);
+
+ warnx("(pid=%d) Service instance attempts %i children, limit %lu:%lu, ns=%s",
+ pid, THE_LIMIT, limit.rlim_cur, limit.rlim_max, user_ns);
+
+ /* test rlimit inside the service, effectively THE_LIMIT-1 becaue of service itself */
+ for (i = 0; i < THE_LIMIT; i++) {
+ child[i] = fork();
+ if (child[i] == 0) {
+ /* service child */
+ pause();
+ exit(EXIT_SUCCESS);
+ }
+ if (child[i] < 0) {
+ warnx("(pid=%d) service fork %lu failed, errno = %i", pid, i+1, errno);
+ if (!(i == THE_LIMIT-1 && errno == EAGAIN))
+ ret = EXIT_FAILURE;
+ } else if (i == THE_LIMIT-1) {
+ warnx("(pid=%d) RLIMIT_NPROC not honored", pid);
+ ret = EXIT_FAILURE;
+ }
+ }
+
+ /* service cleanup */
+ for (i = 0; i < THE_LIMIT; i++)
+ if (child[i] > 0)
+ kill(child[i], SIGUSR1);
+
+ for (i = 0; i < THE_LIMIT; i++)
+ if (child[i] > 0)
+ waitpid(child[i], NULL, WNOHANG);
+
+ if (ret)
+ exit(ret);
+ pause();
+}
+
int main(int argc, char **argv)
{
size_t i;
- pid_t child[NR_CHILDS];
- int wstatus[NR_CHILDS];
- int childs = NR_CHILDS;
- pid_t pid;
+ int control_fd[NR_CHILDREN];
+ int wstatus[NR_CHILDREN];
+ int children = NR_CHILDREN;
+ int sockets[2];
+
+ pid = getpid();

if (getenv("I_AM_SERVICE")) {
- pause();
- exit(EXIT_SUCCESS);
+ run_service();
+ exit(EXIT_FAILURE);
}

service_prog = argv[0];
- pid = getpid();

warnx("(pid=%d) Starting testcase", pid);

- /*
- * This rlimit is not a problem for root because it can be exceeded.
- */
- setrlimit_nproc(1);
-
- for (i = 0; i < NR_CHILDS; i++) {
- child[i] = fork_child();
+ setrlimit_nproc(THE_LIMIT);
+ for (i = 0; i < NR_CHILDREN; i++) {
+ if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, sockets) < 0)
+ err(EXIT_FAILURE, "(pid=%d) socketpair failed", pid);
+ control_fd[i] = sockets[0];
+ child[i] = fork_child(sockets[1]);
wstatus[i] = 0;
+ }
+
+ for (i = 0; i < NR_CHILDREN; i++)
+ sync_wait(control_fd[i], UNSHARE);
+ restore_rlimit_nproc();
+
+ for (i = 0; i < NR_CHILDREN; i++) {
+ sync_notify(control_fd[i], RLIMIT_RESTORE);
usleep(250000);
}

while (1) {
- for (i = 0; i < NR_CHILDS; i++) {
+ for (i = 0; i < NR_CHILDREN; i++) {
if (child[i] <= 0)
continue;

@@ -126,22 +222,22 @@ int main(int argc, char **argv)
warn("(pid=%d): waitpid(%d)", pid, child[i]);

child[i] *= -1;
- childs -= 1;
+ children -= 1;
}

- if (!childs)
+ if (!children)
break;

usleep(250000);

- for (i = 0; i < NR_CHILDS; i++) {
+ for (i = 0; i < NR_CHILDREN; i++) {
if (child[i] <= 0)
continue;
kill(child[i], SIGUSR1);
}
}

- for (i = 0; i < NR_CHILDS; i++) {
+ for (i = 0; i < NR_CHILDREN; i++) {
if (WIFEXITED(wstatus[i]))
warnx("(pid=%d): pid %d exited, status=%d",
pid, -child[i], WEXITSTATUS(wstatus[i]));
--
2.34.1


2022-02-09 06:36:10

by Michal Koutný

[permalink] [raw]
Subject: [RFC PATCH 6/6] selftests: Test RLIMIT_NPROC in clone-created user namespaces

Verify RLIMIT_NPROC observance in user namespaces also in the
clone(CLONE_NEWUSER) path.
Note that such a user_ns is created by the privileged user.

Signed-off-by: Michal Koutný <[email protected]>
---
.../selftests/rlimits/rlimits-per-userns.c | 141 +++++++++++++-----
1 file changed, 101 insertions(+), 40 deletions(-)

diff --git a/tools/testing/selftests/rlimits/rlimits-per-userns.c b/tools/testing/selftests/rlimits/rlimits-per-userns.c
index 54c1b345e42b..46f4cff36b30 100644
--- a/tools/testing/selftests/rlimits/rlimits-per-userns.c
+++ b/tools/testing/selftests/rlimits/rlimits-per-userns.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Author: Alexey Gladkov <[email protected]>
+ * Author: Michal Koutný <[email protected]>
*/
#define _GNU_SOURCE
#include <sys/types.h>
@@ -25,16 +26,25 @@

#define THE_LIMIT 4
#define NR_CHILDREN 5
+#define STACK_SIZE (2 * (1<<20))

-static_assert(NR_CHILDREN >= THE_LIMIT-1, "Need slots for limit-1 children.");
+static_assert(NR_CHILDREN >= THE_LIMIT-1, "Need slots for THE_LIMIT-1 children.");

-static char *service_prog;
static uid_t user = 60000;
static uid_t group = 60000;
static struct rlimit saved_limit;

-/* Two uses: main and service */
-static pid_t child[NR_CHILDREN];
+enum userns_mode {
+ UM_UNSHARE, /* setrlimit,clone(0),setuid,unshare,execve */
+ UM_CLONE_NEWUSER, /* setrlimit,clone(NEWUSER),setuid,execve */
+};
+static struct {
+ int control_fd;
+ char *pathname;
+ enum userns_mode mode;
+} child_args;
+
+/* Cache current pid */
static pid_t pid;

static void setrlimit_nproc(rlim_t n)
@@ -60,6 +70,7 @@ static void restore_rlimit_nproc(void)
}

enum msg_sync {
+ MAP_DEFINE,
UNSHARE,
RLIMIT_RESTORE,
};
@@ -80,15 +91,32 @@ static void sync_wait(int fd, enum msg_sync m)
warnx("(pid=%d): failed sync-read", pid);
}

-static pid_t fork_child(int control_fd)
+static int define_maps(pid_t child_pid)
{
- pid_t new_pid = fork();
+ FILE *f;
+ char filename[PATH_MAX];

- if (new_pid < 0)
- err(EXIT_FAILURE, "fork");
+ if (child_args.mode != UM_CLONE_NEWUSER)
+ return 0;
+
+ snprintf(filename, PATH_MAX, "/proc/%i/uid_map", child_pid);
+ f = fopen(filename, "w");
+ if (fprintf(f, "%i %i 1\n", user, user) < 0)
+ return -1;
+ fclose(f);
+
+ snprintf(filename, PATH_MAX, "/proc/%i/gid_map", child_pid);
+ f = fopen(filename, "w");
+ if (fprintf(f, "%i %i 1\n", group, group) < 0)
+ return -1;
+ fclose(f);
+
+ return 0;
+}

- if (new_pid > 0)
- return new_pid;
+static int setup_and_exec(void *arg)
+{
+ int control_fd = child_args.control_fd;

pid = getpid();
warnx("(pid=%d): New process starting ...", pid);
@@ -98,6 +126,7 @@ static pid_t fork_child(int control_fd)

signal(SIGUSR1, SIG_DFL);

+ sync_wait(control_fd, RLIMIT_RESTORE);
warnx("(pid=%d): Changing to uid=%d, gid=%d", pid, user, group);

if (setgid(group) < 0)
@@ -107,9 +136,11 @@ static pid_t fork_child(int control_fd)

warnx("(pid=%d): Service running ...", pid);

- warnx("(pid=%d): Unshare user namespace", pid);
- if (unshare(CLONE_NEWUSER) < 0)
- err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");
+ if (child_args.mode == UM_UNSHARE) {
+ warnx("(pid=%d): Unshare user namespace", pid);
+ if (unshare(CLONE_NEWUSER) < 0)
+ err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");
+ }

sync_notify(control_fd, UNSHARE);
sync_wait(control_fd, RLIMIT_RESTORE);
@@ -119,14 +150,30 @@ static pid_t fork_child(int control_fd)

warnx("(pid=%d): Executing real service ...", pid);

- execve(service_prog, argv, envp);
+ execve(child_args.pathname, argv, envp);
err(EXIT_FAILURE, "(pid=%d): execve", pid);
}

-static void run_service(void)
+static pid_t start_child(char *pathname, int control_fd)
+{
+ char *stack = malloc(STACK_SIZE);
+ int flags = child_args.mode == UM_CLONE_NEWUSER ? CLONE_NEWUSER : 0;
+ pid_t new_pid;
+
+ child_args.control_fd = control_fd;
+ child_args.pathname = pathname;
+
+ new_pid = clone(setup_and_exec, stack+STACK_SIZE-1, flags, NULL);
+ if (new_pid < 0)
+ err(EXIT_FAILURE, "clone");
+
+ free(stack);
+ close(control_fd);
+ return new_pid;
+}
+
+static void dump_context(size_t n_workers)
{
- size_t i;
- int ret = EXIT_SUCCESS;
struct rlimit limit;
char user_ns[PATH_MAX];

@@ -135,44 +182,55 @@ static void run_service(void)
if (readlink("/proc/self/ns/user", user_ns, PATH_MAX) < 0)
err(EXIT_FAILURE, "(pid=%d) failed readlink", pid);

- warnx("(pid=%d) Service instance attempts %i children, limit %lu:%lu, ns=%s",
- pid, THE_LIMIT, limit.rlim_cur, limit.rlim_max, user_ns);
+ warnx("(pid=%d) Service instance attempts %lu workers, limit %lu:%lu, ns=%s",
+ pid, n_workers, limit.rlim_cur, limit.rlim_max, user_ns);
+}
+
+static int run_service(void)
+{
+ size_t i, n_workers = THE_LIMIT;
+ pid_t worker[NR_CHILDREN];
+ int ret = EXIT_SUCCESS;

- /* test rlimit inside the service, effectively THE_LIMIT-1 becaue of service itself */
- for (i = 0; i < THE_LIMIT; i++) {
- child[i] = fork();
- if (child[i] == 0) {
- /* service child */
+ dump_context(n_workers);
+
+ /* test rlimit inside the service, last worker should fail because of service itself */
+ for (i = 0; i < n_workers; i++) {
+ worker[i] = fork();
+ if (worker[i] == 0) {
+ /* service worker */
pause();
exit(EXIT_SUCCESS);
}
- if (child[i] < 0) {
+ if (worker[i] < 0) {
warnx("(pid=%d) service fork %lu failed, errno = %i", pid, i+1, errno);
- if (!(i == THE_LIMIT-1 && errno == EAGAIN))
+ if (!(i == n_workers-1 && errno == EAGAIN))
ret = EXIT_FAILURE;
- } else if (i == THE_LIMIT-1) {
+ } else if (i == n_workers-1) {
warnx("(pid=%d) RLIMIT_NPROC not honored", pid);
ret = EXIT_FAILURE;
}
}

/* service cleanup */
- for (i = 0; i < THE_LIMIT; i++)
- if (child[i] > 0)
- kill(child[i], SIGUSR1);
+ for (i = 0; i < n_workers; i++)
+ if (worker[i] > 0)
+ kill(worker[i], SIGUSR1);

- for (i = 0; i < THE_LIMIT; i++)
- if (child[i] > 0)
- waitpid(child[i], NULL, WNOHANG);
+ for (i = 0; i < n_workers; i++)
+ if (worker[i] > 0)
+ waitpid(worker[i], NULL, WNOHANG);

if (ret)
- exit(ret);
+ return ret;
pause();
+ return EXIT_FAILURE;
}

int main(int argc, char **argv)
{
size_t i;
+ pid_t child[NR_CHILDREN];
int control_fd[NR_CHILDREN];
int wstatus[NR_CHILDREN];
int children = NR_CHILDREN;
@@ -180,12 +238,11 @@ int main(int argc, char **argv)

pid = getpid();

- if (getenv("I_AM_SERVICE")) {
- run_service();
- exit(EXIT_FAILURE);
- }
+ if (getenv("I_AM_SERVICE"))
+ return run_service();

- service_prog = argv[0];
+ if (argc > 1 && *argv[1] == 'c')
+ child_args.mode = UM_CLONE_NEWUSER;

warnx("(pid=%d) Starting testcase", pid);

@@ -194,8 +251,12 @@ int main(int argc, char **argv)
if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, sockets) < 0)
err(EXIT_FAILURE, "(pid=%d) socketpair failed", pid);
control_fd[i] = sockets[0];
- child[i] = fork_child(sockets[1]);
+ child[i] = start_child(argv[0], sockets[1]);
wstatus[i] = 0;
+
+ if (define_maps(child[i]) < 0)
+ err(EXIT_FAILURE, "(pid=%d) user_ns maps definition failed", pid);
+ sync_notify(control_fd[i], MAP_DEFINE);
}

for (i = 0; i < NR_CHILDREN; i++)
--
2.34.1


2022-02-09 06:42:11

by Michal Koutný

[permalink] [raw]
Subject: [RFC PATCH 1/6] set_user: Perform RLIMIT_NPROC capability check against new user credentials

The check is currently against the current->cred but since those are
going to change and we want to check RLIMIT_NPROC condition after the
switch, supply the capability check with the new cred.
But since we're already checking whether new_user is INIT_USER, any
capability-based allowance from the new cred may be redundant when that
check fails, so the alternative solution would be a revert of commit
2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds")

Fixes: 2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds")

Cc: Solar Designer <[email protected]>
Cc: Christian Brauner <[email protected]>
Signed-off-by: Michal Koutný <[email protected]>
---
kernel/sys.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sys.c b/kernel/sys.c
index 8ea20912103a..48c90dcceff3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -481,7 +481,8 @@ static int set_user(struct cred *new)
*/
if (ucounts_limit_cmp(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) >= 0 &&
new_user != INIT_USER &&
- !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
+ !security_capable(new, &init_user_ns, CAP_SYS_RESOURCE, CAP_OPT_NONE) &&
+ !security_capable(new, &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NONE))
current->flags |= PF_NPROC_EXCEEDED;
else
current->flags &= ~PF_NPROC_EXCEEDED;
--
2.34.1


2022-02-09 07:04:54

by Michal Koutný

[permalink] [raw]
Subject: [RFC PATCH 4/6] ucounts: Allow root to override RLIMIT_NPROC

Call sites of ucounts_limit_cmp() would allow the global root or capable
user to bypass RLIMIT_NPROC on the bottom level of user_ns tree by not
looking at ucounts at all.

As the traversal up the user_ns tree continues, the ucounts to which the
task is charged may switch the owning user (to the creator of user_ns).
If the new chargee is root, we don't really care about RLIMIT_NPROC
observation, so lift the limit to the max.

The result is that an unprivileged user U can globally run more than
RLIMIT_NPROC (of user_ns) tasks but within each user_ns it is still
limited to RLIMIT_NPROC (as passed into task->signal->rlim) iff the
user namespaces are created by the privileged user.

Signed-off-by: Michal Koutný <[email protected]>
---
kernel/ucount.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/kernel/ucount.c b/kernel/ucount.c
index 53ccd96387dd..f52b7273a572 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -356,6 +356,9 @@ long ucounts_limit_cmp(struct ucounts *ucounts, enum ucount_type type, unsigned
if (excess > 0)
return excess;
max = READ_ONCE(iter->ns->ucount_max[type]);
+ /* Next ucounts owned by root? RLIMIT_NPROC is moot */
+ if (type == UCOUNT_RLIMIT_NPROC && uid_eq(iter->ns->owner, GLOBAL_ROOT_UID))
+ max = LONG_MAX;
}
return excess;
}
--
2.34.1


2022-02-09 09:57:53

by Michal Koutný

[permalink] [raw]
Subject: [RFC PATCH 2/6] set*uid: Check RLIMIT_NPROC against new credentials

The generic idea is that not even root or capable user can force an
unprivileged user's limit breach. (For historical and security reasons
this check is postponed from set*uid to execve.) During the switch the
resource consumption of the target user has to be checked. The commits
905ae01c4ae2 ("Add a reference to ucounts for each cred") and
21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") made the
check in set_user() look at the old user's consumption.

This version of the fix simply moves the check to the place where the
actual switch of the accounting structure happens -- set_cred_ucounts().

The other callers are kept without the check, but with the per-userns
accounting they may be newly subject to the check too.
set_cred_ucounts() becomes inconsistent, since task->flags are
passed by the caller but task_rlimit() is implicitly `current`'s. This
patch is meant to illustrate the issue; a nicer implementation is
possible.

Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts")
Signed-off-by: Michal Koutný <[email protected]>
---
fs/exec.c | 2 +-
include/linux/cred.h | 2 +-
kernel/cred.c | 24 +++++++++++++++++++++---
kernel/fork.c | 2 +-
kernel/sys.c | 21 +++------------------
kernel/user_namespace.c | 2 +-
6 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index fc598c2652b2..e759e42c61da 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1363,7 +1363,7 @@ int begin_new_exec(struct linux_binprm * bprm)
WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
flush_signal_handlers(me, 0);

- retval = set_cred_ucounts(bprm->cred);
+ retval = set_cred_ucounts(bprm->cred, NULL);
if (retval < 0)
goto out_unlock;

diff --git a/include/linux/cred.h b/include/linux/cred.h
index fcbc6885cc09..455525ab380d 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -170,7 +170,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
-extern int set_cred_ucounts(struct cred *);
+extern int set_cred_ucounts(struct cred *, unsigned int *);

/*
* check for validity of credentials
diff --git a/kernel/cred.c b/kernel/cred.c
index 473d17c431f3..791cab70b764 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -370,7 +370,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
- ret = set_cred_ucounts(new);
+ ret = set_cred_ucounts(new, NULL);
if (ret < 0)
goto error_put;
}
@@ -492,7 +492,7 @@ int commit_creds(struct cred *new)

/* do it
* RLIMIT_NPROC limits on user->processes have already been checked
- * in set_user().
+ * in set_cred_ucounts().
*/
alter_cred_subscribers(new, 2);
if (new->user != old->user || new->user_ns != old->user_ns)
@@ -663,7 +663,7 @@ int cred_fscmp(const struct cred *a, const struct cred *b)
}
EXPORT_SYMBOL(cred_fscmp);

-int set_cred_ucounts(struct cred *new)
+int set_cred_ucounts(struct cred *new, unsigned int *nproc_flags)
{
struct task_struct *task = current;
const struct cred *old = task->real_cred;
@@ -685,6 +685,24 @@ int set_cred_ucounts(struct cred *new)
new->ucounts = new_ucounts;
put_ucounts(old_ucounts);

+ if (!nproc_flags)
+ return 0;
+
+ /*
+ * We don't fail in case of NPROC limit excess here because too many
+ * poorly written programs don't check set*uid() return code, assuming
+ * it never fails if called by root. We may still enforce NPROC limit
+ * for programs doing set*uid()+execve() by harmlessly deferring the
+ * failure to the execve() stage.
+ */
+ if (ucounts_limit_cmp(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) >= 0 &&
+ new->user != INIT_USER &&
+ !security_capable(new, &init_user_ns, CAP_SYS_RESOURCE, CAP_OPT_NONE) &&
+ !security_capable(new, &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NONE))
+ *nproc_flags |= PF_NPROC_EXCEEDED;
+ else
+ *nproc_flags &= ~PF_NPROC_EXCEEDED;
+
return 0;
}

diff --git a/kernel/fork.c b/kernel/fork.c
index 7cb21a70737d..a4005c679d29 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3051,7 +3051,7 @@ int ksys_unshare(unsigned long unshare_flags)
goto bad_unshare_cleanup_cred;

if (new_cred) {
- err = set_cred_ucounts(new_cred);
+ err = set_cred_ucounts(new_cred, NULL);
if (err)
goto bad_unshare_cleanup_cred;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 48c90dcceff3..4e4eea30e235 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -472,21 +472,6 @@ static int set_user(struct cred *new)
if (!new_user)
return -EAGAIN;

- /*
- * We don't fail in case of NPROC limit excess here because too many
- * poorly written programs don't check set*uid() return code, assuming
- * it never fails if called by root. We may still enforce NPROC limit
- * for programs doing set*uid()+execve() by harmlessly deferring the
- * failure to the execve() stage.
- */
- if (ucounts_limit_cmp(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) >= 0 &&
- new_user != INIT_USER &&
- !security_capable(new, &init_user_ns, CAP_SYS_RESOURCE, CAP_OPT_NONE) &&
- !security_capable(new, &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NONE))
- current->flags |= PF_NPROC_EXCEEDED;
- else
- current->flags &= ~PF_NPROC_EXCEEDED;
-
free_uid(new->user);
new->user = new_user;
return 0;
@@ -560,7 +545,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (retval < 0)
goto error;

- retval = set_cred_ucounts(new);
+ retval = set_cred_ucounts(new, &current->flags);
if (retval < 0)
goto error;

@@ -622,7 +607,7 @@ long __sys_setuid(uid_t uid)
if (retval < 0)
goto error;

- retval = set_cred_ucounts(new);
+ retval = set_cred_ucounts(new, &current->flags);
if (retval < 0)
goto error;

@@ -701,7 +686,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if (retval < 0)
goto error;

- retval = set_cred_ucounts(new);
+ retval = set_cred_ucounts(new, &current->flags);
if (retval < 0)
goto error;

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 6b2e3ca7ee99..f7eec0b0233b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1344,7 +1344,7 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));

- if (set_cred_ucounts(cred) < 0)
+ if (set_cred_ucounts(cred, NULL) < 0)
return -EINVAL;

return 0;
--
2.34.1


2022-02-09 10:05:38

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC PATCH 0/6] RLIMIT_NPROC in ucounts fixups

Michal Koutný <[email protected]> writes:

> This series is a result of looking deeper into breakage of
> tools/testing/selftests/rlimits/rlimits-per-userns.c after
> https://lore.kernel.org/r/[email protected]/
> is applied.
>
> The description of the original problem that lead to RLIMIT_NPROC et al.
> ucounts rewrite could be ambiguously interpretted as supporting either
> the case of:
> - never-fork service or
> - fork (RLIMIT_NPROC-1) times service.
>
> The scenario is weird anyway given existence of pids controller.
>
> The realization of that scenario relies not only on tracking number of
> processes per user_ns but also newly allows the root to override limit through
> set*uid. The commit message didn't mention that, so it's unclear if it
> was the intention too.
>
> I also noticed that the RLIMIT_NPROC enforcing in fork seems subject to TOCTOU
> race (check(nr_tasks),...,nr_tasks++) so the limit is rather advisory (but
> that's not a new thing related to ucounts rewrite).
>
> This series is RFC to discuss relevance of the subtle changes RLIMIT_NPROC to
> ucounts rewrite introduced.

A quick reply (because I don't have a lot of time at the moment).

I agree with the issues your first patch before this series addresses
and the issues the first 3 patches address.

I have not looked at the tests.

I actually disagree with most of your fixes. Both because of
intrusiveness and because of awkwardness. My basic problem with
your fixes is I don't think they leave the code in a more maintainable
state.

Hopefully later today I can propose some alternative fixes and we can
continue the discussion.


One thing I think you misunderstood is the capability checks in set_user
have always been there. There is a very good argument they are badly
placed so are not exactly checking the correct credentials. Especially
now.

Your patch 4/6 I don't think makes sense. It has always been the
case that root without capabilities is subject to the rlimit. If you
are in a user namespace you are root without capabilities.


Eric

2022-02-10 01:22:47

by Shuah Khan

[permalink] [raw]
Subject: Re: [RFC PATCH 5/6] selftests: Challenge RLIMIT_NPROC in user namespaces

On 2/7/22 5:17 AM, Michal Koutný wrote:
> The services are started in descendant user namepaces, each of them
> should honor the RLIMIT_NPROC that's passed during user namespace
> creation.
>
> main [user_ns_0]
> ` service [user_ns_1]
> ` worker 1
> ` worker 2
> ...
> ` worker k
> ...
> ` service [user_ns_n]
> ` worker 1
> ` worker 2
> ...
> ` worker k
>
> Test uses explicit synchronization, to make sure original parent's limit
> does not interfere with descendants.
>

Thank you for updating the test with the kernel updates. Please see
comments below. A bit of a concern with how long this test will run.
Did you time it?

> Signed-off-by: Michal Koutný <[email protected]>
> ---
> .../selftests/rlimits/rlimits-per-userns.c | 154 ++++++++++++++----
> 1 file changed, 125 insertions(+), 29 deletions(-)
>
> diff --git a/tools/testing/selftests/rlimits/rlimits-per-userns.c b/tools/testing/selftests/rlimits/rlimits-per-userns.c
> index 26dc949e93ea..54c1b345e42b 100644
> --- a/tools/testing/selftests/rlimits/rlimits-per-userns.c
> +++ b/tools/testing/selftests/rlimits/rlimits-per-userns.c
> @@ -9,7 +9,9 @@
> #include <sys/resource.h>
> #include <sys/prctl.h>
> #include <sys/stat.h>
> +#include <sys/socket.h>
>
> +#include <assert.h>
> #include <unistd.h>
> #include <stdlib.h>
> #include <stdio.h>
> @@ -21,38 +23,74 @@
> #include <errno.h>
> #include <err.h>
>
> -#define NR_CHILDS 2
> +#define THE_LIMIT 4
> +#define NR_CHILDREN 5
> +
> +static_assert(NR_CHILDREN >= THE_LIMIT-1, "Need slots for limit-1 children.");
>
> static char *service_prog;
> static uid_t user = 60000;
> static uid_t group = 60000;
> +static struct rlimit saved_limit;
> +
> +/* Two uses: main and service */
> +static pid_t child[NR_CHILDREN];
> +static pid_t pid;
>
> static void setrlimit_nproc(rlim_t n)
> {
> - pid_t pid = getpid();
> struct rlimit limit = {
> .rlim_cur = n,
> .rlim_max = n
> };
> -
> - warnx("(pid=%d): Setting RLIMIT_NPROC=%ld", pid, n);
> + if (getrlimit(RLIMIT_NPROC, &saved_limit) < 0)
> + err(EXIT_FAILURE, "(pid=%d): getrlimit(RLIMIT_NPROC)", pid);
>
> if (setrlimit(RLIMIT_NPROC, &limit) < 0)
> err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC)", pid);
> +
> + warnx("(pid=%d): Set RLIMIT_NPROC=%ld", pid, n);
> +}
> +
> +static void restore_rlimit_nproc(void)
> +{
> + if (setrlimit(RLIMIT_NPROC, &saved_limit) < 0)
> + err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC, saved)", pid);
> + warnx("(pid=%d) Restored RLIMIT_NPROC", pid);
> }
>
> -static pid_t fork_child(void)
> +enum msg_sync {
> + UNSHARE,
> + RLIMIT_RESTORE,
> +};
> +
> +static void sync_notify(int fd, enum msg_sync m)
> {
> - pid_t pid = fork();
> + char tmp = m;
> +
> + if (write(fd, &tmp, 1) < 0)
> + warnx("(pid=%d): failed sync-write", pid);
> +}
>
> - if (pid < 0)
> +static void sync_wait(int fd, enum msg_sync m)
> +{
> + char tmp;
> +
> + if (read(fd, &tmp, 1) < 0)
> + warnx("(pid=%d): failed sync-read", pid);
> +}
> +
> +static pid_t fork_child(int control_fd)
> +{
> + pid_t new_pid = fork();
> +
> + if (new_pid < 0)
> err(EXIT_FAILURE, "fork");
>
> - if (pid > 0)
> - return pid;
> + if (new_pid > 0)
> + return new_pid;
>
> pid = getpid();
> -
> warnx("(pid=%d): New process starting ...", pid);
>
> if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
> @@ -73,6 +111,9 @@ static pid_t fork_child(void)
> if (unshare(CLONE_NEWUSER) < 0)
> err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");
>
> + sync_notify(control_fd, UNSHARE);
> + sync_wait(control_fd, RLIMIT_RESTORE);
> +
> char *const argv[] = { "service", NULL };
> char *const envp[] = { "I_AM_SERVICE=1", NULL };
>
> @@ -82,37 +123,92 @@ static pid_t fork_child(void)
> err(EXIT_FAILURE, "(pid=%d): execve", pid);
> }
>
> +static void run_service(void)
> +{
> + size_t i;
> + int ret = EXIT_SUCCESS;
> + struct rlimit limit;
> + char user_ns[PATH_MAX];
> +
> + if (getrlimit(RLIMIT_NPROC, &limit) < 0)
> + err(EXIT_FAILURE, "(pid=%d) failed getrlimit", pid);
> + if (readlink("/proc/self/ns/user", user_ns, PATH_MAX) < 0)
> + err(EXIT_FAILURE, "(pid=%d) failed readlink", pid);
> +
> + warnx("(pid=%d) Service instance attempts %i children, limit %lu:%lu, ns=%s",
> + pid, THE_LIMIT, limit.rlim_cur, limit.rlim_max, user_ns);
> +
> + /* test rlimit inside the service, effectively THE_LIMIT-1 becaue of service itself */
> + for (i = 0; i < THE_LIMIT; i++) {
> + child[i] = fork();
> + if (child[i] == 0) {
> + /* service child */
> + pause();
> + exit(EXIT_SUCCESS);
> + }
> + if (child[i] < 0) {
> + warnx("(pid=%d) service fork %lu failed, errno = %i", pid, i+1, errno);
> + if (!(i == THE_LIMIT-1 && errno == EAGAIN))
> + ret = EXIT_FAILURE;
> + } else if (i == THE_LIMIT-1) {
> + warnx("(pid=%d) RLIMIT_NPROC not honored", pid);
> + ret = EXIT_FAILURE;
> + }
> + }
> +
> + /* service cleanup */
> + for (i = 0; i < THE_LIMIT; i++)
> + if (child[i] > 0)
> + kill(child[i], SIGUSR1);
> +
> + for (i = 0; i < THE_LIMIT; i++)
> + if (child[i] > 0)
> + waitpid(child[i], NULL, WNOHANG);
> +
> + if (ret)
> + exit(ret);
> + pause();
> +}
> +
> int main(int argc, char **argv)
> {
> size_t i;
> - pid_t child[NR_CHILDS];
> - int wstatus[NR_CHILDS];
> - int childs = NR_CHILDS;
> - pid_t pid;
> + int control_fd[NR_CHILDREN];
> + int wstatus[NR_CHILDREN];
> + int children = NR_CHILDREN;
> + int sockets[2];
> +
> + pid = getpid();
>
> if (getenv("I_AM_SERVICE")) {
> - pause();
> - exit(EXIT_SUCCESS);
> + run_service();
> + exit(EXIT_FAILURE);

Why is this a failure unconditionally?

> }
>
> service_prog = argv[0];
> - pid = getpid();
>
> warnx("(pid=%d) Starting testcase", pid);
>
> - /*
> - * This rlimit is not a problem for root because it can be exceeded.
> - */
> - setrlimit_nproc(1);
> -
> - for (i = 0; i < NR_CHILDS; i++) {
> - child[i] = fork_child();
> + setrlimit_nproc(THE_LIMIT);
> + for (i = 0; i < NR_CHILDREN; i++) {
> + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, sockets) < 0)
> + err(EXIT_FAILURE, "(pid=%d) socketpair failed", pid);
> + control_fd[i] = sockets[0];
> + child[i] = fork_child(sockets[1]);
> wstatus[i] = 0;
> + }
> +
> + for (i = 0; i < NR_CHILDREN; i++)
> + sync_wait(control_fd[i], UNSHARE);
> + restore_rlimit_nproc();
> +
> + for (i = 0; i < NR_CHILDREN; i++) {
> + sync_notify(control_fd[i], RLIMIT_RESTORE);
> usleep(250000);

How long does this test now run for with this loop?

> }
>
> while (1) {
> - for (i = 0; i < NR_CHILDS; i++) {
> + for (i = 0; i < NR_CHILDREN; i++) {
> if (child[i] <= 0)
> continue;
>
> @@ -126,22 +222,22 @@ int main(int argc, char **argv)
> warn("(pid=%d): waitpid(%d)", pid, child[i]);
>
> child[i] *= -1;
> - childs -= 1;
> + children -= 1;
> }
>
> - if (!childs)
> + if (!children)
> break;
>
> usleep(250000);
>
> - for (i = 0; i < NR_CHILDS; i++) {
> + for (i = 0; i < NR_CHILDREN; i++) {
> if (child[i] <= 0)
> continue;
> kill(child[i], SIGUSR1);
> }
> }
>
> - for (i = 0; i < NR_CHILDS; i++) {
> + for (i = 0; i < NR_CHILDREN; i++) {
> if (WIFEXITED(wstatus[i]))
> warnx("(pid=%d): pid %d exited, status=%d",
> pid, -child[i], WEXITSTATUS(wstatus[i]));
>

Please add a few more comments in the code path.

thanks,
-- Shuah

2022-02-10 01:26:06

by Shuah Khan

[permalink] [raw]
Subject: Re: [RFC PATCH 6/6] selftests: Test RLIMIT_NPROC in clone-created user namespaces

On 2/7/22 5:18 AM, Michal Koutný wrote:
> Verify RLIMIT_NPROC observance in user namespaces also in the
> clone(CLONE_NEWUSER) path.
> Note the such a user_ns is created by the privileged user.
>

Does this test run in non-privileged user mode? If it doesn't,
let's add a check and skip the test.

> Signed-off-by: Michal Koutný <[email protected]>

thanks,
-- Shuah

2022-02-10 02:07:06

by Solar Designer

[permalink] [raw]
Subject: Re: [RFC PATCH 1/6] set_user: Perform RLIMIT_NPROC capability check against new user credentials

Hi Michal,

On Mon, Feb 07, 2022 at 01:17:55PM +0100, Michal Koutný wrote:
> The check is currently against the current->cred but since those are
> going to change and we want to check RLIMIT_NPROC condition after the
> switch, supply the capability check with the new cred.
> But since we're checking new_user being INIT_USER any new cred's
> capability-based allowance may be redundant when the check fails and the
> alternative solution would be revert of the commit 2863643fb8b9
> ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds")
>
> Fixes: 2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds")
>
> Cc: Solar Designer <[email protected]>
> Cc: Christian Brauner <[email protected]>
> Signed-off-by: Michal Koutný <[email protected]>
> ---
> kernel/sys.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 8ea20912103a..48c90dcceff3 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -481,7 +481,8 @@ static int set_user(struct cred *new)
> */
> if (ucounts_limit_cmp(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) >= 0 &&
> new_user != INIT_USER &&
> - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
> + !security_capable(new, &init_user_ns, CAP_SYS_RESOURCE, CAP_OPT_NONE) &&
> + !security_capable(new, &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NONE))
> current->flags |= PF_NPROC_EXCEEDED;
> else
> current->flags &= ~PF_NPROC_EXCEEDED;

Thank you for working on this and CC'ing me on it. This is related to
the discussion Christian and I had in September:

https://lore.kernel.org/all/20210913100140.bxqlg47pushoqa3r@wittgenstein/

Christian was going to revert 2863643fb8b9, but apparently that never
happened. Back then, I also suggested:

"Alternatively, we could postpone the set_user() calls until we're
running with the new user's capabilities, but that's an invasive change
that's likely to create its own issues."

The change you propose above is similar to that, but is more limited and
non-invasive. That looks good to me.

However, I think you need to drop the negations of the return value from
security_capable(). security_capable() returns 0 or -EPERM, while
capable() returns a bool, in kernel/capability.c: ns_capable_common():

capable = security_capable(current_cred(), ns, cap, opts);
if (capable == 0) {
current->flags |= PF_SUPERPRIV;
return true;
}
return false;

Also, your change would result in this no longer setting PF_SUPERPRIV.
This may be fine, but you could want to document it.

On a related note, this comment in security/commoncap.c needs an update:

* NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
* and has_capability() functions. That is, it has the reverse semantics:
* cap_has_capability() returns 0 when a task has a capability, but the
* kernel's capable() and has_capability() returns 1 for this case.

cap_has_capability() doesn't actually exist, and perhaps the comment
should refer to cap_capable().

Alexander

2022-02-10 02:08:58

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC PATCH 1/6] set_user: Perform RLIMIT_NPROC capability check against new user credentials

Solar Designer <[email protected]> writes:

> Hi Michal,
>
> On Mon, Feb 07, 2022 at 01:17:55PM +0100, Michal Koutný wrote:
>> The check is currently against the current->cred but since those are
>> going to change and we want to check RLIMIT_NPROC condition after the
>> switch, supply the capability check with the new cred.
>> But since we're checking new_user being INIT_USER any new cred's
>> capability-based allowance may be redundant when the check fails and the
>> alternative solution would be revert of the commit 2863643fb8b9
>> ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds")
>>
>> Fixes: 2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds")
>>
>> Cc: Solar Designer <[email protected]>
>> Cc: Christian Brauner <[email protected]>
>> Signed-off-by: Michal Koutný <[email protected]>
>> ---
>> kernel/sys.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index 8ea20912103a..48c90dcceff3 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -481,7 +481,8 @@ static int set_user(struct cred *new)
>> */
>> if (ucounts_limit_cmp(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) >= 0 &&
>> new_user != INIT_USER &&
>> - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
>> + !security_capable(new, &init_user_ns, CAP_SYS_RESOURCE, CAP_OPT_NONE) &&
>> + !security_capable(new, &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NONE))
>> current->flags |= PF_NPROC_EXCEEDED;
>> else
>> current->flags &= ~PF_NPROC_EXCEEDED;
>
> Thank you for working on this and CC'ing me on it. This is related to
> the discussion Christian and I had in September:
>
> https://lore.kernel.org/all/20210913100140.bxqlg47pushoqa3r@wittgenstein/
>
> Christian was going to revert 2863643fb8b9, but apparently that never
> happened. Back then, I also suggested:
>
> "Alternatively, we could postpone the set_user() calls until we're
> running with the new user's capabilities, but that's an invasive change
> that's likely to create its own issues."

I really think we need to do something like that. Probably just set a
flag in commit_creds and test later.

I was working on fixes that looked cleaner and I just recently realized
that the test in fork is almost as bad. The function has_capability can
be used but the same kind of problems exist.

I thought I was very quickly going to have patches to post but I need
to redo everything now that I have noticed the issue in fork, so it will
be a day or so.

Eric


> The change you propose above is similar to that, but is more limited and
> non-invasive. That looks good to me.
>
> However, I think you need to drop the negations of the return value from
> security_capable(). security_capable() returns 0 or -EPERM, while
> capable() returns a bool, in kernel/capability.c: ns_capable_common():
>
> capable = security_capable(current_cred(), ns, cap, opts);
> if (capable == 0) {
> current->flags |= PF_SUPERPRIV;
> return true;
> }
> return false;
>
> Also, your change would result in this no longer setting PF_SUPERPRIV.
> This may be fine, but you could want to document it.
>
> On a related note, this comment in security/commoncap.c needs an update:
>
> * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
> * and has_capability() functions. That is, it has the reverse semantics:
> * cap_has_capability() returns 0 when a task has a capability, but the
> * kernel's capable() and has_capability() returns 1 for this case.
>
> cap_has_capability() doesn't actually exist, and perhaps the comment
> should refer to cap_capable().
>
> Alexander

2022-02-10 05:40:43

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC PATCH 4/6] ucounts: Allow root to override RLIMIT_NPROC

Michal Koutný <[email protected]> writes:

> Call sites of ucounts_limit_cmp() would allow the global root or capable
> user to bypass RLIMIT_NPROC on the bottom level of user_ns tree by not
> looking at ucounts at all.
>
> As the traversal up the user_ns tree continues, the ucounts to which the
> task is charged may switch the owning user (to the creator of user_ns).
> If the new chargee is root, we don't really care about RLIMIT_NPROC
> observation, so lift the limit to the max.
>
> The result is that an unprivileged user U can globally run more than
> RLIMIT_NPROC (of user_ns) tasks but within each user_ns it is still
> limited to RLIMIT_NPROC (as passed into task->signal->rlim) iff the
> user_nss are created by the privileged user.

My apologies. When I first looked at this I thought this change was
nonsense. However, I had missed the special logic that happens with
RLIMIT_NPROC to carefully allow the root user to bypass the NPROC
limits.

So yes this does look like something that needs to be addressed as well.

Thank you for reporting all of these issues.

Eric

>
> Signed-off-by: Michal Koutný <[email protected]>
> ---
> kernel/ucount.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/kernel/ucount.c b/kernel/ucount.c
> index 53ccd96387dd..f52b7273a572 100644
> --- a/kernel/ucount.c
> +++ b/kernel/ucount.c
> @@ -356,6 +356,9 @@ long ucounts_limit_cmp(struct ucounts *ucounts, enum ucount_type type, unsigned
> if (excess > 0)
> return excess;
> max = READ_ONCE(iter->ns->ucount_max[type]);
> + /* Next ucounts owned by root? RLIMIT_NPROC is moot */
> + if (type == UCOUNT_RLIMIT_NPROC && uid_eq(iter->ns->owner, GLOBAL_ROOT_UID))
> + max = LONG_MAX;
> }
> return excess;
> }

2022-02-11 15:50:01

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 0/8] ucounts: RLIMIT_NPROC fixes


Michal Koutný recently found some bugs in the enforcement of
RLIMIT_NPROC in the recent ucount rlimit implementation.

I saw some additional bugs and some cleaner ways to fix the problem so
instead of starting with his fixes these are my own.

I am aiming to send the first 5 of these to Linus once they have been
reviewed. Two more are fixes in principle but I don't think they do
anything in practice. The last one is just a cleanup to prevent future
divergence of RLIMIT_NPROC logic.

Eric W. Biederman (8):
ucounts: Fix RLIMIT_NPROC regression
ucounts: Fix set_cred_ucounts
ucounts: Fix and simplify RLIMIT_NPROC handling during setuid()+execve
ucounts: Only except the root user in init_user_ns from RLIMIT_NPROC
ucounts: Handle wrapping in is_ucounts_overlimit
ucounts: Handle inc_rlimit_ucounts wrapping in fork
rlimit: For RLIMIT_NPROC test the child not the parent for capabilites
ucounts: Use the same code to enforce RLIMIT_NPROC in fork and exec

fs/exec.c | 12 +++++-------
include/linux/sched.h | 2 +-
include/linux/sched/signal.h | 2 ++
kernel/cred.c | 24 +++++++++++-------------
kernel/fork.c | 32 ++++++++++++++++++++++++--------
kernel/sys.c | 14 --------------
kernel/ucount.c | 3 ++-
kernel/user_namespace.c | 2 ++
8 files changed, 47 insertions(+), 44 deletions(-)

Eric

2022-02-11 22:07:50

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/8] ucounts: RLIMIT_NPROC fixes

Shuah Khan <[email protected]> writes:

> On 2/10/22 7:01 PM, Eric W. Biederman wrote:
>> Michal Koutný recently found some bugs in the enforcement of
>> RLIMIT_NPROC in the recent ucount rlimit implementation.
>> I saw some additional bugs and some cleaner ways to fix the problem so
>> instead of starting with his fixes these are my own.
>> I am aiming to send the first 5 of these to Linus once they have been
>> reviewed. Two more are fixes in principle but I don't think do anything
>> in practice. The last one is just a cleanup to prevent future
>> divergence of RLIMIT_NPROC logic.
>> Eric W. Biederman (8):
>> ucounts: Fix RLIMIT_NPROC regression
>> ucounts: Fix set_cred_ucounts
>> ucounts: Fix and simplify RLIMIT_NPROC handling during setuid()+execve
>> ucounts: Only except the root user in init_user_ns from RLIMIT_NPROC
>
> Should this be "Only exempt"?

Yes.
>> ucounts: Handle wrapping in is_ucounts_overlimit
>> ucounts: Handle inc_rlimit_ucounts wrapping in fork
>> rlimit: For RLIMIT_NPROC test the child not the parent for capabilites
>> ucounts: Use the same code to enforce RLIMIT_NPROC in fork and exec
>> fs/exec.c | 12 +++++-------
>> include/linux/sched.h | 2 +-
>> include/linux/sched/signal.h | 2 ++
>> kernel/cred.c | 24 +++++++++++-------------
>> kernel/fork.c | 32 ++++++++++++++++++++++++--------
>> kernel/sys.c | 14 --------------
>> kernel/ucount.c | 3 ++-
>> kernel/user_namespace.c | 2 ++
>> 8 files changed, 47 insertions(+), 44 deletions(-)
>> Eric
>>
>
> Do we need updates to selftests - Michal's patch series included changes to
> selftests/exec

selftests would be good.

Eric


2022-02-12 05:52:28

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 7/8] rlimit: For RLIMIT_NPROC test the child not the parent for capabilites

Ever since capabilities have become user namespace relative, the
capability checks to allow overriding RLIMIT_NPROC in fork have been
wrong. It is desirable to test the capabilities the new process will
have, not to test the capabilities of the existing process. In all
cases except when creating a user namespace this does not matter, and
even then not enforcing RLIMIT_NPROC on the root_user probably makes
such a test moot.

Still the test is wrong in principle so fix it to the more stringent test.

Especially now that RLIMIT_NPROC enforcement has become per user namespace.

Fixes: 3486740a4f32 ("userns: security: make capabilities relative to the user namespace")
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
kernel/fork.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 69333078259c..79661678a5bf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2030,7 +2030,8 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cleanup_count;
if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if ((task_ucounts(p) != &init_ucounts) &&
- !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
+ !has_capability(p, CAP_SYS_RESOURCE) &&
+ !has_capability(p, CAP_SYS_ADMIN))
goto bad_fork_cleanup_count;
}
current->flags &= ~PF_NPROC_CHECK;
--
2.29.2

2022-02-13 19:59:43

by Solar Designer

[permalink] [raw]
Subject: Re: [RFC PATCH 1/6] set_user: Perform RLIMIT_NPROC capability check against new user credentials

On Fri, Feb 11, 2022 at 02:32:47PM -0600, Eric W. Biederman wrote:
> Solar Designer <[email protected]> writes:
> > https://lore.kernel.org/all/20210913100140.bxqlg47pushoqa3r@wittgenstein/
> >
> > Christian was going to revert 2863643fb8b9, but apparently that never
> > happened. Back then, I also suggested:
> >
> > "Alternatively, we could postpone the set_user() calls until we're
> > running with the new user's capabilities, but that's an invasive change
> > that's likely to create its own issues."
>
> Back then you mentioned that apache suexec was broken. Do you have
> any more details?
>
> I would like to make certain the apache suexec issue is fixed but
> without a few details I can't do that. I tried looking but I can't
> find a public report about apache suexec being broken.

I'm not aware of anyone actually running into this issue and reporting
it. The systems that I personally know use suexec along with rlimits
still run older/distro kernels, so would not yet be affected.

So my mention was based on my understanding of how suexec works, and
code review. Specifically, Apache httpd has the setting RLimitNPROC,
which makes it set RLIMIT_NPROC:

https://httpd.apache.org/docs/2.4/mod/core.html#rlimitnproc

The above documentation for it includes:

"This applies to processes forked from Apache httpd children servicing
requests, not the Apache httpd children themselves. This includes CGI
scripts and SSI exec commands, but not any processes forked from the
Apache httpd parent, such as piped logs."

In code, there are:

./modules/generators/mod_cgid.c: ( (cgid_req.limits.limit_nproc_set) && ((rc = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC,
./modules/generators/mod_cgi.c: ((rc = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC,
./modules/filters/mod_ext_filter.c: rv = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC, conf->limit_nproc);

For example, in mod_cgi.c this is in run_cgi_child().

I think this means an httpd child sets RLIMIT_NPROC shortly before it
execs suexec, which is a SUID root program. suexec then switches to the
target user and execs the CGI script.

Before 2863643fb8b9, the setuid() in suexec would set the flag, and the
target user's process count would be checked against RLIMIT_NPROC on
execve(). After 2863643fb8b9, the setuid() in suexec wouldn't set the
flag because setuid() is (naturally) called when the process is still
running as root (thus, has those limits bypass capabilities), and
accordingly execve() would not check the target user's process count
against RLIMIT_NPROC.

> My goal is to come up with a very careful and conservative set of
> patches that fix all of the known issues with RLIMIT_NPROC.

The most conservative fix for this one would be to revert 2863643fb8b9
(preserving other changes that were made on top of it). I think this
commit did not fix a real issue - it attempted to fix what someone
thought was a discrepancy, but actually made it worse.

However, your recent patch trying to fix that commit looks like it'd
also repair the behavior for suexec.

Thanks,

Alexander

2022-02-14 01:12:20

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC PATCH 1/6] set_user: Perform RLIMIT_NPROC capability check against new user credentials

Solar Designer <[email protected]> writes:

> Hi Michal,
>
> On Mon, Feb 07, 2022 at 01:17:55PM +0100, Michal Koutný wrote:
>> The check is currently against the current->cred but since those are
>> going to change and we want to check RLIMIT_NPROC condition after the
>> switch, supply the capability check with the new cred.
>> But since we're checking new_user being INIT_USER any new cred's
>> capability-based allowance may be redundant when the check fails and the
>> alternative solution would be revert of the commit 2863643fb8b9
>> ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds")
>>
>> Fixes: 2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds")
>>
>> Cc: Solar Designer <[email protected]>
>> Cc: Christian Brauner <[email protected]>
>> Signed-off-by: Michal Koutný <[email protected]>
>> ---
>> kernel/sys.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index 8ea20912103a..48c90dcceff3 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -481,7 +481,8 @@ static int set_user(struct cred *new)
>> */
>> if (ucounts_limit_cmp(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) >= 0 &&
>> new_user != INIT_USER &&
>> - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
>> + !security_capable(new, &init_user_ns, CAP_SYS_RESOURCE, CAP_OPT_NONE) &&
>> + !security_capable(new, &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NONE))
>> current->flags |= PF_NPROC_EXCEEDED;
>> else
>> current->flags &= ~PF_NPROC_EXCEEDED;
>
> Thank you for working on this and CC'ing me on it. This is related to
> the discussion Christian and I had in September:
>
> https://lore.kernel.org/all/20210913100140.bxqlg47pushoqa3r@wittgenstein/
>
> Christian was going to revert 2863643fb8b9, but apparently that never
> happened. Back then, I also suggested:
>
> "Alternatively, we could postpone the set_user() calls until we're
> running with the new user's capabilities, but that's an invasive change
> that's likely to create its own issues."

Back then you mentioned that apache suexec was broken. Do you have
any more details?

I would like to make certain the apache suexec issue is fixed but
without a few details I can't do that. I tried looking but I can't
find a public report about apache suexec being broken.

My goal is to come up with a very careful and conservative set of
patches that fix all of the known issues with RLIMIT_NPROC.

Eric

2022-02-14 08:19:49

by Shuah Khan

[permalink] [raw]
Subject: Re: [PATCH 0/8] ucounts: RLIMIT_NPROC fixes

On 2/10/22 7:01 PM, Eric W. Biederman wrote:
>
> Michal Koutný recently found some bugs in the enforcement of
> RLIMIT_NPROC in the recent ucount rlimit implementation.
>
> I saw some additional bugs and some cleaner ways to fix the problem so
> instead of starting with his fixes these are my own.
>
> I am aiming to send the first 5 of these to Linus once they have been
> reviewed. Two more are fixes in principle but I don't think do anything
> in practice. The last one is just a cleanup to prevent future
> divergence of RLIMIT_NPROC logic.
>
> Eric W. Biederman (8):
> ucounts: Fix RLIMIT_NPROC regression
> ucounts: Fix set_cred_ucounts
> ucounts: Fix and simplify RLIMIT_NPROC handling during setuid()+execve
> ucounts: Only except the root user in init_user_ns from RLIMIT_NPROC

Should this be "Only exempt"?

> ucounts: Handle wrapping in is_ucounts_overlimit
> ucounts: Handle inc_rlimit_ucounts wrapping in fork
> rlimit: For RLIMIT_NPROC test the child not the parent for capabilites
> ucounts: Use the same code to enforce RLIMIT_NPROC in fork and exec
>
> fs/exec.c | 12 +++++-------
> include/linux/sched.h | 2 +-
> include/linux/sched/signal.h | 2 ++
> kernel/cred.c | 24 +++++++++++-------------
> kernel/fork.c | 32 ++++++++++++++++++++++++--------
> kernel/sys.c | 14 --------------
> kernel/ucount.c | 3 ++-
> kernel/user_namespace.c | 2 ++
> 8 files changed, 47 insertions(+), 44 deletions(-)
>
> Eric
>

Do we need updates to selftests - Michal's patch series included changes to
selftests/exec

thanks,
-- Shuah

2022-02-14 09:37:43

by Etienne Dechamps

[permalink] [raw]
Subject: Re: [RFC PATCH 0/6] RLIMIT_NPROC in ucounts fixups

Hello there,

On 07/02/2022 12:17, Michal Koutný wrote:
> This series is a result of looking deeper into breakage of
> tools/testing/selftests/rlimits/rlimits-per-userns.c after
> https://lore.kernel.org/r/[email protected]/
> is applied.

Pardon the intrusion, but I thought you might be interested to know that
as a humble user I noticed actual user-visible breakage from 59ec715
"ucounts: Fix rlimit max values check":
https://bugzilla.kernel.org/show_bug.cgi?id=215596

I'm not sure I understand everything that's going on in this thread but
it does seem very relevant. You guys might want to double-check the
behavior in the particular scenario described there. I'm mostly sending
this to make sure everything is cross-linked.

2022-02-15 11:01:57

by Michal Koutný

[permalink] [raw]
Subject: Re: [RFC PATCH 6/6] selftests: Test RLIMIT_NPROC in clone-created user namespaces

On Wed, Feb 09, 2022 at 06:25:34PM -0700, Shuah Khan <[email protected]> wrote:
> Does this test run in non-privileged user mode? If it doesn't
> let add a check and skip the test.

It requires user namespaces created by the privileged user (to bypass
RLIMIT_NPROC on the top level).

I'll add the check into code.

Michal

2022-02-15 12:38:57

by Michal Koutný

[permalink] [raw]
Subject: Re: [RFC PATCH 1/6] set_user: Perform RLIMIT_NPROC capability check against new user credentials

On Thu, Feb 10, 2022 at 02:14:05AM +0100, Solar Designer <[email protected]> wrote:
> However, I think you need to drop the negations of the return value from
> security_capable().
> security_capable() returns 0 or -EPERM, while capable() returns a
> bool, in kernel/capability.c: ns_capable_common():

Oops. Yeah, I only blindly applied replacement with a predicate for
(new) cred and overlooked this inverse semantics. Thanks for pointing
that out to me!

Nevertheless, this will likely be incorporated via Eric's series
anyway.


Michal

2022-02-15 15:08:00

by Michal Koutný

[permalink] [raw]
Subject: Re: [PATCH 0/8] ucounts: RLIMIT_NPROC fixes

On Fri, Feb 11, 2022 at 11:22:13AM -0700, Shuah Khan <[email protected]> wrote:
> Do we need updates to selftests - Michal's patch series included changes to
> selftests/exec

In my understanding the original rlimits-per-userns.c covers an invalid
use case -- clone(0);setuid();unshare(CLONE_NEWUSER) -- where the
created user_ns is owned by unprivileged user and the global
RLIMIT_NPROC cannot be breached.

My patched variant retains this use-case (should fail) and adds
clone(CLONE_NEWUSER);setuid() [1] variant which should be the valid
use-case for per-user per-user-ns RLIMIT_NPROC.

Michal

[1] In this situation theoretically equivalent to clone(0);unshare(CLONE_NEWUSER);setuid().

2022-02-15 16:47:14

by Michal Koutný

[permalink] [raw]
Subject: Re: [RFC PATCH 0/6] RLIMIT_NPROC in ucounts fixups

On Sat, Feb 12, 2022 at 03:32:30PM +0000, Etienne Dechamps <[email protected]> wrote:
> I'm not sure I understand everything that's going on in this thread but it
> does seem very relevant. You guys might want to double-check the behavior in
> the particular scenario described there. I'm mostly sending this to make
> sure everything is cross-linked.

Thanks for the report with strace.

AFAICT, it's caused by setresuid() after unshare(), i.e. all root's
tasks are (wrongly) compared against the lowered RLIMIT_NPROC.

This is tackled by my RFC patch 2/6 [1] or by Eric's variant, patch 3/8
(an equivalent fix for this case, but I haven't run that build).

Michal

[1] I could run your test (LimitNPROC=1 actually) against kernel with my
patches and the service starts.

2022-02-16 06:21:04

by Michal Koutný

[permalink] [raw]
Subject: Re: [RFC PATCH 5/6] selftests: Challenge RLIMIT_NPROC in user namespaces

On Wed, Feb 09, 2022 at 06:22:18PM -0700, Shuah Khan <[email protected]> wrote:
> Please see comments below. A bit of a concern with how long this test
> will run. Did you time it?

It runs around 1 s; I didn't measure it and I used it manually only.

> How long does this test now run for with this loop?

I kept this sleep to space output from individual tasks for
better readability of output. It's not necessary for the sake of the
test. I'll remove it in next version.

> > - for (i = 0; i < NR_CHILDS; i++) {
> > + for (i = 0; i < NR_CHILDREN; i++) {
> > if (WIFEXITED(wstatus[i]))
> > warnx("(pid=%d): pid %d exited, status=%d",
> > pid, -child[i], WEXITSTATUS(wstatus[i]));
> >
>
> Please a add few more comments in the code path.

Hehe, this is inherited from the original version. (True, it's not
overly clear on its own.)

Michal

2022-02-16 19:57:39

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH v2 0/5] ucounts: RLIMIT_NPROC fixes


Michal Koutný recently found some bugs in the enforcement of
RLIMIT_NPROC in the recent ucount rlimit implementation.

I saw some additional bugs and some cleaner ways to fix the problem so
instead of starting with his fixes these are my own.

I have rewritten about half my fixes since the last time this was
posted. There is this notion (not entirely wrong) that the code should
be consistent and make sense. When I dug in I discovered that has not
been the case for the last 20 years. Fixing the long standing
inconsistencies is something that seems to warrant wider vetting on
linux-api.

So with this set of patches I have developed a very conservative
approach changing only what is necessary to fix the bugs that I can
see clearly. Cleanups and anything that is making the code more
consistent can follow after we have the code working as it has
historically.

Anyone who can, please take a look and tell me if I am doing something silly.

Eric W. Biederman (5):
rlimit: Fix RLIMIT_NPROC enforcement failure caused by capability calls in set_user
ucounts: Enforce RLIMIT_NPROC not RLIMIT_NPROC+1
ucounts: Base set_cred_ucounts changes on the real user
ucounts: Move RLIMIT_NPROC handling after set_user
ucounts: Handle wrapping in is_ucounts_overlimit

kernel/cred.c | 9 ++-------
kernel/fork.c | 10 +++++-----
kernel/sys.c | 20 ++++++++++++++------
kernel/ucount.c | 3 ++-
4 files changed, 23 insertions(+), 19 deletions(-)

Eric

2022-02-18 18:35:56

by Eric W. Biederman

[permalink] [raw]
Subject: [GIT PULL] ucounts: RLIMIT_NPROC fixes for v5.17


Linus,

Please pull the ucount-rlimit-fixes-for-v5.17 branch from the git tree:

git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git ucount-rlimit-fixes-for-v5.17

HEAD: 0cbae9e24fa7d6c6e9f828562f084da82217a0c5 ucounts: Handle wrapping in is_ucounts_overlimit

Michal Koutný recently found some bugs in the enforcement of
RLIMIT_NPROC in the recent ucount rlimit implementation.

I saw some additional bugs and some cleaner ways to fix the problem so
instead of starting with his fixes these are my own.

I have rewritten about half my fixes since the last time this was
posted. There is this notion (not entirely wrong) that the code should
be consistent and make sense. When I dug in I discovered that has not
been the case for the last 20 years. Fixing the long standing
inconsistencies is something that seems to warrant wider vetting on
linux-api.

So with this set of patches I have developed a very conservative
approach changing only what is necessary to fix the bugs that I can
see clearly. Cleanups and anything that is making the code more
consistent can follow after we have the code working as it has
historically.

I had hoped to let this sit in linux-next for a few days just to be
doubly certain all is well. But these patches are all trivial and
linux-next is on holiday.

v2: https://lkml.kernel.org/r/[email protected]>
v1: https://lkml.kernel.org/r/[email protected]>

Eric W. Biederman (5):
rlimit: Fix RLIMIT_NPROC enforcement failure caused by capability calls in set_user
ucounts: Enforce RLIMIT_NPROC not RLIMIT_NPROC+1
ucounts: Base set_cred_ucounts changes on the real user
ucounts: Move RLIMIT_NPROC handling after set_user
ucounts: Handle wrapping in is_ucounts_overlimit

kernel/cred.c | 9 ++-------
kernel/fork.c | 10 +++++-----
kernel/sys.c | 20 ++++++++++++++------
kernel/ucount.c | 3 ++-
4 files changed, 23 insertions(+), 19 deletions(-)

p.s. I should say that the problem is not so much inconsistencies
(although those exist) but that it is very difficult to figure out what
the code should be doing in the case of RLIMIT_NPROC.

All other rlimits are only enforced where the resource is acquired
(allocated). RLIMIT_NPROC by necessity needs to be enforced in
an additional location, and our current implementation stumbled
its way into that implementation.

Eric




2022-02-21 09:22:28

by pr-tracker-bot

[permalink] [raw]
Subject: Re: [GIT PULL] ucounts: RLIMIT_NPROC fixes for v5.17

The pull request you sent on Fri, 18 Feb 2022 09:34:24 -0600:

> git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git ucount-rlimit-fixes-for-v5.17

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/2d3409ebc87f4bc4ed23bd39e78db9ffc29eec44

Thank you!

--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html

2022-02-23 02:00:23

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC PATCH 0/6] RLIMIT_NPROC in ucounts fixups

Michal Koutný <[email protected]> writes:

> On Sat, Feb 12, 2022 at 03:32:30PM +0000, Etienne Dechamps <[email protected]> wrote:
>> I'm not sure I understand everything that's going on in this thread but it
>> does seem very relevant. You guys might want to double-check the behavior in
>> the particular scenario described there. I'm mostly sending this to make
>> sure everything is cross-linked.
>
> Thanks for the report with strace.
>
> AFAICT, it's caused by setresuid() after unshare(), i.e. all root's
> tasks are (wrongly) compared against the lowered RLIMIT_NPROC.
>
> This is tackled by my RFC patch 2/6 [1] or Eric's variant but 3/8
> (equivalent fix for this case but I haven't run that build).
>
> Michal
>
> [1] I could run your test (LimitNPROC=1 actually) against kernel with my
> patches and the service starts.


So I looked into this and our previous patchsets (but not my final one)
did resolve this.

What fixed it and what is needed to fix this is not enforcing
RLIMIT_NPROC when the user who creates the user namespace is INIT_USER.

AKA something like the patch below. It is a regression so if at all
possible it needs to be fixed, and it is certainly possible.

The patch below feels right at first glance, but I am not convinced that
testing cred->user or cred->ucounts is the proper test so I am going to
sleep on this a little bit.

I did want everyone to know I looked into this and I am going to ensure
this gets fixed.

diff --git a/kernel/fork.c b/kernel/fork.c
index 17d8a8c85e3b..532ce5cbf851 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2027,7 +2027,7 @@ static __latent_entropy struct task_struct *copy_process(

retval = -EAGAIN;
if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
- if (p->real_cred->user != INIT_USER &&
+ if (p->real_cred->ucounts != &init_ucounts &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_cleanup_count;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 97dc9e5d6bf9..7b5d74a7845c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -490,7 +490,7 @@ static void flag_nproc_exceeded(struct cred *new)
* failure to the execve() stage.
*/
if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
- new->user != INIT_USER)
+ new->ucounts != &init_ucounts)
current->flags |= PF_NPROC_EXCEEDED;
else
current->flags &= ~PF_NPROC_EXCEEDED;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 6b2e3ca7ee99..925fb3579ef3 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -123,6 +123,8 @@ int create_user_ns(struct cred *new)
ns->ucount_max[i] = INT_MAX;
}
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+ if (new->ucounts == &init_ucounts)
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));





2022-02-24 01:27:43

by Eric W. Biederman

[permalink] [raw]
Subject: How should rlimits, suid exec, and capabilities interact?


[CC'd the security list because I really don't know who the right people
are to drag into this discussion]

While looking at some issues that have cropped up with making it so
that RLIMIT_NPROC cannot be escaped by creating a user namespace I have
stumbled upon a very old issue of how rlimits and suid exec interact
poorly.

This specific saga starts with commit 909cc4ae86f3 ("[PATCH] Fix two
bugs with process limits (RLIMIT_NPROC)") from
https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git which
essentially replaced a capable() check with an open-coded
implementation of suser(), for RLIMIT_NPROC.

The description from Neil Brown was:

1/ If a setuid process swaps it's real and effective uids and then forks,
the fork fails if the new realuid has more processes
than the original process was limited to.
This is particularly a problem if a user with a process limit
(e.g. 256) runs a setuid-root program which does setuid() + fork()
(e.g. lprng) while root already has more than 256 process (which
is quite possible).

The root problem here is that a limit which should be a per-user
limit is being implemented as a per-process limit with
per-process (e.g. CAP_SYS_RESOURCE) controls.
Being a per-user limit, it should be that the root-user can over-ride
it, not just some process with CAP_SYS_RESOURCE.

This patch adds a test to ignore process limits if the real user is root.

The test to see if the real user is root was:
if (p->real_cred->user != INIT_USER) ...
which persists to this day in kernel/fork.c:copy_process().

The practical problem with this test is that it works like nothing else
in the kernel, and so does not look like what it is. Saying:
if (!uid_eq(p->real_cred->uid, GLOBAL_ROOT_USER)) ...

would at least be more recognizable.

Really this entire test should be if (!capable(CAP_SYS_RESOURCE)) because
CAP_SYS_RESOURCE is the capability that controls if you are allowed to
exceed your rlimits.

Which brings us to the practical issues of how all of these things are
wired together today.

The per-user rlimits are accounted based upon a process's real user, not
the effective user. All other permission checks are based upon the
effective user. This has the practical effect that when uids are swapped
as above, the processes are charged to root, but use the permissions of
an ordinary user.

The problems get worse when you realize that suid exec does not reset
any of the rlimits except for RLIMIT_STACK.

The rlimits that are particularly affected and are per-user are:
RLIMIT_NPROC, RLIMIT_MSGQUEUE, RLIMIT_SIGPENDING, RLIMIT_MEMLOCK.

But I think failing to reset rlimits during exec has the potential to
affect any suid exec.

Does anyone have any historical knowledge or sense of how this should
work?

Right now it feels like we have coded ourselves into a corner and will
have to risk breaking userspace to get out of it. AKA I think we need
a policy of resetting rlimits on suid exec, and I think we need to store
global rlimits based upon the effective user not the real user. Those
changes should allow making capable calls where they belong, and
removing the much too magic user == INIT_USER test for RLIMIT_NPROC.

Eric

2022-02-24 02:10:48

by Linus Torvalds

[permalink] [raw]
Subject: Re: How should rlimits, suid exec, and capabilities interact?

On Wed, Feb 23, 2022 at 5:24 PM Eric W. Biederman <[email protected]> wrote:
>
> Question: Running a suid program today charges the activity of that
> program to the user who ran that program, not to the user the program
> runs as. Does anyone see a problem with charging the user the program
> runs as?

So I think that there's actually two independent issues with limits
when you have situations like this where the actual user might be
ambiguous.

- the "who to charge" question

- the "how do we *check* the limit" question

and honestly, I think that when it comes to suid binaries, the first
question is fundamentally ambiguous, because it almost certainly
depends on the user.

Which to me implies that there probably isn't an answer that is always
right, and that what you should look at is that second option.

So I would actually suggest that the "execute a suid binary" should
charge the real user, but *because* it is suid, it should then not
check the limit (or, perhaps, should check the hard limit?).

You have to charge somebody, but at that point it's a bit ambiguous
whether it should be allowed.

Exactly so that if you're over a process limit (or something similar -
think "too many files open" or whatever because you screwed up and
opened everything) you could still log in as yourself (ssh/login
charges some admin thing, which probably has high limits or is
unlimited), and hopefully get shell access, and then be able to "exec
sudo" to actually get admin access that should be disabled from the
network.

The above is just one (traditional) example of a fork/open bomb case
where a user is no longer really able to function as himself, but
wants to fix things (maybe the user has another terminal open, but
then he can hopefully use a shell-builtin 'kill' instead).

And I'm not saying it's "the thing that needs to work". I'm more
making up an example.

So I'm only saying that the above actually has two examples to the two
sides of the coin: "login" lowering privileges to a user that may be
over some limit - and succeeding despite that - and 'suid' succeeding
despite the original user perhaps being over-committed.

So it's intended exactly as an example of "picking the new or the old
user would be wrong in either case if you check limits at the
transition point".

Hmm?

Linus

2022-02-24 02:20:28

by Eric W. Biederman

[permalink] [raw]
Subject: Re: How should rlimits, suid exec, and capabilities interact?

Linus Torvalds <[email protected]> writes:

> On Wed, Feb 23, 2022 at 5:24 PM Eric W. Biederman <[email protected]> wrote:
>>
>> Question: Running a suid program today charges the activity of that
>> program to the user who ran that program, not to the user the program
>> runs as. Does anyone see a problem with charging the user the program
>> runs as?
>
> So I think that there's actually two independent issues with limits
> when you have situations like this where the actual user might be
> ambiguous.
>
> - the "who to charge" question
>
> - the "how do we *check* the limit" question
>
> and honestly, I think that when it comes to suid binaries, the first
> question is fundamentally ambiguous, because it almost certainly
> depends on the user.
>
> Which to me implies that there probably isn't an answer that is always
> right, and that what you should look at is that second option.
>
> So I would actually suggest that the "execute a suid binary" should
> charge the real user, but *because* it is suid, it should then not
> check the limit (or, perhaps, should check the hard limit?).
>
> You have to charge somebody, but at that point it's a bit ambiguous
> whether it should be allowed.
>
> Exactly so that if you're over a process limit (or something similar -
> think "too many files open" or whatever because you screwed up and
> opened everything) you could still log in as yourself (ssh/login
> charges some admin thing, which probably has high limits or is
> unlimited), and hopefully get shell access, and then be able to "exec
> sudo" to actually get admin access that should be disabled from the
> network.
>
> The above is just one (traditional) example of a fork/open bomb case
> where a user isn't really able to no longer function as himself, but
> wants to fix things (maybe the user has another terminal open, but
> then he can hopefully use a shell-buiiltin 'kill' instead).
>
> And I'm not saying it's "the thing that needs to work". I'm more
> making up an example.
>
> So I'm only saying that the above actually has two examples to the two
> sides of the coin: "login" lowering privileges to a user that may be
> over some limit - and succeeding despite that - and 'suid' succeeding
> despite the original user perhaps being over-committed.
>
> So it's intended exactly as an example of "picking the new or the old
> user would be wrong in either case if you check limits at the
> transition point".
>
> Hmm?

That doesn't really clarify anything for me. We have two checks one in
fork and one in exec and you seem to be talking about the check in exec.

The check I have problems with for a suid executable is the check in
fork. If the new process is accounted to the previous user and we use
the permissions of the effective user for checking it that does not make
sense to me.

If we can sort out the check in fork, I think I have clarity about
the other cases.




The check in exec while clumsy and needing cleaning up seems to make
sense to me. We have a transition that starts with fork and ends with
exec and has operations like setuid in between. If something like
setuid() is called before exec we check in exec.

The case the check in exec is aimed at supporting are processes spawned
from a parent that have a different user (than the parent) and will
never call fork again. Those processes would be fundamentally immune
to RLIMIT_NPROC if we don't check somewhere besides fork. There is
existing code in apache to use RLIMIT_NPROC this way.



For your login case I have no problems with it in principle. In
practice I think you have to login as root to deal with a fork bomb that
hits RLIMIT_NPROC and does not die gracefully.

What I don't see about your login example is how it is practically
different from the apache cgi script case, that the code has supported
for 20 years, and that would be a regression if we stopped supporting it.

If we want to stop supporting that case we can just remove all of the
RLIMIT_NPROC tests everywhere except for fork, a nice cleanup.




That still leaves me with mismatched effective vs real uid checks in
fork when the effective and real uids don't match. Which means testing
for root with "capable(CAP_SYS_ADMIN)" does not work. Which today
makes the code a bit of a challenge to understand and work with.

Eric

2022-02-24 03:01:00

by David Laight

[permalink] [raw]
Subject: RE: How should rlimits, suid exec, and capabilities interact?

From: Linus Torvalds
> Sent: 24 February 2022 01:42
>
> On Wed, Feb 23, 2022 at 5:24 PM Eric W. Biederman <[email protected]> wrote:
> >
> > Question: Running a suid program today charges the activity of that
> > program to the user who ran that program, not to the user the program
> > runs as. Does anyone see a problem with charging the user the program
> > runs as?
>
> So I think that there's actually two independent issues with limits
> when you have situations like this where the actual user might be
> ambiguous.
>
> - the "who to charge" question
>
> - the "how do we *check* the limit" question
>
> and honestly, I think that when it comes to suid binaries, the first
> question is fundamentally ambiguous, because it almost certainly
> depends on the user.

Doesn't the rlimit check happen during the fork,
at which time you don't know that a suid exec might follow?

The problem with changing the uid is that when the process exits
you need to "uncharge" the correct uid.
So either you need to remember the original uid or setuid
has to transfer the charge (whichever uid is used).
If you transfer the charge then the setuid system call can't fail.
But a later exec can fail.

Any check will always be done against the process's own rlimit value.
Set that to zero and fork should fail regardless of which uid's
process count is checked.

Now a normal suid program only changes the effective uid.
So keeping the process charged against the real uid makes sense.

If a process changes its real uid you could change the charged uid
but you can't error if over the rlimit value.
OTOH during a later exec you can test things and exec can fail.

At least one unix I've used has three uids for each process.
The 'real uid', 'effective uid' and 'saved by exec uid'.
I suspect the process is always "charged" against the latter.
I think that exec compares the 'real' and 'saved by exec' uids
and, if different, moves the charge to the real uid (which will
check rlimit) then sets the 'saved by exec uid' to the real uid.

So an exec after a setuid() can be allowed to fail if the real user
has too many processes.
But in all other cases exec just works regardless of the process
count for any user.

>
> Which to me implies that there probably isn't an answer that is always
> right, and that what you should look at is that second option.
>
> So I would actually suggest that the "execute a suid binary" should
> charge the real user, but *because* it is suid, it should then not
> check the limit (or, perhaps, should check the hard limit?).
>
> You have to charge somebody, but at that point it's a bit ambiguous
> whether it should be allowed.
>
> Exactly so that if you're over a process limit (or something similar -
> think "too many files open" or whatever because you screwed up and
> opened everything) you could still log in as yourself (ssh/login
> charges some admin thing, which probably has high limits or is
> unlimited), and hopefully get shell access, and then be able to "exec
> sudo" to actually get admin access that should be disabled from the
> network.

You usually have to use 'rsh machine sh -i' to avoid the shell
running all its startup scripts.
But I doubt that will get you past a fork bomb.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2022-02-24 16:26:34

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH] ucounts: Fix systemd LimigtNPROC with private users regression


Long story short recursively enforcing RLIMIT_NPROC when it is not
enforced on the process that creates a new user namespace, causes
currently working code to fail. There is no reason to enforce
RLIMIT_NPROC recursively when we don't enforce it normally so update
the code to detect this case.

I would like to simply use capable(CAP_SYS_RESOURCE) to detect when
RLIMIT_NPROC is not enforced upon the caller. Unfortunately because
RLIMIT_NPROC is charged and checked for enforcement based upon the
real uid, using capable() wich is euid based is inconsistent with reality.
Come as close as possible to testing for capable(CAP_SYS_RESOURCE) by
testing for when the real uid would match the conditions when
CAP_SYS_RESOURCE would be present if the real uid was the effective
uid.

Reported-by: Etienne Dechamps <[email protected]>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215596
Link: https://lkml.kernel.org/r/[email protected]
Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts")
Signed-off-by: "Eric W. Biederman" <[email protected]>
---

The previous conversation has given me enough clarity that I can see
which tests I am comfortable using for this pending regression fix.

I have tested this and it works for me. Does anyone have any concerns
with this change?

kernel/user_namespace.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 6b2e3ca7ee99..5481ba44a8d6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->user_ns = user_ns;
}

+static unsigned long enforced_nproc_rlimit(void)
+{
+ unsigned long limit = RLIM_INFINITY;
+
+ /* Is RLIMIT_NPROC currently enforced? */
+ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
+ (current_user_ns() != &init_user_ns))
+ limit = rlimit(RLIMIT_NPROC);
+
+ return limit;
+}
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -122,7 +134,7 @@ int create_user_ns(struct cred *new)
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
- set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
--
2.29.2

2022-02-24 16:56:26

by Kees Cook

[permalink] [raw]
Subject: Re: [PATCH] ucounts: Fix systemd LimigtNPROC with private users regression

typo: Subject's LimigtNPROC -> LimitNPROC

On Thu, Feb 24, 2022 at 09:41:44AM -0600, Eric W. Biederman wrote:
>
> Long story short recursively enforcing RLIMIT_NPROC when it is not
> enforced on the process that creates a new user namespace, causes
> currently working code to fail. There is no reason to enforce
> RLIMIT_NPROC recursively when we don't enforce it normally so update
> the code to detect this case.
>
> I would like to simply use capable(CAP_SYS_RESOURCE) to detect when
> RLIMIT_NPROC is not enforced upon the caller. Unfortunately because
> RLIMIT_NPROC is charged and checked for enforcement based upon the
> real uid, using capable() wich is euid based is inconsistent with reality.

typo: wich -> which

> Come as close as possible to testing for capable(CAP_SYS_RESOURCE) by
> testing for when the real uid would match the conditions when
> CAP_SYS_RESOURCE would be present if the real uid was the effective
> uid.
>
> Reported-by: Etienne Dechamps <[email protected]>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215596
> Link: https://lkml.kernel.org/r/[email protected]
> Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts")
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
>
> The previous conversation has given me enough clarity that I can see
> which tests I am comfortable with use for this pending regression fix.
>
> I have tested this and it works for me. Does anyone have any concerns
> with this change?

I'd really love some kind of selftest that exercises the edge cases; do
you have your tests in some form that could be converted?

But otherwise, yes, this looks like the best option here.

Reviewed-by: Kees Cook <[email protected]>

>
> kernel/user_namespace.c | 14 +++++++++++++-
> 1 file changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 6b2e3ca7ee99..5481ba44a8d6 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
> cred->user_ns = user_ns;
> }
>
> +static unsigned long enforced_nproc_rlimit(void)
> +{
> + unsigned long limit = RLIM_INFINITY;
> +
> + /* Is RLIMIT_NPROC currently enforced? */
> + if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
> + (current_user_ns() != &init_user_ns))
> + limit = rlimit(RLIMIT_NPROC);
> +
> + return limit;
> +}
> +
> /*
> * Create a new user namespace, deriving the creator from the user in the
> * passed credentials, and replacing that user with the new root user for the
> @@ -122,7 +134,7 @@ int create_user_ns(struct cred *new)
> for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
> ns->ucount_max[i] = INT_MAX;
> }
> - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
> + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
> set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
> set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
> set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
> --
> 2.29.2
>

--
Kees Cook

2022-02-25 00:48:41

by Michal Koutný

[permalink] [raw]
Subject: Re: [PATCH] ucounts: Fix systemd LimigtNPROC with private users regression

On Thu, Feb 24, 2022 at 08:28:41AM -0800, Kees Cook <[email protected]> wrote:
> I'd really love some kind of selftest that exercises the edge cases; do
> you have your tests in some form that could be converted?

There's the original
tools/testing/selftests/rlimits/rlimits-per-userns.c selftest.

I've been rewriting it to cover more situations, I'm sending it as one
monster patch (I'd need to spend more time reordering my commits into some
logical patch order) if anyone wishes to try it.

I've tried it on 5c1ee569660d4a205dced9cb4d0306b907fb7599 + this Eric's
patch.

The test rlimit-per-userns-root passes
- together with that I claim this patch

Reviewed-by: Michal Koutný <[email protected]>

The test rlimit-per-userns-nonroot fails. It's a similar off-by-one
mistake as was in the fork path, but it's in do_execveat_common():

if ((current->flags & PF_NPROC_EXCEEDED) &&
is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN;
goto out_ret;
}

(If RLIMIT_NPROC should be strictly honored, setuid+execve should fail
when given uid's ucount is at the limit already.)

Funnily, the original
tools/testing/selftests/rlimits/rlimits-per-userns.c passes thanks to
the off-by-one check even though it should not pass because unshare(2)
is called after setuid(2).

Michal

-- >8 --
From be67d903f1f179f585bf302f6c2d2446f24263d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?= <[email protected]>
Date: Thu, 20 Jan 2022 19:32:54 +0100
Subject: [RFC PATCH] selftests: Rewrite RLIMIT_NPROC checks (in user
namespaces)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds two test programs:
- rlimit-per-userns-root -- creates user namespaces owned by root,
- rlimit-per-userns-nonroot -- creates user namespaces owned by non-root.

The forking tree:

main [init_user_ns]
` service [user_ns_1]
` worker 1
` worker 2
...
` worker k
...
` service [user_ns_n]
` worker 1
` worker 2
...
` worker k

Expectations rlimit-per-userns-root:

n > RLIMIT_NPROC privileged user can spawn multiple services in different user namespaces
(k+1) <= RLIMIT_NPROC limit is honored within user namespace
k >= RLIMIT_NPROC-1 separate per-user namespace counters

Expectations rlimit-per-userns-nonroot:

n <= RLIMIT_NPROC global RLIMIT_NPROC is honored
(k+1) <= RLIMIT_NPROC limit is honored within user namespace

Signed-off-by: Michal Koutný <[email protected]>
---
tools/testing/selftests/rlimits/Makefile | 6 +-
.../rlimits/rlimits-per-userns-nonroot.c | 37 ++
.../rlimits/rlimits-per-userns-root.c | 34 ++
.../selftests/rlimits/rlimits-per-userns.c | 161 -------
.../selftests/rlimits/service_common.c | 400 ++++++++++++++++++
.../selftests/rlimits/service_common.h | 24 ++
6 files changed, 500 insertions(+), 162 deletions(-)
create mode 100644 tools/testing/selftests/rlimits/rlimits-per-userns-nonroot.c
create mode 100644 tools/testing/selftests/rlimits/rlimits-per-userns-root.c
delete mode 100644 tools/testing/selftests/rlimits/rlimits-per-userns.c
create mode 100644 tools/testing/selftests/rlimits/service_common.c
create mode 100644 tools/testing/selftests/rlimits/service_common.h

diff --git a/tools/testing/selftests/rlimits/Makefile b/tools/testing/selftests/rlimits/Makefile
index 03aadb406212..8ccb92020206 100644
--- a/tools/testing/selftests/rlimits/Makefile
+++ b/tools/testing/selftests/rlimits/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0-or-later

CFLAGS += -Wall -O2 -g
-TEST_GEN_PROGS := rlimits-per-userns
+TEST_GEN_PROGS := rlimits-per-userns-root
+TEST_GEN_PROGS += rlimits-per-userns-nonroot

include ../lib.mk
+
+$(OUTPUT)/rlimits-per-userns-root: service_common.c
+$(OUTPUT)/rlimits-per-userns-nonroot: service_common.c
diff --git a/tools/testing/selftests/rlimits/rlimits-per-userns-nonroot.c b/tools/testing/selftests/rlimits/rlimits-per-userns-nonroot.c
new file mode 100644
index 000000000000..ccf021769f88
--- /dev/null
+++ b/tools/testing/selftests/rlimits/rlimits-per-userns-nonroot.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Alexey Gladkov <[email protected]>
+ * Author: Michal Koutný <[email protected]>
+ */
+#include <err.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "service_common.h"
+
+int main(int argc, char **argv)
+{
+ struct services_ctx *ctx;
+ pid = getpid();
+
+ if (getenv(ENV_PARAM))
+ return run_service(atoi(getenv(ENV_PARAM)));
+
+ if (getuid() > 0)
+ errx(KSFT_SKIP, "This selftest must start as (global) root user.");
+
+ warnx("(pid=%d) Starting testcase", pid);
+
+ ctx = start_services(argv[0], UM_UNSHARE);
+ stop_services(ctx);
+
+ if (count_services(ctx) > THE_LIMIT)
+ errx(KSFT_FAIL, "(pid=%d): Test failed, exec'd services > RLIMIT_NPROC", pid);
+
+ if (check_services(ctx) < count_services(ctx))
+ errx(KSFT_FAIL, "(pid=%d): Test failed, failed services", pid);
+
+ warnx("(pid=%d): Test passed", pid);
+ exit(KSFT_PASS);
+}
diff --git a/tools/testing/selftests/rlimits/rlimits-per-userns-root.c b/tools/testing/selftests/rlimits/rlimits-per-userns-root.c
new file mode 100644
index 000000000000..3bf0149ac93d
--- /dev/null
+++ b/tools/testing/selftests/rlimits/rlimits-per-userns-root.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Alexey Gladkov <[email protected]>
+ * Author: Michal Koutný <[email protected]>
+ */
+#include <err.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "service_common.h"
+
+int main(int argc, char **argv)
+{
+ struct services_ctx *ctx;
+ pid = getpid();
+
+ if (getenv(ENV_PARAM))
+ return run_service(atoi(getenv(ENV_PARAM)));
+
+ if (getuid() > 0)
+ errx(KSFT_SKIP, "This selftest must start as (global) root user.");
+
+ warnx("(pid=%d) Starting testcase", pid);
+
+ ctx = start_services(argv[0], UM_CLONE_NEWUSER);
+ stop_services(ctx);
+
+ if (check_services(ctx) != NR_SERVICES)
+ errx(KSFT_FAIL, "(pid=%d): Test failed, unexpected terminations", pid);
+
+ warnx("(pid=%d): Test passed", pid);
+ exit(KSFT_PASS);
+}
diff --git a/tools/testing/selftests/rlimits/rlimits-per-userns.c b/tools/testing/selftests/rlimits/rlimits-per-userns.c
deleted file mode 100644
index 26dc949e93ea..000000000000
--- a/tools/testing/selftests/rlimits/rlimits-per-userns.c
+++ /dev/null
@@ -1,161 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Author: Alexey Gladkov <[email protected]>
- */
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <sys/prctl.h>
-#include <sys/stat.h>
-
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <signal.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <err.h>
-
-#define NR_CHILDS 2
-
-static char *service_prog;
-static uid_t user = 60000;
-static uid_t group = 60000;
-
-static void setrlimit_nproc(rlim_t n)
-{
- pid_t pid = getpid();
- struct rlimit limit = {
- .rlim_cur = n,
- .rlim_max = n
- };
-
- warnx("(pid=%d): Setting RLIMIT_NPROC=%ld", pid, n);
-
- if (setrlimit(RLIMIT_NPROC, &limit) < 0)
- err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC)", pid);
-}
-
-static pid_t fork_child(void)
-{
- pid_t pid = fork();
-
- if (pid < 0)
- err(EXIT_FAILURE, "fork");
-
- if (pid > 0)
- return pid;
-
- pid = getpid();
-
- warnx("(pid=%d): New process starting ...", pid);
-
- if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
- err(EXIT_FAILURE, "(pid=%d): prctl(PR_SET_PDEATHSIG)", pid);
-
- signal(SIGUSR1, SIG_DFL);
-
- warnx("(pid=%d): Changing to uid=%d, gid=%d", pid, user, group);
-
- if (setgid(group) < 0)
- err(EXIT_FAILURE, "(pid=%d): setgid(%d)", pid, group);
- if (setuid(user) < 0)
- err(EXIT_FAILURE, "(pid=%d): setuid(%d)", pid, user);
-
- warnx("(pid=%d): Service running ...", pid);
-
- warnx("(pid=%d): Unshare user namespace", pid);
- if (unshare(CLONE_NEWUSER) < 0)
- err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");
-
- char *const argv[] = { "service", NULL };
- char *const envp[] = { "I_AM_SERVICE=1", NULL };
-
- warnx("(pid=%d): Executing real service ...", pid);
-
- execve(service_prog, argv, envp);
- err(EXIT_FAILURE, "(pid=%d): execve", pid);
-}
-
-int main(int argc, char **argv)
-{
- size_t i;
- pid_t child[NR_CHILDS];
- int wstatus[NR_CHILDS];
- int childs = NR_CHILDS;
- pid_t pid;
-
- if (getenv("I_AM_SERVICE")) {
- pause();
- exit(EXIT_SUCCESS);
- }
-
- service_prog = argv[0];
- pid = getpid();
-
- warnx("(pid=%d) Starting testcase", pid);
-
- /*
- * This rlimit is not a problem for root because it can be exceeded.
- */
- setrlimit_nproc(1);
-
- for (i = 0; i < NR_CHILDS; i++) {
- child[i] = fork_child();
- wstatus[i] = 0;
- usleep(250000);
- }
-
- while (1) {
- for (i = 0; i < NR_CHILDS; i++) {
- if (child[i] <= 0)
- continue;
-
- errno = 0;
- pid_t ret = waitpid(child[i], &wstatus[i], WNOHANG);
-
- if (!ret || (!WIFEXITED(wstatus[i]) && !WIFSIGNALED(wstatus[i])))
- continue;
-
- if (ret < 0 && errno != ECHILD)
- warn("(pid=%d): waitpid(%d)", pid, child[i]);
-
- child[i] *= -1;
- childs -= 1;
- }
-
- if (!childs)
- break;
-
- usleep(250000);
-
- for (i = 0; i < NR_CHILDS; i++) {
- if (child[i] <= 0)
- continue;
- kill(child[i], SIGUSR1);
- }
- }
-
- for (i = 0; i < NR_CHILDS; i++) {
- if (WIFEXITED(wstatus[i]))
- warnx("(pid=%d): pid %d exited, status=%d",
- pid, -child[i], WEXITSTATUS(wstatus[i]));
- else if (WIFSIGNALED(wstatus[i]))
- warnx("(pid=%d): pid %d killed by signal %d",
- pid, -child[i], WTERMSIG(wstatus[i]));
-
- if (WIFSIGNALED(wstatus[i]) && WTERMSIG(wstatus[i]) == SIGUSR1)
- continue;
-
- warnx("(pid=%d): Test failed", pid);
- exit(EXIT_FAILURE);
- }
-
- warnx("(pid=%d): Test passed", pid);
- exit(EXIT_SUCCESS);
-}
diff --git a/tools/testing/selftests/rlimits/service_common.c b/tools/testing/selftests/rlimits/service_common.c
new file mode 100644
index 000000000000..043c59828a03
--- /dev/null
+++ b/tools/testing/selftests/rlimits/service_common.c
@@ -0,0 +1,400 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Alexey Gladkov <[email protected]>
+ * Author: Michal Koutný <[email protected]>
+ */
+/*
+ * The forking tree:
+ *
+ * main [init_user_ns]
+ * ` service [user_ns_1]
+ * ` worker 1
+ * ` worker 2
+ * ...
+ * ` worker k
+ * ...
+ * ` service [user_ns_n]
+ * ` worker 1
+ * ` worker 2
+ * ...
+ * ` worker k
+ *
+ * Sequence (synchronization) diagram:
+ * main service
+ * ---- -------
+ * setrlimit()
+ * service=clone([CLONE_NEWUSER])
+ * define_maps()
+ * MAP_DEFINE ->
+ * setuid()
+ * [unshare(CLONE_NEWUSER)]
+ * <- UNSHARE
+ * rlimit_restore()
+ * RLIMIT_RESTORE ->
+ * execve()
+ * POST_EXEC ->
+ *
+ * Expectations UM_UNSHARE:
+ *
+ * n <= RLIMIT_NPROC global RLIMIT_NPROC is honored
+ * (k+1) <= RLIMIT_NPROC limit is honored within user namespace
+ *
+ * Expectations UM_CLONE_NEWUSER:
+ *
+ * n > RLIMIT_NPROC privileged user can spawn multiple services in different user namespaces
+ * k >= RLIMIT_NPROC-1 separate per-user namespace counters
+ * (k+1) <= RLIMIT_NPROC limit is honored within user namespace
+ */
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "service_common.h"
+
+#define STACK_SIZE (2 * (1<<20))
+#define SERVICE_RUNTIME 250000 /* μs */
+
+static_assert(NR_SERVICES > THE_LIMIT, "Services must exceed THE_LIMIT for effective test.");
+static_assert(NR_WORKERS > THE_LIMIT-1, "Need enough workers to challenge THE_LIIMT.");
+
+static struct services_ctx {
+ int control_fd[NR_SERVICES];
+ pid_t child[NR_SERVICES];
+ int wstatus[NR_SERVICES];
+ int fork_ed;
+ int exec_ed;
+} services_ctx;
+
+static uid_t user = 60000;
+static uid_t group = 60000;
+static struct rlimit saved_limit;
+
+static struct {
+ int control_fd;
+ const char *pathname;
+ enum userns_mode um;
+} child_args;
+
+pid_t pid;
+
+static void setrlimit_nproc(rlim_t n)
+{
+ struct rlimit limit = {
+ .rlim_cur = n,
+ .rlim_max = n
+ };
+ if (getrlimit(RLIMIT_NPROC, &saved_limit) < 0)
+ err(KSFT_FAIL, "(pid=%d): getrlimit(RLIMIT_NPROC)", pid);
+
+ if (setrlimit(RLIMIT_NPROC, &limit) < 0)
+ err(KSFT_FAIL, "(pid=%d): setrlimit(RLIMIT_NPROC)", pid);
+
+ warnx("(pid=%d): Set RLIMIT_NPROC=%ld", pid, n);
+}
+
+static void restore_rlimit_nproc(void)
+{
+ if (setrlimit(RLIMIT_NPROC, &saved_limit) < 0)
+ err(KSFT_FAIL, "(pid=%d): setrlimit(RLIMIT_NPROC, saved)", pid);
+ warnx("(pid=%d) Restored RLIMIT_NPROC", pid);
+}
+
+enum msg_sync {
+ MAP_DEFINE,
+ UNSHARE,
+ RLIMIT_RESTORE,
+ POST_EXEC,
+};
+
+static int _sync_notify(int fd, enum msg_sync m)
+{
+ char tmp = m;
+
+ return write(fd, &tmp, 1);
+}
+static void sync_notify(int fd, enum msg_sync m)
+{
+ if (_sync_notify(fd, m) < 0)
+ warnx("(pid=%d): failed sync-write", pid);
+}
+
+static void sync_wait(int fd, enum msg_sync m)
+{
+ char tmp;
+
+ if (read(fd, &tmp, 1) < 0)
+ warn("(pid=%d): failed sync-read", pid);
+ else if (tmp != m)
+ warnx("(pid=%d): unexpected sync", pid);
+}
+
+static int define_maps(pid_t child_pid, enum userns_mode um)
+{
+ FILE *f;
+ char filename[PATH_MAX];
+
+ if (um != UM_CLONE_NEWUSER)
+ return 0;
+
+ snprintf(filename, PATH_MAX, "/proc/%i/uid_map", child_pid);
+ f = fopen(filename, "w");
+ if (fprintf(f, "%i %i 1\n", user, user) < 0)
+ return -1;
+ fclose(f);
+
+ snprintf(filename, PATH_MAX, "/proc/%i/gid_map", child_pid);
+ f = fopen(filename, "w");
+ if (fprintf(f, "%i %i 1\n", group, group) < 0)
+ return -1;
+ fclose(f);
+
+ return 0;
+}
+
+static int setup_and_exec(void *arg)
+{
+ int control_fd = child_args.control_fd;
+
+ pid = getpid();
+ warnx("(pid=%d): New process starting ...", pid);
+
+ signal(SIGUSR1, SIG_DFL);
+
+ sync_wait(control_fd, MAP_DEFINE);
+ warnx("(pid=%d): Changing to uid=%d, gid=%d", pid, user, group);
+
+ if (setgid(group) < 0)
+ err(EXIT_FAILURE, "(pid=%d): setgid(%d)", pid, group);
+ if (setuid(user) < 0)
+ err(EXIT_FAILURE, "(pid=%d): setuid(%d)", pid, user);
+
+ warnx("(pid=%d): Service running ...", pid);
+
+ if (child_args.um == UM_UNSHARE) {
+ warnx("(pid=%d): Unshare user namespace", pid);
+ if (unshare(CLONE_NEWUSER) < 0)
+ err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");
+ }
+
+ sync_notify(control_fd, UNSHARE);
+ sync_wait(control_fd, RLIMIT_RESTORE);
+
+ char *param = NULL;
+ asprintf(&param, ENV_PARAM "=%i", child_args.um);
+ char *const argv[] = { "service", NULL };
+ char *const envp[] = { param, NULL };
+
+ warnx("(pid=%d): Executing real service ...", pid);
+
+ execve(child_args.pathname, argv, envp);
+
+ /* stay around until parent notifies/signals */
+ warn("(pid=%d): execve failed", pid);
+ sync_wait(control_fd, POST_EXEC);
+ pause();
+ return 0;
+}
+
+static pid_t start_child(const char *pathname, int control_fd, enum userns_mode um)
+{
+ char *stack = malloc(STACK_SIZE);
+ int flags = um == UM_CLONE_NEWUSER ? CLONE_NEWUSER : 0;
+ pid_t new_pid;
+
+ /* Pass via global variable to child */
+ child_args.control_fd = control_fd;
+ child_args.pathname = pathname;
+ child_args.um = um;
+
+ new_pid = clone(setup_and_exec, stack+STACK_SIZE-1, flags, NULL);
+
+ free(stack);
+ close(control_fd);
+ return new_pid;
+}
+
+static void dump_context(size_t n_workers)
+{
+ struct rlimit limit;
+ char user_ns[PATH_MAX];
+ ssize_t len;
+
+ if (getrlimit(RLIMIT_NPROC, &limit) < 0)
+ err(EXIT_FAILURE, "(pid=%d) failed getrlimit", pid);
+ if ((len = readlink("/proc/self/ns/user", user_ns, PATH_MAX)) < 0)
+ err(EXIT_FAILURE, "(pid=%d) failed readlink", pid);
+ user_ns[len] = 0;
+
+ warnx("(pid=%d) Service instance attempts %lu workers, limit %lu:%lu, ns=%s",
+ pid, n_workers, limit.rlim_cur, limit.rlim_max, user_ns);
+}
+
+int run_service(enum userns_mode um)
+{
+ size_t i;
+ pid_t worker[NR_WORKERS];
+ int ret = EXIT_SUCCESS;
+
+ dump_context(NR_WORKERS);
+
+ /* test RLIMIT_NPROC inside the service, last worker should fail because of service itself */
+ for (i = 0; i < NR_WORKERS; i++) {
+ worker[i] = fork();
+ if (worker[i] == 0) {
+ /* service worker */
+ pause();
+ exit(EXIT_SUCCESS);
+ }
+ if (worker[i] < 0) {
+ warn("(pid=%d) service fork %lu failed", pid, i+1);
+ if (um == UM_CLONE_NEWUSER && !(i >= (THE_LIMIT-1) && errno == EAGAIN))
+ ret = EXIT_FAILURE;
+ } else if (i >= (THE_LIMIT-1)) {
+ warnx("(pid=%d) RLIMIT_NPROC not honored", pid);
+ ret = EXIT_FAILURE;
+ }
+ }
+
+ /* service cleanup */
+ for (i = 0; i < NR_WORKERS; i++)
+ if (worker[i] > 0)
+ kill(worker[i], SIGUSR1);
+
+ for (i = 0; i < NR_WORKERS; i++)
+ if (worker[i] > 0)
+ waitpid(worker[i], NULL, WNOHANG);
+
+ if (ret) {
+ warnx("(pid=%d) service failed, ret=%i", pid, ret);
+ return ret;
+ }
+ /* we must get here before SERVICE_RUNTIME elapses */
+ pause();
+ return EXIT_FAILURE;
+}
+
+struct services_ctx *start_services(const char *pathname, enum userns_mode um)
+{
+ size_t i;
+ int sockets[2];
+ struct services_ctx *ctx = &services_ctx;
+
+ signal(SIGPIPE, SIG_IGN);
+ setrlimit_nproc(THE_LIMIT);
+ ctx->fork_ed = 0;
+ ctx->exec_ed = 0;
+ for (i = 0; i < NR_SERVICES; i++) {
+ if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, sockets) < 0)
+ err(KSFT_FAIL, "(pid=%d) socketpair failed", pid);
+ ctx->control_fd[i] = sockets[0];
+ ctx->child[i] = start_child(pathname, sockets[1], um);
+ ctx->wstatus[i] = 0;
+ if (ctx->child[i] < 0)
+ continue;
+ ctx->fork_ed++;
+
+ if (define_maps(ctx->child[i], um) < 0)
+ err(KSFT_FAIL, "(pid=%d) user_ns maps definition failed", pid);
+ sync_notify(ctx->control_fd[i], MAP_DEFINE);
+ }
+
+ for (i = 0; i < NR_SERVICES; i++)
+ sync_wait(ctx->control_fd[i], UNSHARE);
+ restore_rlimit_nproc();
+
+ for (i = 0; i < NR_SERVICES; i++) {
+ sync_notify(ctx->control_fd[i], RLIMIT_RESTORE);
+ }
+
+ return ctx;
+}
+
+void stop_services(struct services_ctx *ctx)
+{
+ size_t i;
+ int children = ctx->fork_ed;
+
+ /* A well-behaved service pause()s and waits for our SIGUSR1; if it
+ * failed instead, detect that early.
+ */
+ while (1) {
+ for (i = 0; i < NR_SERVICES; i++) {
+ if (ctx->child[i] <= 0)
+ continue;
+
+ errno = 0;
+ pid_t ret = waitpid(ctx->child[i], &ctx->wstatus[i], WNOHANG | __WALL);
+
+ if (!ret)
+ continue;
+
+ if (ret < 0 && errno != ECHILD)
+ warn("(pid=%d): waitpid(%d)", pid, ctx->child[i]);
+
+ ctx->child[i] *= -1;
+ children -= 1;
+ }
+
+ if (!children)
+ break;
+
+ usleep(SERVICE_RUNTIME);
+
+ for (i = 0; i < NR_SERVICES; i++) {
+ if (ctx->child[i] <= 0)
+ continue;
+ if (_sync_notify(ctx->control_fd[i], POST_EXEC) < 0 &&
+ (errno == EPIPE || errno == ECONNREFUSED))
+ ctx->exec_ed++;
+ close(ctx->control_fd[i]);
+ kill(ctx->child[i], SIGUSR1);
+ }
+ }
+
+ warnx("(pid=%d): stats: fork_ed=%i exec_ed=%i", pid, ctx->fork_ed, ctx->exec_ed);
+}
+
+int count_services(struct services_ctx *ctx)
+{
+ return ctx->exec_ed;
+}
+
+int check_services(struct services_ctx *ctx)
+{
+ size_t i;
+ int correct = 0;
+
+ for (i = 0; i < NR_SERVICES; i++) {
+ if (WIFEXITED(ctx->wstatus[i]))
+ warnx("(pid=%d): pid %d exited, status=%d",
+ pid, -ctx->child[i], WEXITSTATUS(ctx->wstatus[i]));
+ else if (WIFSIGNALED(ctx->wstatus[i]))
+ warnx("(pid=%d): pid %d killed by signal %d",
+ pid, -ctx->child[i], WTERMSIG(ctx->wstatus[i]));
+
+ /* The only acceptable service termination */
+ if (WIFSIGNALED(ctx->wstatus[i]) && WTERMSIG(ctx->wstatus[i]) == SIGUSR1)
+ correct++;
+ }
+
+ return correct;
+}
+
+
diff --git a/tools/testing/selftests/rlimits/service_common.h b/tools/testing/selftests/rlimits/service_common.h
new file mode 100644
index 000000000000..4a3cd929d865
--- /dev/null
+++ b/tools/testing/selftests/rlimits/service_common.h
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <sys/types.h>
+
+#define THE_LIMIT 4
+#define NR_SERVICES 5
+#define NR_WORKERS 5
+
+#define ENV_PARAM "I_AM_SERVICE"
+
+enum userns_mode {
+ UM_UNSHARE, /* setrlimit,clone(0),setuid,unshare,execve */
+ UM_CLONE_NEWUSER, /* setrlimit,clone(NEWUSER),setuid,execve */
+};
+
+struct services_ctx;
+
+/* Cache current pid */
+extern pid_t pid;
+
+int run_service(enum userns_mode um);
+struct services_ctx *start_services(const char *pathname, enum userns_mode um);
+void stop_services(struct services_ctx *ctx);
+int count_services(struct services_ctx *ctx);
+int check_services(struct services_ctx *ctx);
--
2.34.1

2022-02-25 03:28:21

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] ucounts: Fix systemd LimigtNPROC with private users regression

Kees Cook <[email protected]> writes:

> typo: Subject's LimigtNPROC -> LimitNPROC
>
> On Thu, Feb 24, 2022 at 09:41:44AM -0600, Eric W. Biederman wrote:
>>
>> Long story short recursively enforcing RLIMIT_NPROC when it is not
>> enforced on the process that creates a new user namespace, causes
>> currently working code to fail. There is no reason to enforce
>> RLIMIT_NPROC recursively when we don't enforce it normally so update
>> the code to detect this case.
>>
>> I would like to simply use capable(CAP_SYS_RESOURCE) to detect when
>> RLIMIT_NPROC is not enforced upon the caller. Unfortunately because
>> RLIMIT_NPROC is charged and checked for enforcement based upon the
>> real uid, using capable() wich is euid based is inconsistent with reality.
>
> typo: wich -> which

Ahh... Typos.

>> Come as close as possible to testing for capable(CAP_SYS_RESOURCE) by
>> testing for when the real uid would match the conditions when
>> CAP_SYS_RESOURCE would be present if the real uid was the effective
>> uid.
>>
>> Reported-by: Etienne Dechamps <[email protected]>
>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215596
>> Link: https://lkml.kernel.org/r/[email protected]
>> Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts")
>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>> ---
>>
>> The previous conversation has given me enough clarity that I can see
>> which tests I am comfortable with use for this pending regression fix.
>>
>> I have tested this and it works for me. Does anyone have any concerns
>> with this change?
>
> I'd really love some kind of selftest that exercises the edge cases; do
> you have your tests in some form that could be converted?
>
> But otherwise, yes, this looks like the best option here.

Let's start with Michal Koutný's tests. I keep forgetting to look at
them. This cold has really been kicking my butt.

For this issue the test case was a systemd unit file, which is simple
and demonstrates the real-world regression but is not really minimal in the
way a kernel selftest should be.

> Reviewed-by: Kees Cook <[email protected]>
>
>>
>> kernel/user_namespace.c | 14 +++++++++++++-
>> 1 file changed, 13 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
>> index 6b2e3ca7ee99..5481ba44a8d6 100644
>> --- a/kernel/user_namespace.c
>> +++ b/kernel/user_namespace.c
>> @@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
>> cred->user_ns = user_ns;
>> }
>>
>> +static unsigned long enforced_nproc_rlimit(void)
>> +{
>> + unsigned long limit = RLIM_INFINITY;
>> +
>> + /* Is RLIMIT_NPROC currently enforced? */
>> + if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
>> + (current_user_ns() != &init_user_ns))
>> + limit = rlimit(RLIMIT_NPROC);
>> +
>> + return limit;
>> +}
>> +
>> /*
>> * Create a new user namespace, deriving the creator from the user in the
>> * passed credentials, and replacing that user with the new root user for the
>> @@ -122,7 +134,7 @@ int create_user_ns(struct cred *new)
>> for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
>> ns->ucount_max[i] = INT_MAX;
>> }
>> - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
>> + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
>> set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
>> set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
>> set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
>> --
>> 2.29.2
>>

Eric

2022-03-03 00:50:21

by Eric W. Biederman

[permalink] [raw]
Subject: [GIT PULL] ucounts: Regression fix for v5.17


Linus,

Please pull the ucount-rlimit-fixes-for-v5.17 branch from the git tree:

git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git ucount-rlimit-fixes-for-v5.17

HEAD: 0ac983f512033cb7b5e210c9589768ad25b1e36b ucounts: Fix systemd LimitNPROC with private users regression

Etienne Dechamps recently found a regression caused by enforcing
RLIMIT_NPROC for root where the rlimit was not previously enforced.

Michal Koutný had previously pointed out the inconsistency in enforcing
the RLIMIT_NPROC that had been on the root owned process after the root
user creates a user namespace.

That makes the fix for the regression simply a matter of removing the
inconsistency.


From: "Eric W. Biederman" <[email protected]>
Date: Thu, 24 Feb 2022 08:32:28 -0600
Subject: [PATCH] ucounts: Fix systemd LimitNPROC with private users regression

Long story short recursively enforcing RLIMIT_NPROC when it is not
enforced on the process that creates a new user namespace, causes
currently working code to fail. There is no reason to enforce
RLIMIT_NPROC recursively when we don't enforce it normally so update
the code to detect this case.

I would like to simply use capable(CAP_SYS_RESOURCE) to detect when
RLIMIT_NPROC is not enforced upon the caller. Unfortunately because
RLIMIT_NPROC is charged and checked for enforcement based upon the
real uid, using capable() which is euid based is inconsistent with reality.
Come as close as possible to testing for capable(CAP_SYS_RESOURCE) by
testing for when the real uid would match the conditions when
CAP_SYS_RESOURCE would be present if the real uid was the effective
uid.

Reported-by: Etienne Dechamps <[email protected]>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215596
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Cc: [email protected]
Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts")
Reviewed-by: Kees Cook <[email protected]>
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
kernel/user_namespace.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 6b2e3ca7ee99..5481ba44a8d6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->user_ns = user_ns;
}

+static unsigned long enforced_nproc_rlimit(void)
+{
+ unsigned long limit = RLIM_INFINITY;
+
+ /* Is RLIMIT_NPROC currently enforced? */
+ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
+ (current_user_ns() != &init_user_ns))
+ limit = rlimit(RLIMIT_NPROC);
+
+ return limit;
+}
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -122,7 +134,7 @@ int create_user_ns(struct cred *new)
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
- set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
--
2.29.2

2022-03-03 01:01:13

by pr-tracker-bot

[permalink] [raw]
Subject: Re: [GIT PULL] ucounts: Regression fix for v5.17

The pull request you sent on Wed, 02 Mar 2022 18:12:40 -0600:

> git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git ucount-rlimit-fixes-for-v5.17

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/5859a2b1991101d6b978f3feb5325dad39421f29

Thank you!

--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html