Right now, a new process can't be forked in another time namespace
if it shares mm with its parent. It is prohibited, because each time
namespace has its own vvar page that is mapped into a process address
space.
When a process calls exec, it gets a new mm and so it could be "legal"
to switch time namespace in that case. This was not implemented and
now if we want to do this, we need to add another clone flag to not
break backward compatibility.
We don't have any user requests to switch times on exec except the
vfork+exec combination, so there is no reason to add a new clone flag.
As for vfork+exec, this should be safe to allow switching timens with
the current clone flag. Right now, vfork (CLONE_VFORK | CLONE_VM) fails
if a child is forked into another time namespace. With this change,
vfork creates a new process in parent's timens, and the following exec
does the actual switch to the target time namespace.
Suggested-by: Florian Weimer <[email protected]>
Signed-off-by: Andrei Vagin <[email protected]>
---
fs/exec.c | 7 +++++++
kernel/fork.c | 5 ++++-
kernel/nsproxy.c | 3 ++-
3 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 0989fb8472a1..347e8f55bc2b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -65,6 +65,7 @@
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
+#include <linux/time_namespace.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -982,10 +983,12 @@ static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
+ bool vfork;
int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
+ vfork = !!tsk->vfork_done;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
if (old_mm)
@@ -1030,6 +1033,10 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);
+
+ if (vfork)
+ timens_on_fork(tsk->nsproxy, tsk);
+
if (old_mm) {
mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d46c69..9174146f6812 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2033,8 +2033,11 @@ static __latent_entropy struct task_struct *copy_process(
/*
* If the new process will be in a different time namespace
* do not allow it to share VM or a thread group with the forking task.
+ *
+ * On vfork, the child process enters the target time namespace only
+ * after exec.
*/
- if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
+ if ((clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) {
if (nsp->time_ns != nsp->time_ns_for_children)
return ERR_PTR(-EINVAL);
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index eec72ca962e2..b4cbb406bc28 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -179,7 +179,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
- timens_on_fork(new_ns, tsk);
+ if ((flags & CLONE_VM) == 0)
+ timens_on_fork(new_ns, tsk);
tsk->nsproxy = new_ns;
return 0;
--
2.35.1
* check that a child process is in parent's time namespace after vfork.
* check that a child process is in the target namespace after exec.
Output on success:
$ ./vfork_exec
1..1
ok 1 exec
Signed-off-by: Andrei Vagin <[email protected]>
---
tools/testing/selftests/timens/Makefile | 2 +-
tools/testing/selftests/timens/vfork_exec.c | 90 +++++++++++++++++++++
2 files changed, 91 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/timens/vfork_exec.c
diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile
index 3a5936cc10ab..f0d51d4d2c87 100644
--- a/tools/testing/selftests/timens/Makefile
+++ b/tools/testing/selftests/timens/Makefile
@@ -1,4 +1,4 @@
-TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex
+TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex vfork_exec
TEST_GEN_PROGS_EXTENDED := gettime_perf
CFLAGS := -Wall -Werror -pthread
diff --git a/tools/testing/selftests/timens/vfork_exec.c b/tools/testing/selftests/timens/vfork_exec.c
new file mode 100644
index 000000000000..e6ccd900f30a
--- /dev/null
+++ b/tools/testing/selftests/timens/vfork_exec.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "log.h"
+#include "timens.h"
+
+#define OFFSET (36000)
+
+int main(int argc, char *argv[])
+{
+ struct timespec now, tst;
+ int status, i;
+ pid_t pid;
+
+ if (argc > 1) {
+ if (sscanf(argv[1], "%ld", &now.tv_sec) != 1)
+ return pr_perror("sscanf");
+
+ for (i = 0; i < 2; i++) {
+ _gettime(CLOCK_MONOTONIC, &tst, i);
+ if (abs(tst.tv_sec - now.tv_sec) > 5)
+ return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec);
+ }
+ return 0;
+ }
+
+ nscheck();
+
+ ksft_set_plan(1);
+
+ clock_gettime(CLOCK_MONOTONIC, &now);
+
+ if (unshare_timens())
+ return 1;
+
+ if (_settime(CLOCK_MONOTONIC, OFFSET))
+ return 1;
+
+ for (i = 0; i < 2; i++) {
+ _gettime(CLOCK_MONOTONIC, &tst, i);
+ if (abs(tst.tv_sec - now.tv_sec) > 5)
+ return pr_fail("%ld %ld\n",
+ now.tv_sec, tst.tv_sec);
+ }
+
+ pid = vfork();
+ if (pid < 0)
+ return pr_perror("fork");
+
+ if (pid == 0) {
+ char now_str[64];
+ char *cargv[] = {"exec", now_str, NULL};
+ char *cenv[] = {NULL};
+
+ // Check that we are still in the source timens.
+ for (i = 0; i < 2; i++) {
+ _gettime(CLOCK_MONOTONIC, &tst, i);
+ if (abs(tst.tv_sec - now.tv_sec) > 5)
+ return pr_fail("%ld %ld\n",
+ now.tv_sec, tst.tv_sec);
+ }
+
+ /* Check for proper vvar offsets after execve. */
+ snprintf(now_str, sizeof(now_str), "%ld", now.tv_sec + OFFSET);
+ execve("/proc/self/exe", cargv, cenv);
+ return pr_perror("execve");
+ }
+
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("waitpid");
+
+ if (status)
+ ksft_exit_fail();
+
+ ksft_test_result_pass("exec\n");
+ ksft_exit_pass();
+ return 0;
+}
--
2.35.1
On Sun, Jun 12, 2022 at 11:07:22PM -0700, Andrei Vagin wrote:
> Right now, a new process can't be forked in another time namespace
> if it shares mm with its parent. It is prohibited, because each time
> namespace has its own vvar page that is mapped into a process address
> space.
>
> When a process calls exec, it gets a new mm and so it could be "legal"
> to switch time namespace in that case. This was not implemented and
> now if we want to do this, we need to add another clone flag to not
> break backward compatibility.
>
> We don't have any user requests to switch times on exec except the
> vfork+exec combination, so there is no reason to add a new clone flag.
> As for vfork+exec, this should be safe to allow switching timens with
> the current clone flag. Right now, vfork (CLONE_VFORK | CLONE_VM) fails
> if a child is forked into another time namespace. With this change,
> vfork creates a new process in parent's timens, and the following exec
> does the actual switch to the target time namespace.
This seems like a very special case. None of the other namespaces do
this, do they?
How is CLONE_NEWTIME supposed to be used today?
-Kees
--
Kees Cook
On Sun, Jun 12, 2022 at 11:07:22PM -0700, Andrei Vagin wrote:
> Right now, a new process can't be forked in another time namespace
> if it shares mm with its parent. It is prohibited, because each time
> namespace has its own vvar page that is mapped into a process address
> space.
>
> When a process calls exec, it gets a new mm and so it could be "legal"
> to switch time namespace in that case. This was not implemented and
> now if we want to do this, we need to add another clone flag to not
> break backward compatibility.
>
> We don't have any user requests to switch times on exec except the
> vfork+exec combination, so there is no reason to add a new clone flag.
> As for vfork+exec, this should be safe to allow switching timens with
> the current clone flag. Right now, vfork (CLONE_VFORK | CLONE_VM) fails
> if a child is forked into another time namespace. With this change,
> vfork creates a new process in parent's timens, and the following exec
> does the actual switch to the target time namespace.
>
> Suggested-by: Florian Weimer <[email protected]>
> Signed-off-by: Andrei Vagin <[email protected]>
> ---
Looks good,
Acked-by: Christian Brauner (Microsoft) <[email protected]>
On Tue, Jun 14, 2022 at 02:14:35PM -0700, Kees Cook wrote:
> On Sun, Jun 12, 2022 at 11:07:22PM -0700, Andrei Vagin wrote:
> > Right now, a new process can't be forked in another time namespace
> > if it shares mm with its parent. It is prohibited, because each time
> > namespace has its own vvar page that is mapped into a process address
> > space.
> >
> > When a process calls exec, it gets a new mm and so it could be "legal"
> > to switch time namespace in that case. This was not implemented and
> > now if we want to do this, we need to add another clone flag to not
> > break backward compatibility.
> >
> > We don't have any user requests to switch times on exec except the
> > vfork+exec combination, so there is no reason to add a new clone flag.
> > As for vfork+exec, this should be safe to allow switching timens with
> > the current clone flag. Right now, vfork (CLONE_VFORK | CLONE_VM) fails
> > if a child is forked into another time namespace. With this change,
> > vfork creates a new process in parent's timens, and the following exec
> > does the actual switch to the target time namespace.
>
> This seems like a very special case. None of the other namespaces do
> this, do they?
>
> How is CLONE_NEWTIME supposed to be used today?
Time namespaces are similar to pid namespaces. If a process calls
unshare(CLONE_NEWTIME) it will not change into a new time namespace.
Only the children of the process will.
You can also see this via /proc/<pid>/ns/time and
/proc/<pid>/ns/time_for_children. After an unshare(CLONE_NEWTIME)
/proc/<pid>/ns/time will be unchanged while
/proc/<pid>/ns/time_for_children will reference a new time namespace.
So if the process now calls fork() the child will be placed in a new
time namespace.
As Andrei correctly points out in the commit message each time namespace
gets it's own vvar page mapped into the process address space.
Consequently calls to clone*() with CLONE_VM will need to fail because
it would alter the parent's mm as well. That includes vfork() which is
roughly just CLONE_VM | CLONE_VFORK.
fork() remains unaffected. So anything that implements a process
launcher using vfork() needs to implement a fork() fallback after vfork
failure() in case the original process has unshared a new time
namespace.
As posix spawn is implemented using vfork() we would force glibc to
implement a fork() fallback and enforce the introducing of a lot of
complexity to work around this.
I think the proposal here makes sense and allows us to avoid introducing
yet another clone flag. For vfork() it also makes sense because the
calling process is suspended until exec or exit so the semantics between
fork() and clone*() are sufficiently distinct to justify this
difference. Iow, vfork() is distinctly targeted at enforcing an exec*
call already anyway.
Christian
* Kees Cook:
> On Sun, Jun 12, 2022 at 11:07:22PM -0700, Andrei Vagin wrote:
>> Right now, a new process can't be forked in another time namespace
>> if it shares mm with its parent. It is prohibited, because each time
>> namespace has its own vvar page that is mapped into a process address
>> space.
>>
>> When a process calls exec, it gets a new mm and so it could be "legal"
>> to switch time namespace in that case. This was not implemented and
>> now if we want to do this, we need to add another clone flag to not
>> break backward compatibility.
>>
>> We don't have any user requests to switch times on exec except the
>> vfork+exec combination, so there is no reason to add a new clone flag.
>> As for vfork+exec, this should be safe to allow switching timens with
>> the current clone flag. Right now, vfork (CLONE_VFORK | CLONE_VM) fails
>> if a child is forked into another time namespace. With this change,
>> vfork creates a new process in parent's timens, and the following exec
>> does the actual switch to the target time namespace.
>
> This seems like a very special case. None of the other namespaces do
> this, do they?
I think this started with CLONE_NEWPID, which had a similar delayed
effect with unshare: it happens only after fork, not for the current
process image. I think it's just a limitation of the unshare interface.
Some of the effects simply have to be delayed due to their nature.
Thanks,
Florian
On Wed, Jun 15, 2022 at 09:53:29AM +0200, Florian Weimer wrote:
> * Kees Cook:
>
> > On Sun, Jun 12, 2022 at 11:07:22PM -0700, Andrei Vagin wrote:
> >> Right now, a new process can't be forked in another time namespace
> >> if it shares mm with its parent. It is prohibited, because each time
> >> namespace has its own vvar page that is mapped into a process address
> >> space.
> >>
> >> When a process calls exec, it gets a new mm and so it could be "legal"
> >> to switch time namespace in that case. This was not implemented and
> >> now if we want to do this, we need to add another clone flag to not
> >> break backward compatibility.
> >>
> >> We don't have any user requests to switch times on exec except the
> >> vfork+exec combination, so there is no reason to add a new clone flag.
> >> As for vfork+exec, this should be safe to allow switching timens with
> >> the current clone flag. Right now, vfork (CLONE_VFORK | CLONE_VM) fails
> >> if a child is forked into another time namespace. With this change,
> >> vfork creates a new process in parent's timens, and the following exec
> >> does the actual switch to the target time namespace.
> >
> > This seems like a very special case. None of the other namespaces do
> > this, do they?
>
> I think this started with CLONE_NEWPID, which had a similar delayed
> effect with unshare: it happens only after fork, not for the current
> process image. I think it's just a limitation of the unshare interface.
> Some of the effects simply have to be delayed due to their nature.
I tried to give more context in another mail wrt to time namespaces
specifically.
For pid namespaces one problem would be that it could end up confusing a
process about its own pid. This was a more serious problem when the pid
cache was still active in glibc; but fwiw systemd still has a pid cache
afair.
Christian
* Christian Brauner:
> For pid namespaces one problem would be that it could end up confusing a
> process about its own pid. This was a more serious problem when the pid
> cache was still active in glibc; but fwiw systemd still has a pid cache
> afair.
Right. glibc still has a TID cache, mainly for use with recursive
mutexes (where we need a 32-bit thread identifier and can't perform a
system call on every locking operation for performance reasons).
Assuming that a non-delayed CLONE_NEWPID would also change the TID
underneath us, we'd have subtly broken recursive mutexes.
vfork gets away with not updating the TID cache (which is shared with
the parent process) because the parent process is suspended while the
new subprocess is still running and has not execve'ed yet.
Now one could argue that calling unshare automatically means that you
must not call any glibc functions afterwards (similar to thread-creating
clone), or at least that you cannot call any functions which are not
async-signal-safe, but that does not match existing application
practice. And I think we actually prefer that file servers call chroot
after unshare(CLONE_FS), rather than trying to reimplement restricted
pathname lookup in userspace.
Thanks,
Florian
On Wed, Jun 15, 2022 at 10:14:19AM +0200, Florian Weimer wrote:
> * Christian Brauner:
>
> > For pid namespaces one problem would be that it could end up confusing a
> > process about its own pid. This was a more serious problem when the pid
> > cache was still active in glibc; but fwiw systemd still has a pid cache
> > afair.
>
> Right. glibc still has a TID cache, mainly for use with recursive
> mutexes (where we need a 32-bit thread identifier and can't perform a
> system call on every locking operation for performance reasons).
> Assuming that a non-delayed CLONE_NEWPID would also change the TID
> underneath us, we'd have subtly broken recursive mutexes.
Fwiw, you can't call CLONE_NEWPID with CLONE_THREAD. This guarantees
that threads can send signals to each other and all threads within the
same threadgroup can be reached via proc. It'd be awkward if you'd have
a thread whose thread-group leader lives in an ancestor pidns.
Even if you'd make whole threadgroup change pid namespaces immediately
it would mean allocating new TGID and TIDs in the new pid namespaces -
unless they are accidently not already allocated.
>
> vfork gets away with not updating the TID cache (which is shared with
> the parent process) because the parent process is suspended while the
> new subprocess is still running and has not execve'ed yet.
>
> Now one could argue that calling unshare automatically means that you
> must not call any glibc functions afterwards (similar to thread-creating
> clone), or at least that you cannot call any functions which are not
> async-signal-safe, but that does not match existing application
> practice. And I think we actually prefer that file servers call chroot
Yeah, that'd be a rather subtle and risky change for pid namespaces.
On Sun, 12 Jun 2022 23:07:22 -0700, Andrei Vagin wrote:
> Right now, a new process can't be forked in another time namespace
> if it shares mm with its parent. It is prohibited, because each time
> namespace has its own vvar page that is mapped into a process address
> space.
>
> When a process calls exec, it gets a new mm and so it could be "legal"
> to switch time namespace in that case. This was not implemented and
> now if we want to do this, we need to add another clone flag to not
> break backward compatibility.
>
> [...]
Applied to for-next/execve, thanks!
[1/2] fs/exec: allow to unshare a time namespace on vfork+exec
https://git.kernel.org/kees/c/133e2d3e81de
[2/2] testing/timens: add a test for vfork+exit
https://git.kernel.org/kees/c/6342140db660
--
Kees Cook