This patch is the first step in enabling checkpoint/restore of processes
with seccomp enabled.
One of the things CRIU does while dumping tasks is inject code into them
via ptrace to collect information that is only available to the process
itself. However, if we are in a seccomp mode where these processes are
prohibited from making these syscalls, then what CRIU does kills the task.
This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
task from the init user namespace which has CAP_SYS_ADMIN to disable (and
re-enable) seccomp filters for another task so that they can be
successfully dumped (and restored).
Signed-off-by: Tycho Andersen <[email protected]>
CC: Kees Cook <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Will Drewry <[email protected]>
CC: Roland McGrath <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Pavel Emelyanov <[email protected]>
CC: Serge E. Hallyn <[email protected]>
---
include/linux/seccomp.h | 8 ++++++
include/uapi/linux/ptrace.h | 1 +
kernel/ptrace.c | 10 ++++++++
kernel/seccomp.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index a19ddac..7cc870f 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -25,6 +25,9 @@ struct seccomp_filter;
struct seccomp {
int mode;
struct seccomp_filter *filter;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ bool suspended;
+#endif
};
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
@@ -53,6 +56,11 @@ static inline int seccomp_mode(struct seccomp *s)
return s->mode;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+extern int suspend_seccomp(struct task_struct *);
+extern int resume_seccomp(struct task_struct *);
+#endif
+
#else /* CONFIG_SECCOMP */
#include <linux/errno.h>
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index cf1019e..8ba4e4f 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -17,6 +17,7 @@
#define PTRACE_CONT 7
#define PTRACE_KILL 8
#define PTRACE_SINGLESTEP 9
+#define PTRACE_SUSPEND_SECCOMP 10
#define PTRACE_ATTACH 16
#define PTRACE_DETACH 17
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e05..a6b6527 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -15,6 +15,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ptrace.h>
+#include <linux/seccomp.h>
#include <linux/security.h>
#include <linux/signal.h>
#include <linux/uio.h>
@@ -1003,6 +1004,15 @@ int ptrace_request(struct task_struct *child, long request,
break;
}
#endif
+
+#if defined(CONFIG_SECCOMP) && defined(CONFIG_CHECKPOINT_RESTORE)
+ case PTRACE_SUSPEND_SECCOMP:
+ if (data)
+ return suspend_seccomp(child);
+ else
+ return resume_seccomp(child);
+#endif
+
default:
break;
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 980fd26..a358a58 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -569,6 +569,7 @@ static int mode1_syscalls_32[] = {
static void __secure_computing_strict(int this_syscall)
{
int *syscall_whitelist = mode1_syscalls;
+
#ifdef CONFIG_COMPAT
if (is_compat_task())
syscall_whitelist = mode1_syscalls_32;
@@ -590,6 +591,11 @@ void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (current->seccomp.suspended)
+ return;
+#endif
+
if (mode == 0)
return;
else if (mode == SECCOMP_MODE_STRICT)
@@ -691,6 +697,11 @@ u32 seccomp_phase1(struct seccomp_data *sd)
int this_syscall = sd ? sd->nr :
syscall_get_nr(current, task_pt_regs(current));
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (unlikely(current->seccomp.suspended))
+ return SECCOMP_PHASE1_OK;
+#endif
+
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
@@ -769,7 +780,8 @@ static long seccomp_set_mode_strict(void)
goto out;
#ifdef TIF_NOTSC
- disable_TSC();
+ if (!current->seccomp.suspended)
+ disable_TSC();
#endif
seccomp_assign_mode(current, seccomp_mode);
ret = 0;
@@ -901,3 +913,51 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
/* prctl interface doesn't have flags, so they are always zero. */
return do_seccomp(op, 0, uargs);
}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+int suspend_seccomp(struct task_struct *task)
+{
+ int ret = -EACCES;
+
+ spin_lock_irq(&task->sighand->siglock);
+
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ task->seccomp.suspended = true;
+
+#ifdef TIF_NOTSC
+ if (task->seccomp.mode == SECCOMP_MODE_STRICT)
+ clear_tsk_thread_flag(task, TIF_NOTSC);
+#endif
+
+ ret = 0;
+out:
+ spin_unlock_irq(&task->sighand->siglock);
+
+ return ret;
+}
+
+int resume_seccomp(struct task_struct *task)
+{
+ int ret = -EACCES;
+
+ spin_lock_irq(&task->sighand->siglock);
+
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ task->seccomp.suspended = false;
+
+#ifdef TIF_NOTSC
+ if (task->seccomp.mode == SECCOMP_MODE_STRICT)
+ set_tsk_thread_flag(task, TIF_NOTSC);
+#endif
+
+ ret = 0;
+out:
+ spin_unlock_irq(&task->sighand->siglock);
+
+ return ret;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
--
2.1.4
On Mon, Jun 1, 2015 at 12:28 PM, Tycho Andersen
<[email protected]> wrote:
> This patch is the first step in enabling checkpoint/restore of processes
> with seccomp enabled.
>
> One of the things CRIU does while dumping tasks is inject code into them
> via ptrace to collect information that is only available to the process
> itself. However, if we are in a seccomp mode where these processes are
> prohibited from making these syscalls, then what CRIU does kills the task.
>
> This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
> task from the init user namespace which has CAP_SYS_ADMIN to disable (and
> re-enable) seccomp filters for another task so that they can be
> successfully dumped (and restored).
>
> Signed-off-by: Tycho Andersen <[email protected]>
> CC: Kees Cook <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Will Drewry <[email protected]>
> CC: Roland McGrath <[email protected]>
> CC: Oleg Nesterov <[email protected]>
> CC: Pavel Emelyanov <[email protected]>
> CC: Serge E. Hallyn <[email protected]>
> ---
> include/linux/seccomp.h | 8 ++++++
> include/uapi/linux/ptrace.h | 1 +
> kernel/ptrace.c | 10 ++++++++
> kernel/seccomp.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-
> 4 files changed, 80 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index a19ddac..7cc870f 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -25,6 +25,9 @@ struct seccomp_filter;
> struct seccomp {
> int mode;
> struct seccomp_filter *filter;
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> + bool suspended;
> +#endif
> };
>
> #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
> @@ -53,6 +56,11 @@ static inline int seccomp_mode(struct seccomp *s)
> return s->mode;
> }
>
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> +extern int suspend_seccomp(struct task_struct *);
> +extern int resume_seccomp(struct task_struct *);
> +#endif
> +
> #else /* CONFIG_SECCOMP */
>
> #include <linux/errno.h>
> diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> index cf1019e..8ba4e4f 100644
> --- a/include/uapi/linux/ptrace.h
> +++ b/include/uapi/linux/ptrace.h
> @@ -17,6 +17,7 @@
> #define PTRACE_CONT 7
> #define PTRACE_KILL 8
> #define PTRACE_SINGLESTEP 9
> +#define PTRACE_SUSPEND_SECCOMP 10
>
> #define PTRACE_ATTACH 16
> #define PTRACE_DETACH 17
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index c8e0e05..a6b6527 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -15,6 +15,7 @@
> #include <linux/highmem.h>
> #include <linux/pagemap.h>
> #include <linux/ptrace.h>
> +#include <linux/seccomp.h>
> #include <linux/security.h>
> #include <linux/signal.h>
> #include <linux/uio.h>
> @@ -1003,6 +1004,15 @@ int ptrace_request(struct task_struct *child, long request,
> break;
> }
> #endif
> +
> +#if defined(CONFIG_SECCOMP) && defined(CONFIG_CHECKPOINT_RESTORE)
> + case PTRACE_SUSPEND_SECCOMP:
> + if (data)
> + return suspend_seccomp(child);
> + else
> + return resume_seccomp(child);
> +#endif
> +
> default:
> break;
> }
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 980fd26..a358a58 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -569,6 +569,7 @@ static int mode1_syscalls_32[] = {
> static void __secure_computing_strict(int this_syscall)
> {
> int *syscall_whitelist = mode1_syscalls;
> +
> #ifdef CONFIG_COMPAT
> if (is_compat_task())
> syscall_whitelist = mode1_syscalls_32;
> @@ -590,6 +591,11 @@ void secure_computing_strict(int this_syscall)
> {
> int mode = current->seccomp.mode;
>
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> + if (current->seccomp.suspended)
> + return;
> +#endif
IMO it's unfortunate that this has any runtime overhead at all. Can
it be suspend be multiplexed into mode to avoid this?
>
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> + if (unlikely(current->seccomp.suspended))
> + return SECCOMP_PHASE1_OK;
> +#endif
> +
Ditto.
> @@ -769,7 +780,8 @@ static long seccomp_set_mode_strict(void)
> goto out;
>
> #ifdef TIF_NOTSC
> - disable_TSC();
> + if (!current->seccomp.suspended)
> + disable_TSC();
> #endif
If you get here with seccomp suspended, you have already utterly
failed. Please don't try to pretend that you're going to do something
sensible if this happens.
CRIU needs to freeze the target task reliably before suspending
seccomp to avoid massive security holes.
> seccomp_assign_mode(current, seccomp_mode);
> ret = 0;
> @@ -901,3 +913,51 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
> /* prctl interface doesn't have flags, so they are always zero. */
> return do_seccomp(op, 0, uargs);
> }
> +
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> +int suspend_seccomp(struct task_struct *task)
> +{
> + int ret = -EACCES;
> +
> + spin_lock_irq(&task->sighand->siglock);
> +
> + if (!capable(CAP_SYS_ADMIN))
> + goto out;
> +
> + task->seccomp.suspended = true;
> +
> +#ifdef TIF_NOTSC
> + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> + clear_tsk_thread_flag(task, TIF_NOTSC);
> +#endif
And what if the task is running?
> +
> + ret = 0;
> +out:
> + spin_unlock_irq(&task->sighand->siglock);
> +
> + return ret;
> +}
> +
> +int resume_seccomp(struct task_struct *task)
> +{
> + int ret = -EACCES;
> +
> + spin_lock_irq(&task->sighand->siglock);
> +
> + if (!capable(CAP_SYS_ADMIN))
> + goto out;
> +
> + task->seccomp.suspended = false;
> +
> +#ifdef TIF_NOTSC
> + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> + set_tsk_thread_flag(task, TIF_NOTSC);
> +#endif
Ditto. Or can the task not be running here?
--Andy
On Mon, Jun 01, 2015 at 12:38:57PM -0700, Andy Lutomirski wrote:
> On Mon, Jun 1, 2015 at 12:28 PM, Tycho Andersen
> <[email protected]> wrote:
> > This patch is the first step in enabling checkpoint/restore of processes
> > with seccomp enabled.
> >
> > One of the things CRIU does while dumping tasks is inject code into them
> > via ptrace to collect information that is only available to the process
> > itself. However, if we are in a seccomp mode where these processes are
> > prohibited from making these syscalls, then what CRIU does kills the task.
> >
> > This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
> > task from the init user namespace which has CAP_SYS_ADMIN to disable (and
> > re-enable) seccomp filters for another task so that they can be
> > successfully dumped (and restored).
> >
> > Signed-off-by: Tycho Andersen <[email protected]>
> > CC: Kees Cook <[email protected]>
> > CC: Andy Lutomirski <[email protected]>
> > CC: Will Drewry <[email protected]>
> > CC: Roland McGrath <[email protected]>
> > CC: Oleg Nesterov <[email protected]>
> > CC: Pavel Emelyanov <[email protected]>
> > CC: Serge E. Hallyn <[email protected]>
> > ---
> > include/linux/seccomp.h | 8 ++++++
> > include/uapi/linux/ptrace.h | 1 +
> > kernel/ptrace.c | 10 ++++++++
> > kernel/seccomp.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-
> > 4 files changed, 80 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> > index a19ddac..7cc870f 100644
> > --- a/include/linux/seccomp.h
> > +++ b/include/linux/seccomp.h
> > @@ -25,6 +25,9 @@ struct seccomp_filter;
> > struct seccomp {
> > int mode;
> > struct seccomp_filter *filter;
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > + bool suspended;
> > +#endif
> > };
> >
> > #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
> > @@ -53,6 +56,11 @@ static inline int seccomp_mode(struct seccomp *s)
> > return s->mode;
> > }
> >
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > +extern int suspend_seccomp(struct task_struct *);
> > +extern int resume_seccomp(struct task_struct *);
> > +#endif
> > +
> > #else /* CONFIG_SECCOMP */
> >
> > #include <linux/errno.h>
> > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> > index cf1019e..8ba4e4f 100644
> > --- a/include/uapi/linux/ptrace.h
> > +++ b/include/uapi/linux/ptrace.h
> > @@ -17,6 +17,7 @@
> > #define PTRACE_CONT 7
> > #define PTRACE_KILL 8
> > #define PTRACE_SINGLESTEP 9
> > +#define PTRACE_SUSPEND_SECCOMP 10
> >
> > #define PTRACE_ATTACH 16
> > #define PTRACE_DETACH 17
> > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > index c8e0e05..a6b6527 100644
> > --- a/kernel/ptrace.c
> > +++ b/kernel/ptrace.c
> > @@ -15,6 +15,7 @@
> > #include <linux/highmem.h>
> > #include <linux/pagemap.h>
> > #include <linux/ptrace.h>
> > +#include <linux/seccomp.h>
> > #include <linux/security.h>
> > #include <linux/signal.h>
> > #include <linux/uio.h>
> > @@ -1003,6 +1004,15 @@ int ptrace_request(struct task_struct *child, long request,
> > break;
> > }
> > #endif
> > +
> > +#if defined(CONFIG_SECCOMP) && defined(CONFIG_CHECKPOINT_RESTORE)
> > + case PTRACE_SUSPEND_SECCOMP:
> > + if (data)
> > + return suspend_seccomp(child);
> > + else
> > + return resume_seccomp(child);
> > +#endif
> > +
> > default:
> > break;
> > }
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index 980fd26..a358a58 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -569,6 +569,7 @@ static int mode1_syscalls_32[] = {
> > static void __secure_computing_strict(int this_syscall)
> > {
> > int *syscall_whitelist = mode1_syscalls;
> > +
> > #ifdef CONFIG_COMPAT
> > if (is_compat_task())
> > syscall_whitelist = mode1_syscalls_32;
> > @@ -590,6 +591,11 @@ void secure_computing_strict(int this_syscall)
> > {
> > int mode = current->seccomp.mode;
> >
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > + if (current->seccomp.suspended)
> > + return;
> > +#endif
>
> IMO it's unfortunate that this has any runtime overhead at all. Can
> it be suspend be multiplexed into mode to avoid this?
>
> >
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > + if (unlikely(current->seccomp.suspended))
> > + return SECCOMP_PHASE1_OK;
> > +#endif
> > +
>
> Ditto.
>
> > @@ -769,7 +780,8 @@ static long seccomp_set_mode_strict(void)
> > goto out;
> >
> > #ifdef TIF_NOTSC
> > - disable_TSC();
> > + if (!current->seccomp.suspended)
> > + disable_TSC();
> > #endif
>
> If you get here with seccomp suspended, you have already utterly
> failed. Please don't try to pretend that you're going to do something
> sensible if this happens.
>
> CRIU needs to freeze the target task reliably before suspending
> seccomp to avoid massive security holes.
Sorry, I should probably have mentioned this. It actually useful on
restore: the problem is that CRIU maps its restorer code into memory,
does all of the necessary work to restore (including restore the
seccomp stuff), and then tries to unmap that blob and gets killed.
So, the criu restore procedure is to suspend seccomp, restore the
seccomp state, and then let criu resume the seccomp state just before
the final task resume.
> > seccomp_assign_mode(current, seccomp_mode);
> > ret = 0;
> > @@ -901,3 +913,51 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
> > /* prctl interface doesn't have flags, so they are always zero. */
> > return do_seccomp(op, 0, uargs);
> > }
> > +
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > +int suspend_seccomp(struct task_struct *task)
> > +{
> > + int ret = -EACCES;
> > +
> > + spin_lock_irq(&task->sighand->siglock);
> > +
> > + if (!capable(CAP_SYS_ADMIN))
> > + goto out;
> > +
> > + task->seccomp.suspended = true;
> > +
> > +#ifdef TIF_NOTSC
> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> > + clear_tsk_thread_flag(task, TIF_NOTSC);
> > +#endif
>
> And what if the task is running?
>
> > +
> > + ret = 0;
> > +out:
> > + spin_unlock_irq(&task->sighand->siglock);
> > +
> > + return ret;
> > +}
> > +
> > +int resume_seccomp(struct task_struct *task)
> > +{
> > + int ret = -EACCES;
> > +
> > + spin_lock_irq(&task->sighand->siglock);
> > +
> > + if (!capable(CAP_SYS_ADMIN))
> > + goto out;
> > +
> > + task->seccomp.suspended = false;
> > +
> > +#ifdef TIF_NOTSC
> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> > + set_tsk_thread_flag(task, TIF_NOTSC);
> > +#endif
>
> Ditto. Or can the task not be running here?
It is stopped since ptrace requires it to be stopped; I don't know if
that's enough to guarantee correctness, though. Is there some
additional barrier that is needed?
Tycho
> --Andy
On Mon, Jun 1, 2015 at 12:47 PM, Tycho Andersen
<[email protected]> wrote:
> On Mon, Jun 01, 2015 at 12:38:57PM -0700, Andy Lutomirski wrote:
>> On Mon, Jun 1, 2015 at 12:28 PM, Tycho Andersen
>> <[email protected]> wrote:
>> > This patch is the first step in enabling checkpoint/restore of processes
>> > with seccomp enabled.
>> >
>> > One of the things CRIU does while dumping tasks is inject code into them
>> > via ptrace to collect information that is only available to the process
>> > itself. However, if we are in a seccomp mode where these processes are
>> > prohibited from making these syscalls, then what CRIU does kills the task.
>> >
>> > This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
>> > task from the init user namespace which has CAP_SYS_ADMIN to disable (and
>> > re-enable) seccomp filters for another task so that they can be
>> > successfully dumped (and restored).
>> >
>> > Signed-off-by: Tycho Andersen <[email protected]>
>> > CC: Kees Cook <[email protected]>
>> > CC: Andy Lutomirski <[email protected]>
>> > CC: Will Drewry <[email protected]>
>> > CC: Roland McGrath <[email protected]>
>> > CC: Oleg Nesterov <[email protected]>
>> > CC: Pavel Emelyanov <[email protected]>
>> > CC: Serge E. Hallyn <[email protected]>
>> > ---
>> > include/linux/seccomp.h | 8 ++++++
>> > include/uapi/linux/ptrace.h | 1 +
>> > kernel/ptrace.c | 10 ++++++++
>> > kernel/seccomp.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-
>> > 4 files changed, 80 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> > index a19ddac..7cc870f 100644
>> > --- a/include/linux/seccomp.h
>> > +++ b/include/linux/seccomp.h
>> > @@ -25,6 +25,9 @@ struct seccomp_filter;
>> > struct seccomp {
>> > int mode;
>> > struct seccomp_filter *filter;
>> > +#ifdef CONFIG_CHECKPOINT_RESTORE
>> > + bool suspended;
>> > +#endif
>> > };
>> >
>> > #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
>> > @@ -53,6 +56,11 @@ static inline int seccomp_mode(struct seccomp *s)
>> > return s->mode;
>> > }
>> >
>> > +#ifdef CONFIG_CHECKPOINT_RESTORE
>> > +extern int suspend_seccomp(struct task_struct *);
>> > +extern int resume_seccomp(struct task_struct *);
>> > +#endif
>> > +
>> > #else /* CONFIG_SECCOMP */
>> >
>> > #include <linux/errno.h>
>> > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
>> > index cf1019e..8ba4e4f 100644
>> > --- a/include/uapi/linux/ptrace.h
>> > +++ b/include/uapi/linux/ptrace.h
>> > @@ -17,6 +17,7 @@
>> > #define PTRACE_CONT 7
>> > #define PTRACE_KILL 8
>> > #define PTRACE_SINGLESTEP 9
>> > +#define PTRACE_SUSPEND_SECCOMP 10
>> >
>> > #define PTRACE_ATTACH 16
>> > #define PTRACE_DETACH 17
>> > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
>> > index c8e0e05..a6b6527 100644
>> > --- a/kernel/ptrace.c
>> > +++ b/kernel/ptrace.c
>> > @@ -15,6 +15,7 @@
>> > #include <linux/highmem.h>
>> > #include <linux/pagemap.h>
>> > #include <linux/ptrace.h>
>> > +#include <linux/seccomp.h>
>> > #include <linux/security.h>
>> > #include <linux/signal.h>
>> > #include <linux/uio.h>
>> > @@ -1003,6 +1004,15 @@ int ptrace_request(struct task_struct *child, long request,
>> > break;
>> > }
>> > #endif
>> > +
>> > +#if defined(CONFIG_SECCOMP) && defined(CONFIG_CHECKPOINT_RESTORE)
>> > + case PTRACE_SUSPEND_SECCOMP:
>> > + if (data)
>> > + return suspend_seccomp(child);
>> > + else
>> > + return resume_seccomp(child);
>> > +#endif
>> > +
>> > default:
>> > break;
>> > }
>> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> > index 980fd26..a358a58 100644
>> > --- a/kernel/seccomp.c
>> > +++ b/kernel/seccomp.c
>> > @@ -569,6 +569,7 @@ static int mode1_syscalls_32[] = {
>> > static void __secure_computing_strict(int this_syscall)
>> > {
>> > int *syscall_whitelist = mode1_syscalls;
>> > +
>> > #ifdef CONFIG_COMPAT
>> > if (is_compat_task())
>> > syscall_whitelist = mode1_syscalls_32;
>> > @@ -590,6 +591,11 @@ void secure_computing_strict(int this_syscall)
>> > {
>> > int mode = current->seccomp.mode;
>> >
>> > +#ifdef CONFIG_CHECKPOINT_RESTORE
>> > + if (current->seccomp.suspended)
>> > + return;
>> > +#endif
>>
>> IMO it's unfortunate that this has any runtime overhead at all. Can
>> it be suspend be multiplexed into mode to avoid this?
>>
>> >
>> > +#ifdef CONFIG_CHECKPOINT_RESTORE
>> > + if (unlikely(current->seccomp.suspended))
>> > + return SECCOMP_PHASE1_OK;
>> > +#endif
>> > +
>>
>> Ditto.
>>
>> > @@ -769,7 +780,8 @@ static long seccomp_set_mode_strict(void)
>> > goto out;
>> >
>> > #ifdef TIF_NOTSC
>> > - disable_TSC();
>> > + if (!current->seccomp.suspended)
>> > + disable_TSC();
>> > #endif
>>
>> If you get here with seccomp suspended, you have already utterly
>> failed. Please don't try to pretend that you're going to do something
>> sensible if this happens.
>>
>> CRIU needs to freeze the target task reliably before suspending
>> seccomp to avoid massive security holes.
>
> Sorry, I should probably have mentioned this. It actually useful on
> restore: the problem is that CRIU maps its restorer code into memory,
> does all of the necessary work to restore (including restore the
> seccomp stuff), and then tries to unmap that blob and gets killed.
>
> So, the criu restore procedure is to suspend seccomp, restore the
> seccomp state, and then let criu resume the seccomp state just before
> the final task resume.
>
>> > seccomp_assign_mode(current, seccomp_mode);
>> > ret = 0;
>> > @@ -901,3 +913,51 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
>> > /* prctl interface doesn't have flags, so they are always zero. */
>> > return do_seccomp(op, 0, uargs);
>> > }
>> > +
>> > +#ifdef CONFIG_CHECKPOINT_RESTORE
>> > +int suspend_seccomp(struct task_struct *task)
>> > +{
>> > + int ret = -EACCES;
>> > +
>> > + spin_lock_irq(&task->sighand->siglock);
>> > +
>> > + if (!capable(CAP_SYS_ADMIN))
>> > + goto out;
>> > +
>> > + task->seccomp.suspended = true;
>> > +
>> > +#ifdef TIF_NOTSC
>> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
>> > + clear_tsk_thread_flag(task, TIF_NOTSC);
>> > +#endif
>>
>> And what if the task is running?
>>
>> > +
>> > + ret = 0;
>> > +out:
>> > + spin_unlock_irq(&task->sighand->siglock);
>> > +
>> > + return ret;
>> > +}
>> > +
>> > +int resume_seccomp(struct task_struct *task)
>> > +{
>> > + int ret = -EACCES;
>> > +
>> > + spin_lock_irq(&task->sighand->siglock);
>> > +
>> > + if (!capable(CAP_SYS_ADMIN))
>> > + goto out;
>> > +
>> > + task->seccomp.suspended = false;
>> > +
>> > +#ifdef TIF_NOTSC
>> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
>> > + set_tsk_thread_flag(task, TIF_NOTSC);
>> > +#endif
>>
>> Ditto. Or can the task not be running here?
>
> It is stopped since ptrace requires it to be stopped; I don't know if
> that's enough to guarantee correctness, though. Is there some
> additional barrier that is needed?
Dunno. Does ptrace actually guarantee that for new operations?
In any event, I'm not sure I see why you need to manipulate NOTSC like
this at all. What goes wrong if you let the NOTSC take effect
immediately?
--Andy
On Mon, Jun 01, 2015 at 12:38:57PM -0700, Andy Lutomirski wrote:
> On Mon, Jun 1, 2015 at 12:28 PM, Tycho Andersen
> <[email protected]> wrote:
> > This patch is the first step in enabling checkpoint/restore of processes
> > with seccomp enabled.
> >
> > One of the things CRIU does while dumping tasks is inject code into them
> > via ptrace to collect information that is only available to the process
> > itself. However, if we are in a seccomp mode where these processes are
> > prohibited from making these syscalls, then what CRIU does kills the task.
> >
> > This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
> > task from the init user namespace which has CAP_SYS_ADMIN to disable (and
> > re-enable) seccomp filters for another task so that they can be
> > successfully dumped (and restored).
> >
> > Signed-off-by: Tycho Andersen <[email protected]>
> > CC: Kees Cook <[email protected]>
> > CC: Andy Lutomirski <[email protected]>
> > CC: Will Drewry <[email protected]>
> > CC: Roland McGrath <[email protected]>
> > CC: Oleg Nesterov <[email protected]>
> > CC: Pavel Emelyanov <[email protected]>
> > CC: Serge E. Hallyn <[email protected]>
> > ---
> > include/linux/seccomp.h | 8 ++++++
> > include/uapi/linux/ptrace.h | 1 +
> > kernel/ptrace.c | 10 ++++++++
> > kernel/seccomp.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-
> > 4 files changed, 80 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> > index a19ddac..7cc870f 100644
> > --- a/include/linux/seccomp.h
> > +++ b/include/linux/seccomp.h
> > @@ -25,6 +25,9 @@ struct seccomp_filter;
> > struct seccomp {
> > int mode;
> > struct seccomp_filter *filter;
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > + bool suspended;
> > +#endif
> > };
> >
> > #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
> > @@ -53,6 +56,11 @@ static inline int seccomp_mode(struct seccomp *s)
> > return s->mode;
> > }
> >
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > +extern int suspend_seccomp(struct task_struct *);
> > +extern int resume_seccomp(struct task_struct *);
> > +#endif
> > +
> > #else /* CONFIG_SECCOMP */
> >
> > #include <linux/errno.h>
> > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> > index cf1019e..8ba4e4f 100644
> > --- a/include/uapi/linux/ptrace.h
> > +++ b/include/uapi/linux/ptrace.h
> > @@ -17,6 +17,7 @@
> > #define PTRACE_CONT 7
> > #define PTRACE_KILL 8
> > #define PTRACE_SINGLESTEP 9
> > +#define PTRACE_SUSPEND_SECCOMP 10
> >
> > #define PTRACE_ATTACH 16
> > #define PTRACE_DETACH 17
> > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > index c8e0e05..a6b6527 100644
> > --- a/kernel/ptrace.c
> > +++ b/kernel/ptrace.c
> > @@ -15,6 +15,7 @@
> > #include <linux/highmem.h>
> > #include <linux/pagemap.h>
> > #include <linux/ptrace.h>
> > +#include <linux/seccomp.h>
> > #include <linux/security.h>
> > #include <linux/signal.h>
> > #include <linux/uio.h>
> > @@ -1003,6 +1004,15 @@ int ptrace_request(struct task_struct *child, long request,
> > break;
> > }
> > #endif
> > +
> > +#if defined(CONFIG_SECCOMP) && defined(CONFIG_CHECKPOINT_RESTORE)
> > + case PTRACE_SUSPEND_SECCOMP:
> > + if (data)
> > + return suspend_seccomp(child);
> > + else
> > + return resume_seccomp(child);
> > +#endif
> > +
> > default:
> > break;
> > }
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index 980fd26..a358a58 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -569,6 +569,7 @@ static int mode1_syscalls_32[] = {
> > static void __secure_computing_strict(int this_syscall)
> > {
> > int *syscall_whitelist = mode1_syscalls;
> > +
> > #ifdef CONFIG_COMPAT
> > if (is_compat_task())
> > syscall_whitelist = mode1_syscalls_32;
> > @@ -590,6 +591,11 @@ void secure_computing_strict(int this_syscall)
> > {
> > int mode = current->seccomp.mode;
> >
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > + if (current->seccomp.suspended)
> > + return;
> > +#endif
>
> IMO it's unfortunate that this has any runtime overhead at all. Can
> it be suspend be multiplexed into mode to avoid this?
Yes, I can make that change. Something like:
#define SECCOMP_SUSPENDED 0x8000000
and then masking it off at the appropriate places?
Tycho
On Mon, Jun 01, 2015 at 12:51:12PM -0700, Andy Lutomirski wrote:
> On Mon, Jun 1, 2015 at 12:47 PM, Tycho Andersen
> <[email protected]> wrote:
> > On Mon, Jun 01, 2015 at 12:38:57PM -0700, Andy Lutomirski wrote:
> >> On Mon, Jun 1, 2015 at 12:28 PM, Tycho Andersen
> >> <[email protected]> wrote:
> >> > This patch is the first step in enabling checkpoint/restore of processes
> >> > with seccomp enabled.
> >> >
> >> > One of the things CRIU does while dumping tasks is inject code into them
> >> > via ptrace to collect information that is only available to the process
> >> > itself. However, if we are in a seccomp mode where these processes are
> >> > prohibited from making these syscalls, then what CRIU does kills the task.
> >> >
> >> > This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
> >> > task from the init user namespace which has CAP_SYS_ADMIN to disable (and
> >> > re-enable) seccomp filters for another task so that they can be
> >> > successfully dumped (and restored).
> >> >
> >> > Signed-off-by: Tycho Andersen <[email protected]>
> >> > CC: Kees Cook <[email protected]>
> >> > CC: Andy Lutomirski <[email protected]>
> >> > CC: Will Drewry <[email protected]>
> >> > CC: Roland McGrath <[email protected]>
> >> > CC: Oleg Nesterov <[email protected]>
> >> > CC: Pavel Emelyanov <[email protected]>
> >> > CC: Serge E. Hallyn <[email protected]>
> >> > ---
> >> > include/linux/seccomp.h | 8 ++++++
> >> > include/uapi/linux/ptrace.h | 1 +
> >> > kernel/ptrace.c | 10 ++++++++
> >> > kernel/seccomp.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-
> >> > 4 files changed, 80 insertions(+), 1 deletion(-)
> >> >
> >> > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> >> > index a19ddac..7cc870f 100644
> >> > --- a/include/linux/seccomp.h
> >> > +++ b/include/linux/seccomp.h
> >> > @@ -25,6 +25,9 @@ struct seccomp_filter;
> >> > struct seccomp {
> >> > int mode;
> >> > struct seccomp_filter *filter;
> >> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> >> > + bool suspended;
> >> > +#endif
> >> > };
> >> >
> >> > #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
> >> > @@ -53,6 +56,11 @@ static inline int seccomp_mode(struct seccomp *s)
> >> > return s->mode;
> >> > }
> >> >
> >> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> >> > +extern int suspend_seccomp(struct task_struct *);
> >> > +extern int resume_seccomp(struct task_struct *);
> >> > +#endif
> >> > +
> >> > #else /* CONFIG_SECCOMP */
> >> >
> >> > #include <linux/errno.h>
> >> > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> >> > index cf1019e..8ba4e4f 100644
> >> > --- a/include/uapi/linux/ptrace.h
> >> > +++ b/include/uapi/linux/ptrace.h
> >> > @@ -17,6 +17,7 @@
> >> > #define PTRACE_CONT 7
> >> > #define PTRACE_KILL 8
> >> > #define PTRACE_SINGLESTEP 9
> >> > +#define PTRACE_SUSPEND_SECCOMP 10
> >> >
> >> > #define PTRACE_ATTACH 16
> >> > #define PTRACE_DETACH 17
> >> > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> >> > index c8e0e05..a6b6527 100644
> >> > --- a/kernel/ptrace.c
> >> > +++ b/kernel/ptrace.c
> >> > @@ -15,6 +15,7 @@
> >> > #include <linux/highmem.h>
> >> > #include <linux/pagemap.h>
> >> > #include <linux/ptrace.h>
> >> > +#include <linux/seccomp.h>
> >> > #include <linux/security.h>
> >> > #include <linux/signal.h>
> >> > #include <linux/uio.h>
> >> > @@ -1003,6 +1004,15 @@ int ptrace_request(struct task_struct *child, long request,
> >> > break;
> >> > }
> >> > #endif
> >> > +
> >> > +#if defined(CONFIG_SECCOMP) && defined(CONFIG_CHECKPOINT_RESTORE)
> >> > + case PTRACE_SUSPEND_SECCOMP:
> >> > + if (data)
> >> > + return suspend_seccomp(child);
> >> > + else
> >> > + return resume_seccomp(child);
> >> > +#endif
> >> > +
> >> > default:
> >> > break;
> >> > }
> >> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> >> > index 980fd26..a358a58 100644
> >> > --- a/kernel/seccomp.c
> >> > +++ b/kernel/seccomp.c
> >> > @@ -569,6 +569,7 @@ static int mode1_syscalls_32[] = {
> >> > static void __secure_computing_strict(int this_syscall)
> >> > {
> >> > int *syscall_whitelist = mode1_syscalls;
> >> > +
> >> > #ifdef CONFIG_COMPAT
> >> > if (is_compat_task())
> >> > syscall_whitelist = mode1_syscalls_32;
> >> > @@ -590,6 +591,11 @@ void secure_computing_strict(int this_syscall)
> >> > {
> >> > int mode = current->seccomp.mode;
> >> >
> >> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> >> > + if (current->seccomp.suspended)
> >> > + return;
> >> > +#endif
> >>
> >> IMO it's unfortunate that this has any runtime overhead at all. Can
> >> it be suspend be multiplexed into mode to avoid this?
> >>
> >> >
> >> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> >> > + if (unlikely(current->seccomp.suspended))
> >> > + return SECCOMP_PHASE1_OK;
> >> > +#endif
> >> > +
> >>
> >> Ditto.
> >>
> >> > @@ -769,7 +780,8 @@ static long seccomp_set_mode_strict(void)
> >> > goto out;
> >> >
> >> > #ifdef TIF_NOTSC
> >> > - disable_TSC();
> >> > + if (!current->seccomp.suspended)
> >> > + disable_TSC();
> >> > #endif
> >>
> >> If you get here with seccomp suspended, you have already utterly
> >> failed. Please don't try to pretend that you're going to do something
> >> sensible if this happens.
> >>
> >> CRIU needs to freeze the target task reliably before suspending
> >> seccomp to avoid massive security holes.
> >
> > Sorry, I should probably have mentioned this. It actually useful on
> > restore: the problem is that CRIU maps its restorer code into memory,
> > does all of the necessary work to restore (including restore the
> > seccomp stuff), and then tries to unmap that blob and gets killed.
> >
> > So, the criu restore procedure is to suspend seccomp, restore the
> > seccomp state, and then let criu resume the seccomp state just before
> > the final task resume.
> >
> >> > seccomp_assign_mode(current, seccomp_mode);
> >> > ret = 0;
> >> > @@ -901,3 +913,51 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
> >> > /* prctl interface doesn't have flags, so they are always zero. */
> >> > return do_seccomp(op, 0, uargs);
> >> > }
> >> > +
> >> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> >> > +int suspend_seccomp(struct task_struct *task)
> >> > +{
> >> > + int ret = -EACCES;
> >> > +
> >> > + spin_lock_irq(&task->sighand->siglock);
> >> > +
> >> > + if (!capable(CAP_SYS_ADMIN))
> >> > + goto out;
> >> > +
> >> > + task->seccomp.suspended = true;
> >> > +
> >> > +#ifdef TIF_NOTSC
> >> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> >> > + clear_tsk_thread_flag(task, TIF_NOTSC);
> >> > +#endif
> >>
> >> And what if the task is running?
> >>
> >> > +
> >> > + ret = 0;
> >> > +out:
> >> > + spin_unlock_irq(&task->sighand->siglock);
> >> > +
> >> > + return ret;
> >> > +}
> >> > +
> >> > +int resume_seccomp(struct task_struct *task)
> >> > +{
> >> > + int ret = -EACCES;
> >> > +
> >> > + spin_lock_irq(&task->sighand->siglock);
> >> > +
> >> > + if (!capable(CAP_SYS_ADMIN))
> >> > + goto out;
> >> > +
> >> > + task->seccomp.suspended = false;
> >> > +
> >> > +#ifdef TIF_NOTSC
> >> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> >> > + set_tsk_thread_flag(task, TIF_NOTSC);
> >> > +#endif
> >>
> >> Ditto. Or can the task not be running here?
> >
> > It is stopped since ptrace requires it to be stopped; I don't know if
> > that's enough to guarantee correctness, though. Is there some
> > additional barrier that is needed?
>
> Dunno. Does ptrace actually guarantee that for new operations?
It seems to; it kept giving me ESRCH when I didn't wait for it to
stop. I'll poke around and see if I can confirm this via the code.
> In any event, I'm not sure I see why you need to manipulate NOTSC like
> this at all. What goes wrong if you let the NOTSC take effect
> immediately?
When criu uses the TSC between the time the seccomp state is restored
and when it is resumed that task is killed. We could do this in CRIU
in such a way as to not use the TSC (do the seccomp restore as the
_very_ last thing before the final sigreturn), but it seemed like a
hack to me since seccomp was the thing enabling the TSC protection, so
presumably the seccomp suspend flag should suspend it as well.
As for what code is actually using the TSC, I guess the log code since
it prints stuff with timestamps, although I didn't look into it
closely.
Tycho
2015-06-01 22:28 GMT+03:00 Tycho Andersen <[email protected]>:
> This patch is the first step in enabling checkpoint/restore of processes
> with seccomp enabled.
>
> One of the things CRIU does while dumping tasks is inject code into them
> via ptrace to collect information that is only available to the process
> itself. However, if we are in a seccomp mode where these processes are
> prohibited from making these syscalls, then what CRIU does kills the task.
>
> This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
> task from the init user namespace which has CAP_SYS_ADMIN to disable (and
> re-enable) seccomp filters for another task so that they can be
> successfully dumped (and restored).
Do we need to re-enable seccomp if a tracer detaches unexpectedly.
CRIU can be killed and we should try to not affect the task state even
in this case.
Thanks,
Andrew
On Tue, Jun 02, 2015 at 12:36:16PM +0300, Andrey Wagin wrote:
> 2015-06-01 22:28 GMT+03:00 Tycho Andersen <[email protected]>:
> > This patch is the first step in enabling checkpoint/restore of processes
> > with seccomp enabled.
> >
> > One of the things CRIU does while dumping tasks is inject code into them
> > via ptrace to collect information that is only available to the process
> > itself. However, if we are in a seccomp mode where these processes are
> > prohibited from making these syscalls, then what CRIU does kills the task.
> >
> > This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
> > task from the init user namespace which has CAP_SYS_ADMIN to disable (and
> > re-enable) seccomp filters for another task so that they can be
> > successfully dumped (and restored).
>
> Do we need to re-enable seccomp if a tracer detaches unexpectedly.
> CRIU can be killed and we should try to not affect the task state even
> in this case.
Yes, I think Pavel's suggestion on the CRIU list of simply
automatically re-enabling seccomp on ptrace detach is the right way to
go here; it should cover this case. The only question is whether or
not to leave the explicit ability to re-enable seccomp before detach
or not. I don't think it's necessary for CRIU, so perhaps I'll remove
it in the next version.
Tycho
Hi Andy,
On Mon, Jun 01, 2015 at 02:12:33PM -0600, Tycho Andersen wrote:
> On Mon, Jun 01, 2015 at 12:51:12PM -0700, Andy Lutomirski wrote:
> > On Mon, Jun 1, 2015 at 12:47 PM, Tycho Andersen
> > <[email protected]> wrote:
> > > On Mon, Jun 01, 2015 at 12:38:57PM -0700, Andy Lutomirski wrote:
> > >> On Mon, Jun 1, 2015 at 12:28 PM, Tycho Andersen
> > >> > +int resume_seccomp(struct task_struct *task)
> > >> > +{
> > >> > + int ret = -EACCES;
> > >> > +
> > >> > + spin_lock_irq(&task->sighand->siglock);
> > >> > +
> > >> > + if (!capable(CAP_SYS_ADMIN))
> > >> > + goto out;
> > >> > +
> > >> > + task->seccomp.suspended = false;
> > >> > +
> > >> > +#ifdef TIF_NOTSC
> > >> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> > >> > + set_tsk_thread_flag(task, TIF_NOTSC);
> > >> > +#endif
> > >>
> > >> Ditto. Or can the task not be running here?
> > >
> > > It is stopped since ptrace requires it to be stopped; I don't know if
> > > that's enough to guarantee correctness, though. Is there some
> > > additional barrier that is needed?
> >
> > Dunno. Does ptrace actually guarantee that for new operations?
>
> It seems to; it kept giving me ESRCH when I didn't wait for it to
> stop. I'll poke around and see if I can confirm this via the code.
It looks to me like ptrace does guarantee this. The commands that
don't require a task to be stopped are all special cases in the ptrace
syscall definition, and anything that's not one of those is protected
by a ptrace_check_attach(), which IIUC enforces that the task is
stopped.
Tycho
On 06/01, Tycho Andersen wrote:
>
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -25,6 +25,9 @@ struct seccomp_filter;
> struct seccomp {
> int mode;
> struct seccomp_filter *filter;
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> + bool suspended;
> +#endif
Then afaics you need to change copy_seccomp() to clear ->suspended.
At least if the child is not traced.
> @@ -691,6 +697,11 @@ u32 seccomp_phase1(struct seccomp_data *sd)
> int this_syscall = sd ? sd->nr :
> syscall_get_nr(current, task_pt_regs(current));
>
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> + if (unlikely(current->seccomp.suspended))
> + return SECCOMP_PHASE1_OK;
> +#endif
> +
I am wondering if PTRACE_SUSPEND_SECCOMP can just clear/set TIF_SECCOMP.
Of course, it is not that resume_seccomp() can simply do set_tsk_thread_flag,
it should be more careful. And prctl_set_seccomp() paths will need some
changes. Probably not, this would be more complex.
So perhaps it would be better to add PTRACE_O_SUSPEND_SECCOMP? This also
solves the problem with the killed tracer. Except TIF_NOTSC...
But why do we bother to play with TIF_NOTSC, could you explain?
> +int suspend_seccomp(struct task_struct *task)
> +{
> + int ret = -EACCES;
> +
> + spin_lock_irq(&task->sighand->siglock);
> +
> + if (!capable(CAP_SYS_ADMIN))
> + goto out;
I am puzzled ;) Why do we need ->siglock? And even if we need it, why
we can't check CAP_SYS_ADMIN lockless?
And I am not sure I understand why do we need the additional security
check, but I leave this to you and Andy.
If you have the rights to trace this task, then you can do anything
the tracee could do without the filtering.
> +
> + task->seccomp.suspended = true;
> +
> +#ifdef TIF_NOTSC
> + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> + clear_tsk_thread_flag(task, TIF_NOTSC);
> +#endif
> +
> + ret = 0;
> +out:
> + spin_unlock_irq(&task->sighand->siglock);
> +
> + return ret;
> +}
> +
> +int resume_seccomp(struct task_struct *task)
> +{
> + int ret = -EACCES;
> +
> + spin_lock_irq(&task->sighand->siglock);
> +
> + if (!capable(CAP_SYS_ADMIN))
> + goto out;
> +
> + task->seccomp.suspended = false;
> +
> +#ifdef TIF_NOTSC
> + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> + set_tsk_thread_flag(task, TIF_NOTSC);
> +#endif
> +
> + ret = 0;
> +out:
> + spin_unlock_irq(&task->sighand->siglock);
> +
> + return ret;
> +}
> +#endif /* CONFIG_CHECKPOINT_RESTORE */
Well, I do not think we need 2 helpers, just one which takes a boolean
will look better, imo.
Oleg.
On 06/02, Tycho Andersen wrote:
>
> > Do we need to re-enable seccomp if a tracer detaches unexpectedly.
> > CRIU can be killed and we should try to not affect the task state even
> > in this case.
>
> Yes, I think Pavel's suggestion on the CRIU list of simply
> automatically re-enabling seccomp on ptrace detach
But note that you can't enable tsc if the tracer dies, in this case
the tracee can be running.
Otherwise, if we use PTRACE_O_ instead, it goes away automatically if
the tracer dies or does PTRACE_DETACH.
Oleg.
>> +int suspend_seccomp(struct task_struct *task)
>> +{
>> + int ret = -EACCES;
>> +
>> + spin_lock_irq(&task->sighand->siglock);
>> +
>> + if (!capable(CAP_SYS_ADMIN))
>> + goto out;
>
> I am puzzled ;) Why do we need ->siglock? And even if we need it, why
> we can't check CAP_SYS_ADMIN lockless?
>
> And I am not sure I understand why do we need the additional security
> check, but I leave this to you and Andy.
>
> If you have the rights to trace this task, then you can do anything
> the tracee could do without the filtering.
I think _this_ check is required, otherwise the seccomp-ed task (in
filtered mode) fork-s a child, then this child ptrace-attach to parent
(allowed) then suspend its seccomd. And -- we have unpriviledged process
de-seccomped.
-- Pavel
On Tue, Jun 02, 2015 at 10:02:10PM +0300, Pavel Emelyanov wrote:
>
> >> +int suspend_seccomp(struct task_struct *task)
> >> +{
> >> + int ret = -EACCES;
> >> +
> >> + spin_lock_irq(&task->sighand->siglock);
> >> +
> >> + if (!capable(CAP_SYS_ADMIN))
> >> + goto out;
> >
> > I am puzzled ;) Why do we need ->siglock? And even if we need it, why
> > we can't check CAP_SYS_ADMIN lockless?
> >
> > And I am not sure I understand why do we need the additional security
> > check, but I leave this to you and Andy.
> >
> > If you have the rights to trace this task, then you can do anything
> > the tracee could do without the filtering.
>
> I think _this_ check is required, otherwise the seccomp-ed task (in
> filtered mode) fork-s a child, then this child ptrace-attach to parent
> (allowed) then suspend its seccomd. And -- we have unpriviledged process
> de-seccomped.
If you can ptrace(), you can already escape from seccomp. See this
section in man 2 seccomp, in the SECCOMP_RET_TRACE section:
The seccomp check will not be run again after the tracer is
notified. (This means that seccomp-based sandboxes must not
allow use of ptrace(2)—even of other sandboxed processes—
without extreme care; ptracers can use this mechanism to
escape from the seccomp sandbox.)
(But I think there have been discussions about changing that behavior in
the future?)
On Tue, Jun 2, 2015 at 12:02 PM, Pavel Emelyanov <[email protected]> wrote:
>
>>> +int suspend_seccomp(struct task_struct *task)
>>> +{
>>> + int ret = -EACCES;
>>> +
>>> + spin_lock_irq(&task->sighand->siglock);
>>> +
>>> + if (!capable(CAP_SYS_ADMIN))
>>> + goto out;
>>
>> I am puzzled ;) Why do we need ->siglock? And even if we need it, why
>> we can't check CAP_SYS_ADMIN lockless?
>>
>> And I am not sure I understand why do we need the additional security
>> check, but I leave this to you and Andy.
>>
>> If you have the rights to trace this task, then you can do anything
>> the tracee could do without the filtering.
>
> I think _this_ check is required, otherwise the seccomp-ed task (in
> filtered mode) fork-s a child, then this child ptrace-attach to parent
> (allowed) then suspend its seccomd. And -- we have unpriviledged process
> de-seccomped.
The interaction between ptrace and seccomp is already highly screwy,
and seccomp users can often escape using ptrace. That being said, I'd
rather not further enshrine it.
It might be worth changing the check to verify that the task trying to
suspect seccomp isn't itself subject to seccomp. That should get most
of the safety. We are already kind of screwed if someone tries to run
criu under seccomp due to unresolved nesting issues.
--Andy
On 06/02, Pavel Emelyanov wrote:
>
> > And I am not sure I understand why do we need the additional security
> > check, but I leave this to you and Andy.
> >
> > If you have the rights to trace this task, then you can do anything
> > the tracee could do without the filtering.
>
> I think _this_ check is required, otherwise the seccomp-ed task (in
> filtered mode) fork-s a child, then this child ptrace-attach to parent
> (allowed) then suspend its seccomd.
If you force (hack) that task to do this. And if the seccomp-ed task
does this by its own we do not care.
> And -- we have unpriviledged process
> de-seccomped.
Heh. The case when the priviledged CAP_SYS_ADMIN process escapes the
filtering is much worse I think ;)
But as I said I will nott argue, just I think this needs a bit of
documentantion. And I agree in advance with something like "better
be safe than sorry, we can always remove this later" comment or a
note in the changelog.
Oleg.
On Tue, Jun 02, 2015 at 08:28:29PM +0200, Oleg Nesterov wrote:
> On 06/01, Tycho Andersen wrote:
> >
> > --- a/include/linux/seccomp.h
> > +++ b/include/linux/seccomp.h
> > @@ -25,6 +25,9 @@ struct seccomp_filter;
> > struct seccomp {
> > int mode;
> > struct seccomp_filter *filter;
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > + bool suspended;
> > +#endif
>
> Then afaics you need to change copy_seccomp() to clear ->suspended.
> At least if the child is not traced.
Yes, thank you.
> > @@ -691,6 +697,11 @@ u32 seccomp_phase1(struct seccomp_data *sd)
> > int this_syscall = sd ? sd->nr :
> > syscall_get_nr(current, task_pt_regs(current));
> >
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > + if (unlikely(current->seccomp.suspended))
> > + return SECCOMP_PHASE1_OK;
> > +#endif
> > +
>
> I am wondering if PTRACE_SUSPEND_SECCOMP can just clear/set TIF_SECCOMP.
> Of course, it is not that resume_seccomp() can simply do set_tsk_thread_flag,
> it should be more careful. And prctl_set_seccomp() paths will need some
> changes. Probably not, this would be more complex.
>
> So perhaps it would be better to add PTRACE_O_SUSPEND_SECCOMP? This also
> solves the problem with the killed tracer. Except TIF_NOTSC...
>
> But why do we bother to play with TIF_NOTSC, could you explain?
The procedure for restoring is to call seccomp suspend, restore the
seccomp filters (and potentially other stuff), and then resume them at
the end. If the other stuff happens to use RDTSC, the process gets
killed because TIF_NOTSC has been set. It seems to me that since
seccomp is the one setting TIF_NOTSC, suspending seccomp should unset
it.
We can work around this in criu by doing the seccomp restore as the
very last thing before the final sigreturn, but that seems like the
seccomp suspend API is incomplete, IMO. However, since both you and
Andy complained, perhaps I should remove it :)
> > +int suspend_seccomp(struct task_struct *task)
> > +{
> > + int ret = -EACCES;
> > +
> > + spin_lock_irq(&task->sighand->siglock);
> > +
> > + if (!capable(CAP_SYS_ADMIN))
> > + goto out;
>
> I am puzzled ;) Why do we need ->siglock? And even if we need it, why
> we can't check CAP_SYS_ADMIN lockless?
I'm not sure, other pieces of the seccomp code
(seccomp_set_mode_strict() and friends) take the lock when
manipulating the seccomp mode, so I just followed suit here. You're
right that we can do the capability check lockless, I'll make that
change.
> And I am not sure I understand why do we need the additional security
> check, but I leave this to you and Andy.
Yes, it is required to prevent the case Pavel mentions (although there
are other ways to get around seccomp with ptrace, the goal here is to
not depend on that behavior so that when it is eventually fixed this
doesn't break).
> If you have the rights to trace this task, then you can do anything
> the tracee could do without the filtering.
>
> > +
> > + task->seccomp.suspended = true;
> > +
> > +#ifdef TIF_NOTSC
> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> > + clear_tsk_thread_flag(task, TIF_NOTSC);
> > +#endif
> > +
> > + ret = 0;
> > +out:
> > + spin_unlock_irq(&task->sighand->siglock);
> > +
> > + return ret;
> > +}
> > +
> > +int resume_seccomp(struct task_struct *task)
> > +{
> > + int ret = -EACCES;
> > +
> > + spin_lock_irq(&task->sighand->siglock);
> > +
> > + if (!capable(CAP_SYS_ADMIN))
> > + goto out;
> > +
> > + task->seccomp.suspended = false;
> > +
> > +#ifdef TIF_NOTSC
> > + if (task->seccomp.mode == SECCOMP_MODE_STRICT)
> > + set_tsk_thread_flag(task, TIF_NOTSC);
> > +#endif
> > +
> > + ret = 0;
> > +out:
> > + spin_unlock_irq(&task->sighand->siglock);
> > +
> > + return ret;
> > +}
> > +#endif /* CONFIG_CHECKPOINT_RESTORE */
>
> Well, I do not think we need 2 helpers, just one which takes a boolean
> will look better, imo.
Ok, this has changed slightly with the "always resume on
detach/unlink" change Pavel suggested, but I'll see if I can find a
nice way to merge them.
Thanks for taking a look.
Tycho
> Oleg.
>
On Tue, Jun 02, 2015 at 12:27:42PM -0700, Andy Lutomirski wrote:
>
> It might be worth changing the check to verify that the task trying to
> suspect seccomp isn't itself subject to seccomp. That should get most
> of the safety.
Sounds good, I'll make the change.
Tycho
On Tue, Jun 02, 2015 at 08:48:48PM +0200, Oleg Nesterov wrote:
> On 06/02, Tycho Andersen wrote:
> >
> > > Do we need to re-enable seccomp if a tracer detaches unexpectedly.
> > > CRIU can be killed and we should try to not affect the task state even
> > > in this case.
> >
> > Yes, I think Pavel's suggestion on the CRIU list of simply
> > automatically re-enabling seccomp on ptrace detach
>
> But note that you can't enable tsc if the tracer dies, in this case
> the tracee can be running.
Yes, this does complicate things. I think I'll get rid of the TSC
manipulation, since it's not clear to me how to resolve this. (I do
think it makes the API slightly incomplete, though, so suggestions how
to resolve it would be welcome.)
> Otherwise, if we use PTRACE_O_ instead, it goes away automatically if
> the tracer dies or does PTRACE_DETACH.
IIRC the flag goes away, but we still have to do something in
__ptrace_unlink to clear the seccomp suspended, so I'm not sure if the
automatic-ness helps us.
Tycho
On 06/03, Tycho Andersen wrote:
>
> On Tue, Jun 02, 2015 at 08:28:29PM +0200, Oleg Nesterov wrote:
> > On 06/01, Tycho Andersen wrote:
> > >
> > > --- a/include/linux/seccomp.h
> > > +++ b/include/linux/seccomp.h
> > > @@ -25,6 +25,9 @@ struct seccomp_filter;
> > > struct seccomp {
> > > int mode;
> > > struct seccomp_filter *filter;
> > > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > > + bool suspended;
> > > +#endif
> >
> > Then afaics you need to change copy_seccomp() to clear ->suspended.
> > At least if the child is not traced.
>
> Yes, thank you.
And if we really need to play with TIF_NOTSC, then copy_seccomp() should
set it too if SUSPEND has cleared in parent's flags.
> > But why do we bother to play with TIF_NOTSC, could you explain?
>
> The procedure for restoring is to call seccomp suspend, restore the
> seccomp filters (and potentially other stuff), and then resume them at
> the end. If the other stuff happens to use RDTSC, the process gets
> killed because TIF_NOTSC has been set.
This is clear, just I thought that CRIU doesn't use rdtsc on behalf of
the traced task...
> We can work around this in criu by doing the seccomp restore as the
> very last thing before the final sigreturn,
Not sure I understand... You need to suspend at "dump" time too afaics,
otherwise, say, syscall_seized() can fail because this syscall is nacked
by seccomp?
> but that seems like the
> seccomp suspend API is incomplete, IMO. However, since both you and
> Andy complained, perhaps I should remove it :)
Well, this is up to you ;)
But. Note that a process can also disable TSC via PR_SET_TSC. So if
dump or restore can't work without enabling TSC you probably want to
handle this case too.
And this makes me think that this needs a separate interface. I dunno.
> > And I am not sure I understand why do we need the additional security
> > check, but I leave this to you and Andy.
>
> Yes, it is required to prevent the case Pavel mentions (although there
> are other ways to get around seccomp with ptrace, the goal here is to
> not depend on that behavior so that when it is eventually fixed this
> doesn't break).
I still do not think it makes any sense. again, if you can trace this
process then you can disable the filtering anyway. Lets assume that
seccomp_run_filters() acks, say, sys_getpid(). Or fork() in the case
Pavel mentioned, this doesn't matter. Now you can force the tracee to
call this syscall, then change syscall_nr.
But as I said I won't argue, please forget.
> Ok, this has changed slightly with the "always resume on
> detach/unlink" change Pavel suggested,
To remind, it is not easy to restore TIF_NOTSC if the tracer dies.
PTRACE_DETACH can do this because the tracee can't be woken up. But
personally I'd prefer the expicit RESUME request rather than "rely
on PTRACE_DETACH".
If we avoid the TSC games, then, again, please consider
PTRACE_O_SECCOMP_DISABLE. This will solve the problems with
fork/detach/tracer-death automatically.
Oleg.
On 06/03, Tycho Andersen wrote:
>
> On Tue, Jun 02, 2015 at 08:48:48PM +0200, Oleg Nesterov wrote:
>
> > Otherwise, if we use PTRACE_O_ instead, it goes away automatically if
> > the tracer dies or does PTRACE_DETACH.
>
> IIRC the flag goes away, but we still have to do something in
> __ptrace_unlink to clear the seccomp suspended, so I'm not sure if the
> automatic-ness helps us.
But we do not need seccomp->suspended at all?
Unless I missed something PTRACE_O_ needs a one-liner patch (ignoring
the defines in include files),
--- x/kernel/seccomp.c
+++ x/kernel/seccomp.c
@@ -692,6 +692,9 @@ u32 seccomp_phase1(struct seccomp_data *
int this_syscall = sd ? sd->nr :
syscall_get_nr(current, task_pt_regs(current));
+ if (unlikely(current->ptrace & PT_NAME_OF_THIS_OPTION))
+ return OK;
+
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
OK, and the same check in secure_computing_strict().
No?
Oleg.
On Wed, Jun 03, 2015 at 06:54:51PM +0200, Oleg Nesterov wrote:
> On 06/03, Tycho Andersen wrote:
> >
> > On Tue, Jun 02, 2015 at 08:48:48PM +0200, Oleg Nesterov wrote:
> >
> > > Otherwise, if we use PTRACE_O_ instead, it goes away automatically if
> > > the tracer dies or does PTRACE_DETACH.
> >
> > IIRC the flag goes away, but we still have to do something in
> > __ptrace_unlink to clear the seccomp suspended, so I'm not sure if the
> > automatic-ness helps us.
>
> But we do not need seccomp->suspended at all?
>
> Unless I missed something PTRACE_O_ needs a one-liner patch (ignoring
> the defines in include files),
>
> --- x/kernel/seccomp.c
> +++ x/kernel/seccomp.c
> @@ -692,6 +692,9 @@ u32 seccomp_phase1(struct seccomp_data *
> int this_syscall = sd ? sd->nr :
> syscall_get_nr(current, task_pt_regs(current));
>
> + if (unlikely(current->ptrace & PT_NAME_OF_THIS_OPTION))
> + return OK;
> +
> switch (mode) {
> case SECCOMP_MODE_STRICT:
> __secure_computing_strict(this_syscall); /* may call do_exit */
>
>
> OK, and the same check in secure_computing_strict().
>
> No?
I see, I misunderstood. Yes, this is a lot nicer, thank you.
Tycho
Hi Oleg,
On Wed, Jun 03, 2015 at 06:41:21PM +0200, Oleg Nesterov wrote:
> On 06/03, Tycho Andersen wrote:
> >
> > On Tue, Jun 02, 2015 at 08:28:29PM +0200, Oleg Nesterov wrote:
> > > On 06/01, Tycho Andersen wrote:
> > > >
> > > > --- a/include/linux/seccomp.h
> > > > +++ b/include/linux/seccomp.h
> > > > @@ -25,6 +25,9 @@ struct seccomp_filter;
> > > > struct seccomp {
> > > > int mode;
> > > > struct seccomp_filter *filter;
> > > > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > > > + bool suspended;
> > > > +#endif
> > >
> > > Then afaics you need to change copy_seccomp() to clear ->suspended.
> > > At least if the child is not traced.
> >
> > Yes, thank you.
>
> And if we really need to play with TIF_NOTSC, then copy_seccomp() should
> set it too if SUSPEND has cleared in parent's flags.
>
> > > But why do we bother to play with TIF_NOTSC, could you explain?
> >
> > The procedure for restoring is to call seccomp suspend, restore the
> > seccomp filters (and potentially other stuff), and then resume them at
> > the end. If the other stuff happens to use RDTSC, the process gets
> > killed because TIF_NOTSC has been set.
>
> This is clear, just I thought that CRIU doesn't use rdtsc on behalf of
> the traced task...
Unfortunately it does (I think to print timestamps in logs, although I
didn't look closely).
> > We can work around this in criu by doing the seccomp restore as the
> > very last thing before the final sigreturn,
>
> Not sure I understand... You need to suspend at "dump" time too afaics,
> otherwise, say, syscall_seized() can fail because this syscall is nacked
> by seccomp?
Yes, but thankfully criu's dump code doesn't seem to use RDTSC, so it
just happens to work. Of course, if the dump code starts to use it,
we'll have to revisit this change.
> > but that seems like the
> > seccomp suspend API is incomplete, IMO. However, since both you and
> > Andy complained, perhaps I should remove it :)
>
> Well, this is up to you ;)
>
> But. Note that a process can also disable TSC via PR_SET_TSC. So if
> dump or restore can't work without enabling TSC you probably want to
> handle this case too.
>
> And this makes me think that this needs a separate interface. I dunno.
I guess if we want to disable the TSC we need to save the state
irrespective of seccomp. I think I will ignore this for now, since we
can work around it in CRIU, and hope that we don't have to revisit it
because of the complexity r.e. tracer dying you mention below.
Tycho
> > > And I am not sure I understand why do we need the additional security
> > > check, but I leave this to you and Andy.
> >
> > Yes, it is required to prevent the case Pavel mentions (although there
> > are other ways to get around seccomp with ptrace, the goal here is to
> > not depend on that behavior so that when it is eventually fixed this
> > doesn't break).
>
> I still do not think it makes any sense. again, if you can trace this
> process then you can disable the filtering anyway. Lets assume that
> seccomp_run_filters() acks, say, sys_getpid(). Or fork() in the case
> Pavel mentioned, this doesn't matter. Now you can force the tracee to
> call this syscall, then change syscall_nr.
>
> But as I said I won't argue, please forget.
>
> > Ok, this has changed slightly with the "always resume on
> > detach/unlink" change Pavel suggested,
>
> To remind, it is not easy to restore TIF_NOTSC if the tracer dies.
>
> PTRACE_DETACH can do this because the tracee can't be woken up. But
> personally I'd prefer the expicit RESUME request rather than "rely
> on PTRACE_DETACH".
>
> If we avoid the TSC games, then, again, please consider
> PTRACE_O_SECCOMP_DISABLE. This will solve the problems with
> fork/detach/tracer-death automatically.
>
> Oleg.
>
On Wed, Jun 3, 2015 at 9:41 AM, Oleg Nesterov <[email protected]> wrote:
> On 06/03, Tycho Andersen wrote:
>>
>> On Tue, Jun 02, 2015 at 08:28:29PM +0200, Oleg Nesterov wrote:
>> > On 06/01, Tycho Andersen wrote:
>> > >
>> > > --- a/include/linux/seccomp.h
>> > > +++ b/include/linux/seccomp.h
>> > > @@ -25,6 +25,9 @@ struct seccomp_filter;
>> > > struct seccomp {
>> > > int mode;
>> > > struct seccomp_filter *filter;
>> > > +#ifdef CONFIG_CHECKPOINT_RESTORE
>> > > + bool suspended;
>> > > +#endif
>> >
>> > Then afaics you need to change copy_seccomp() to clear ->suspended.
>> > At least if the child is not traced.
>>
>> Yes, thank you.
>
> And if we really need to play with TIF_NOTSC, then copy_seccomp() should
> set it too if SUSPEND has cleared in parent's flags.
>
>> > But why do we bother to play with TIF_NOTSC, could you explain?
>>
>> The procedure for restoring is to call seccomp suspend, restore the
>> seccomp filters (and potentially other stuff), and then resume them at
>> the end. If the other stuff happens to use RDTSC, the process gets
>> killed because TIF_NOTSC has been set.
>
> This is clear, just I thought that CRIU doesn't use rdtsc on behalf of
> the traced task...
>
>> We can work around this in criu by doing the seccomp restore as the
>> very last thing before the final sigreturn,
>
> Not sure I understand... You need to suspend at "dump" time too afaics,
> otherwise, say, syscall_seized() can fail because this syscall is nacked
> by seccomp?
>
>> but that seems like the
>> seccomp suspend API is incomplete, IMO. However, since both you and
>> Andy complained, perhaps I should remove it :)
>
> Well, this is up to you ;)
>
> But. Note that a process can also disable TSC via PR_SET_TSC. So if
> dump or restore can't work without enabling TSC you probably want to
> handle this case too.
>
> And this makes me think that this needs a separate interface. I dunno.
>
True. Or we could keep track of all the reasons the TSC is off.
--Andy
On Wed, Jun 03, 2015 at 06:54:51PM +0200, Oleg Nesterov wrote:
> On 06/03, Tycho Andersen wrote:
> >
> > On Tue, Jun 02, 2015 at 08:48:48PM +0200, Oleg Nesterov wrote:
> >
> > > Otherwise, if we use PTRACE_O_ instead, it goes away automatically if
> > > the tracer dies or does PTRACE_DETACH.
> >
> > IIRC the flag goes away, but we still have to do something in
> > __ptrace_unlink to clear the seccomp suspended, so I'm not sure if the
> > automatic-ness helps us.
>
> But we do not need seccomp->suspended at all?
>
> Unless I missed something PTRACE_O_ needs a one-liner patch (ignoring
> the defines in include files),
>
> --- x/kernel/seccomp.c
> +++ x/kernel/seccomp.c
> @@ -692,6 +692,9 @@ u32 seccomp_phase1(struct seccomp_data *
> int this_syscall = sd ? sd->nr :
> syscall_get_nr(current, task_pt_regs(current));
>
> + if (unlikely(current->ptrace & PT_NAME_OF_THIS_OPTION))
> + return OK;
> +
> switch (mode) {
> case SECCOMP_MODE_STRICT:
> __secure_computing_strict(this_syscall); /* may call do_exit */
>
>
> OK, and the same check in secure_computing_strict().
>
> No?
One problem with this is that we still incur the runtime overhead of
checking, which I guess is a question of ptrace vs. seccomp
complexity.
Andy had suggested multiplexing seccomp->suspended into seccomp->mode
directly to avoid this, but the above still requires a check. We could
play with TIF_SECCOMP, but that has the same problems as playing with
TIF_NOTSC.
Thoughts?
Tycho