Hello,
Those are two longstanding updates for seccomp that I need applied ASAP.
It's not the first time I submit them, hope they go in this time, for the
happiness of all the anti-seccomp and in turn anti-cpushare folks out there.
1) this reduces the number of bytes that seccomp takes when enabled (in ram
terms)
2) this makes seccomp absolutely zerocost at runtime (in cpu terms, even for
the scheduler and not only for the syscalls)
As soon as the money is allowed in (and no it's not a technical problem
anymore, the code is completely finished and tested in the sandbox since two
weeks ago), the userland package may become a bit more spread than only in
debian, and I hardcoded those prctl in the userland side, so I wouldn't like
breakage to spread if somebody registers those prctl numbers in the kernel for
something else.
Thanks.
# HG changeset patch
# User Andrea Arcangeli <[email protected]>
# Date 1181833362 -7200
# Node ID 8d75c4aa7185fcdcc2e99f3fe0f1ec68cbd78a43
# Parent 6416ccf1c2fa5f990695b38c8a692fa9e109a223
move seccomp from /proc to a prctl
This reduces the memory footprint and it enforces that only the current
task can enable seccomp on itself (this is a requirement for a
strightforward [modulo preempt ;) ] TIF_NOTSC implementation).
Signed-off-by: Andrea Arcangeli <[email protected]>
diff --git a/fs/proc/base.c b/fs/proc/base.c
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,7 +67,6 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
-#include <linux/seccomp.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
@@ -811,71 +810,6 @@ static const struct file_operations proc
.write = proc_loginuid_write,
};
#endif
-
-#ifdef CONFIG_SECCOMP
-static ssize_t seccomp_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
- char __buf[20];
- size_t len;
-
- if (!tsk)
- return -ESRCH;
- /* no need to print the trailing zero, so use only len */
- len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
- put_task_struct(tsk);
-
- return simple_read_from_buffer(buf, count, ppos, __buf, len);
-}
-
-static ssize_t seccomp_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
- char __buf[20], *end;
- unsigned int seccomp_mode;
- ssize_t result;
-
- result = -ESRCH;
- if (!tsk)
- goto out_no_task;
-
- /* can set it only once to be even more secure */
- result = -EPERM;
- if (unlikely(tsk->seccomp.mode))
- goto out;
-
- result = -EFAULT;
- memset(__buf, 0, sizeof(__buf));
- count = min(count, sizeof(__buf) - 1);
- if (copy_from_user(__buf, buf, count))
- goto out;
-
- seccomp_mode = simple_strtoul(__buf, &end, 0);
- if (*end == '\n')
- end++;
- result = -EINVAL;
- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
- tsk->seccomp.mode = seccomp_mode;
- set_tsk_thread_flag(tsk, TIF_SECCOMP);
- } else
- goto out;
- result = -EIO;
- if (unlikely(!(end - __buf)))
- goto out;
- result = end - __buf;
-out:
- put_task_struct(tsk);
-out_no_task:
- return result;
-}
-
-static const struct file_operations proc_seccomp_operations = {
- .read = seccomp_read,
- .write = seccomp_write,
-};
-#endif /* CONFIG_SECCOMP */
#ifdef CONFIG_FAULT_INJECTION
static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
@@ -1971,9 +1905,6 @@ static const struct pid_entry tgid_base_
REG("numa_maps", S_IRUGO, numa_maps),
#endif
REG("mem", S_IRUSR|S_IWUSR, mem),
-#ifdef CONFIG_SECCOMP
- REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
-#endif
LNK("cwd", cwd),
LNK("root", root),
LNK("exe", exe),
@@ -2255,9 +2186,6 @@ static const struct pid_entry tid_base_s
REG("numa_maps", S_IRUGO, numa_maps),
#endif
REG("mem", S_IRUSR|S_IWUSR, mem),
-#ifdef CONFIG_SECCOMP
- REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
-#endif
LNK("cwd", cwd),
LNK("root", root),
LNK("exe", exe),
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -59,4 +59,8 @@
# define PR_ENDIAN_LITTLE 1 /* True little endian mode */
# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */
+/* Get/set process seccomp mode */
+#define PR_GET_SECCOMP 21
+#define PR_SET_SECCOMP 22
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,8 +3,6 @@
#ifdef CONFIG_SECCOMP
-
-#define NR_SECCOMP_MODES 1
#include <linux/thread_info.h>
#include <asm/seccomp.h>
@@ -23,6 +21,9 @@ static inline int has_secure_computing(s
return unlikely(test_ti_thread_flag(ti, TIF_SECCOMP));
}
+extern long prctl_get_seccomp(void);
+extern long prctl_set_seccomp(unsigned long);
+
#else /* CONFIG_SECCOMP */
typedef struct { } seccomp_t;
@@ -34,6 +35,16 @@ static inline int has_secure_computing(s
return 0;
}
+static inline long prctl_get_seccomp(void)
+{
+ return -EINVAL;
+}
+
+static inline long prctl_set_seccomp(unsigned long arg2)
+{
+ return -EINVAL;
+}
+
#endif /* CONFIG_SECCOMP */
#endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -10,6 +10,7 @@
#include <linux/sched.h>
/* #define SECCOMP_DEBUG 1 */
+#define NR_SECCOMP_MODES 1
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -54,3 +55,28 @@ void __secure_computing(int this_syscall
#endif
do_exit(SIGKILL);
}
+
+long prctl_get_seccomp(void)
+{
+ return current->seccomp.mode;
+}
+
+long prctl_set_seccomp(unsigned long seccomp_mode)
+{
+ long ret;
+
+ /* can set it only once to be even more secure */
+ ret = -EPERM;
+ if (unlikely(current->seccomp.mode))
+ goto out;
+
+ ret = -EINVAL;
+ if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+ current->seccomp.mode = seccomp_mode;
+ set_thread_flag(TIF_SECCOMP);
+ ret = 0;
+ }
+
+ out:
+ return ret;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -31,6 +31,7 @@
#include <linux/cn_proc.h>
#include <linux/getcpu.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/seccomp.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -2241,6 +2242,13 @@ asmlinkage long sys_prctl(int option, un
error = SET_ENDIAN(current, arg2);
break;
+ case PR_GET_SECCOMP:
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+ error = prctl_set_seccomp(arg2);
+ break;
+
default:
error = -EINVAL;
break;
# HG changeset patch
# User Andrea Arcangeli <[email protected]>
# Date 1181833362 -7200
# Node ID a1896c3e29c7fb4dc1811f24e9f5cfc8dcbad419
# Parent 8d75c4aa7185fcdcc2e99f3fe0f1ec68cbd78a43
make seccomp zerocost in schedule
This follows a suggestion from Chuck Ebbert on how to make seccomp
absolutely zerocost in schedule too. The only remaining footprint of
seccomp is in terms of the bzImage size that becomes a few bytes
(perhaps even a few kbytes) larger, measure it if you care in the
embedded.
Signed-off-by: Andrea Arcangeli <[email protected]>
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -538,8 +538,31 @@ int dump_task_regs(struct task_struct *t
return 1;
}
-static noinline void __switch_to_xtra(struct task_struct *next_p,
- struct tss_struct *tss)
+#ifdef CONFIG_SECCOMP
+void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+}
+void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+#endif /* CONFIG_SECCOMP */
+
+static noinline void
+__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+ struct tss_struct *tss)
{
struct thread_struct *next;
@@ -555,6 +578,17 @@ static noinline void __switch_to_xtra(st
set_debugreg(next->debugreg[7], 7);
}
+#ifdef CONFIG_SECCOMP
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+#endif
+
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Disable the bitmap via an invalid offset. We still cache
@@ -583,33 +617,6 @@ static noinline void __switch_to_xtra(st
* perform any I/O during its timeslice.
*/
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-}
-
-/*
- * This function selects if the context switch from prev to next
- * has to tweak the TSC disable bit in the cr4.
- */
-static inline void disable_tsc(struct task_struct *prev_p,
- struct task_struct *next_p)
-{
- struct thread_info *prev, *next;
-
- /*
- * gcc should eliminate the ->thread_info dereference if
- * has_secure_computing returns 0 at compile time (SECCOMP=n).
- */
- prev = task_thread_info(prev_p);
- next = task_thread_info(next_p);
-
- if (has_secure_computing(prev) || has_secure_computing(next)) {
- /* slow path here */
- if (has_secure_computing(prev) &&
- !has_secure_computing(next)) {
- write_cr4(read_cr4() & ~X86_CR4_TSD);
- } else if (!has_secure_computing(prev) &&
- has_secure_computing(next))
- write_cr4(read_cr4() | X86_CR4_TSD);
- }
}
/*
@@ -689,11 +696,9 @@ struct task_struct fastcall * __switch_t
/*
* Now maybe handle debug registers and/or IO bitmaps
*/
- if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
- || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)))
- __switch_to_xtra(next_p, tss);
-
- disable_tsc(prev_p, next_p);
+ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
+ task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
+ __switch_to_xtra(prev_p, next_p, tss);
/*
* Leave lazy mode, flushing any hypercalls made here.
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -227,6 +227,10 @@ extern int bootloader_type;
#define HAVE_ARCH_PICK_MMAP_LAYOUT
+extern void hard_disable_TSC(void);
+extern void disable_TSC(void);
+extern void hard_enable_TSC(void);
+
/*
* Size of io_bitmap.
*/
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -137,6 +137,7 @@ static inline struct thread_info *curren
#define TIF_DEBUG 17 /* uses debug registers */
#define TIF_IO_BITMAP 18 /* uses I/O bitmap */
#define TIF_FREEZE 19 /* is freezing for suspend */
+#define TIF_NOTSC 20 /* TSC is not accessible in userland */
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
@@ -151,6 +152,7 @@ static inline struct thread_info *curren
#define _TIF_DEBUG (1<<TIF_DEBUG)
#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP)
#define _TIF_FREEZE (1<<TIF_FREEZE)
+#define _TIF_NOTSC (1<<TIF_NOTSC)
/* work to do on interrupt/exception return */
#define _TIF_WORK_MASK \
@@ -160,7 +162,8 @@ static inline struct thread_info *curren
#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
/* flags to check in __switch_to() */
-#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)
+#define _TIF_WORK_CTXSW_NEXT (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUG)
+#define _TIF_WORK_CTXSW_PREV (_TIF_IO_BITMAP | _TIF_NOTSC)
/*
* Thread-synchronous status.
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -16,11 +16,6 @@ static inline void secure_computing(int
__secure_computing(this_syscall);
}
-static inline int has_secure_computing(struct thread_info *ti)
-{
- return unlikely(test_ti_thread_flag(ti, TIF_SECCOMP));
-}
-
extern long prctl_get_seccomp(void);
extern long prctl_set_seccomp(unsigned long);
@@ -29,11 +24,6 @@ typedef struct { } seccomp_t;
typedef struct { } seccomp_t;
#define secure_computing(x) do { } while (0)
-/* static inline to preserve typechecking */
-static inline int has_secure_computing(struct thread_info *ti)
-{
- return 0;
-}
static inline long prctl_get_seccomp(void)
{
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -74,6 +74,9 @@ long prctl_set_seccomp(unsigned long sec
if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
current->seccomp.mode = seccomp_mode;
set_thread_flag(TIF_SECCOMP);
+#ifdef TIF_NOTSC
+ disable_TSC();
+#endif
ret = 0;
}