Some embedded systems can do without the prctl syscall, saving some space.
This also avoids regular increases in tinyconfig size as people add more
non-optional functionality to prctl (observed via the 0-day kernel
infrastructure).
The first patch moves prctl and all its helpers into a separate source file,
kernel/prctl.c, to allow making it optional via the Makefile without any
ifdefs. The second patch adds the Kconfig option to make prctl optional.
Josh Triplett (2):
kernel: Move prctl and helpers from kernel/sys.c to new kernel/prctl.c
kernel: Support compiling out the prctl syscall
init/Kconfig | 12 +-
kernel/Makefile | 1 +-
kernel/prctl.c | 684 +++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sys.c | 672 +------------------------------------------------
kernel/sys_ni.c | 1 +-
5 files changed, 698 insertions(+), 672 deletions(-)
create mode 100644 kernel/prctl.c
base-commit: e3a00f68e426df24a5fb98956a1bd1b23943aa1e
--
git-series 0.8.11
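Move the prctl syscall and its static helpers from kernel/sys.c into a new file, kernel/prctl.c, which is built unconditionally for now. This is code movement only; no functional change is intended. This prepares for making prctl optional.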
Signed-off-by: Josh Triplett <[email protected]>
---
kernel/Makefile | 2 +-
kernel/prctl.c | 684 +++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sys.c | 672 +------------------------------------------------
3 files changed, 685 insertions(+), 673 deletions(-)
create mode 100644 kernel/prctl.c
diff --git a/kernel/Makefile b/kernel/Makefile
index eb26e12..37c6d4c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o
+ async.o range.o smpboot.o ucount.o prctl.o
obj-$(CONFIG_MULTIUSER) += groups.o
diff --git a/kernel/prctl.c b/kernel/prctl.c
new file mode 100644
index 0000000..f7156e4
--- /dev/null
+++ b/kernel/prctl.c
@@ -0,0 +1,684 @@
+#include <linux/sched.h>
+#include <linux/cn_proc.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/prctl.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+#include <asm/processor.h>
+
+#ifndef SET_UNALIGN_CTL
+# define SET_UNALIGN_CTL(a, b) (-EINVAL)
+#endif
+#ifndef GET_UNALIGN_CTL
+# define GET_UNALIGN_CTL(a, b) (-EINVAL)
+#endif
+#ifndef SET_FPEMU_CTL
+# define SET_FPEMU_CTL(a, b) (-EINVAL)
+#endif
+#ifndef GET_FPEMU_CTL
+# define GET_FPEMU_CTL(a, b) (-EINVAL)
+#endif
+#ifndef SET_FPEXC_CTL
+# define SET_FPEXC_CTL(a, b) (-EINVAL)
+#endif
+#ifndef GET_FPEXC_CTL
+# define GET_FPEXC_CTL(a, b) (-EINVAL)
+#endif
+#ifndef GET_ENDIAN
+# define GET_ENDIAN(a, b) (-EINVAL)
+#endif
+#ifndef SET_ENDIAN
+# define SET_ENDIAN(a, b) (-EINVAL)
+#endif
+#ifndef GET_TSC_CTL
+# define GET_TSC_CTL(a) (-EINVAL)
+#endif
+#ifndef SET_TSC_CTL
+# define SET_TSC_CTL(a) (-EINVAL)
+#endif
+#ifndef MPX_ENABLE_MANAGEMENT
+# define MPX_ENABLE_MANAGEMENT() (-EINVAL)
+#endif
+#ifndef MPX_DISABLE_MANAGEMENT
+# define MPX_DISABLE_MANAGEMENT() (-EINVAL)
+#endif
+#ifndef GET_FP_MODE
+# define GET_FP_MODE(a) (-EINVAL)
+#endif
+#ifndef SET_FP_MODE
+# define SET_FP_MODE(a,b) (-EINVAL)
+#endif
+
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+{
+ struct fd exe;
+ struct file *old_exe, *exe_file;
+ struct inode *inode;
+ int err;
+
+ exe = fdget(fd);
+ if (!exe.file)
+ return -EBADF;
+
+ inode = file_inode(exe.file);
+
+ /*
+ * Because the original mm->exe_file points to an executable file,
+ * make sure that the new one is a regular, executable file as well,
+ * to avoid breaking the overall picture.
+ */
+ err = -EACCES;
+ if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
+ goto exit;
+
+ err = inode_permission(inode, MAY_EXEC);
+ if (err)
+ goto exit;
+
+ /*
+ * Forbid changing mm->exe_file while the old file is still mapped.
+ */
+ exe_file = get_mm_exe_file(mm);
+ err = -EBUSY;
+ if (exe_file) {
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &exe_file->f_path))
+ goto exit_err;
+ }
+
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
+ }
+
+ /*
+ * The symlink can be changed only once, just to disallow arbitrary
+ * transitions that malicious software might bring in. This means
+ * one could take a snapshot of all running processes and monitor
+ * /proc/pid/exe changes to notice unusual activity if needed.
+ */
+ err = -EPERM;
+ if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
+ goto exit;
+
+ err = 0;
+ /* set the new file, lockless; take a reference for mm->exe_file first */
+ get_file(exe.file);
+ old_exe = xchg(&mm->exe_file, exe.file);
+ if (old_exe)
+ fput(old_exe);
+exit:
+ fdput(exe);
+ return err;
+exit_err: /* still holding mmap_sem for read and the exe_file reference */
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
+ goto exit;
+}
+
+/*
+ * WARNING: no capability is required here, so be very careful about
+ * what userspace may modify. Returns 0 if the map is sane, else -EINVAL.
+ */
+static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+{
+ unsigned long mmap_max_addr = TASK_SIZE;
+ struct mm_struct *mm = current->mm;
+ int error = -EINVAL, i;
+
+ static const unsigned char offsets[] = {
+ offsetof(struct prctl_mm_map, start_code),
+ offsetof(struct prctl_mm_map, end_code),
+ offsetof(struct prctl_mm_map, start_data),
+ offsetof(struct prctl_mm_map, end_data),
+ offsetof(struct prctl_mm_map, start_brk),
+ offsetof(struct prctl_mm_map, brk),
+ offsetof(struct prctl_mm_map, start_stack),
+ offsetof(struct prctl_mm_map, arg_start),
+ offsetof(struct prctl_mm_map, arg_end),
+ offsetof(struct prctl_mm_map, env_start),
+ offsetof(struct prctl_mm_map, env_end),
+ };
+
+ /*
+ * Make sure none of the members lies outside
+ * of the allowed address space.
+ */
+ for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+ u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
+
+ if ((unsigned long)val >= mmap_max_addr ||
+ (unsigned long)val < mmap_min_addr)
+ goto out;
+ }
+
+ /*
+ * Make sure the pairs are ordered.
+ */
+#define __prctl_check_order(__m1, __op, __m2) \
+ ((unsigned long)prctl_map->__m1 __op \
+ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
+ error = __prctl_check_order(start_code, <, end_code);
+ error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_brk, <=, brk);
+ error |= __prctl_check_order(arg_start, <=, arg_end);
+ error |= __prctl_check_order(env_start, <=, env_end);
+ if (error)
+ goto out;
+#undef __prctl_check_order
+
+ error = -EINVAL;
+
+ /*
+ * @brk should be after @end_data in traditional maps.
+ */
+ if (prctl_map->start_brk <= prctl_map->end_data ||
+ prctl_map->brk <= prctl_map->end_data)
+ goto out;
+
+ /*
+ * Nor should we allow the limits to be overridden if they are set.
+ */
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
+ prctl_map->start_brk, prctl_map->end_data,
+ prctl_map->start_data))
+ goto out;
+
+ /*
+ * Reject an auxv that is NULL or larger than mm->saved_auxv.
+ */
+ if (prctl_map->auxv_size) {
+ if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
+ goto out;
+ }
+
+ /*
+ * Finally, make sure the caller has the right to
+ * change the /proc/pid/exe link: only local root
+ * (uid 0 and gid 0 in the current user namespace) is allowed to.
+ */
+ if (prctl_map->exe_fd != (u32)-1) {
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *cred = current_cred();
+
+ if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
+ !gid_eq(cred->gid, make_kgid(ns, 0)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
+{
+ struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+ BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
+
+ if (opt == PR_SET_MM_MAP_SIZE)
+ return put_user((unsigned int)sizeof(prctl_map),
+ (unsigned int __user *)addr);
+
+ if (data_size != sizeof(prctl_map))
+ return -EINVAL;
+
+ if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
+ return -EFAULT;
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ return error;
+
+ if (prctl_map.auxv_size) {
+ memset(user_auxv, 0, sizeof(user_auxv));
+ if (copy_from_user(user_auxv,
+ (const void __user *)prctl_map.auxv,
+ prctl_map.auxv_size))
+ return -EFAULT;
+
+ /* The last entry must be AT_NULL, as the specification requires */
+ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
+ user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+ }
+
+ if (prctl_map.exe_fd != (u32)-1) {
+ error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+ if (error)
+ return error;
+ }
+
+ down_write(&mm->mmap_sem);
+
+ /*
+ * We don't validate whether these members point to VMAs that
+ * are really present, because the application may already have
+ * unmapped the corresponding VMAs and the kernel mostly uses these
+ * members for statistics output in procfs, except
+ *
+ * - @start_brk/@brk, which are used in do_brk; but the kernel looks
+ * up the VMAs when updating these members, so anything wrong written
+ * here causes the kernel to swear at the userspace program but won't
+ * lead to any problem in the kernel itself
+ */
+
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
+ /*
+ * Note that this update of @saved_auxv is lockless, thus
+ * if someone reads this member in procfs while we're
+ * updating it, they may get partly updated results. This is
+ * a known and acceptable trade-off: we leave it as is so as
+ * not to introduce additional locks here that would make the
+ * kernel more complex.
+ */
+ if (prctl_map.auxv_size)
+ memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
+
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ /*
+ * This doesn't move the auxiliary vector itself since it's pinned to
+ * mm_struct, but it permits filling the vector with new values. It's
+ * up to the caller to provide sane values here, otherwise userspace
+ * tools which use this vector might be unhappy.
+ */
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+
+ if (len > sizeof(user_auxv))
+ return -EINVAL;
+
+ if (copy_from_user(user_auxv, (const void __user *)addr, len))
+ return -EFAULT;
+
+ /* Force the last entry to AT_NULL (copied out only if len reaches it) */
+ user_auxv[AT_VECTOR_SIZE - 2] = 0;
+ user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+ task_lock(current);
+ memcpy(mm->saved_auxv, user_auxv, len); /* first len bytes only */
+ task_unlock(current);
+
+ return 0;
+}
+
+static int prctl_set_mm(int opt, unsigned long addr,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+ struct prctl_mm_map prctl_map;
+ struct vm_area_struct *vma;
+ int error;
+
+ if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
+ opt != PR_SET_MM_MAP &&
+ opt != PR_SET_MM_MAP_SIZE)))
+ return -EINVAL;
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
+ return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (opt == PR_SET_MM_EXE_FILE)
+ return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+
+ if (opt == PR_SET_MM_AUXV)
+ return prctl_set_auxv(mm, addr, arg4);
+
+ if (addr >= TASK_SIZE || addr < mmap_min_addr)
+ return -EINVAL;
+
+ error = -EINVAL;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, addr);
+
+ prctl_map.start_code = mm->start_code;
+ prctl_map.end_code = mm->end_code;
+ prctl_map.start_data = mm->start_data;
+ prctl_map.end_data = mm->end_data;
+ prctl_map.start_brk = mm->start_brk;
+ prctl_map.brk = mm->brk;
+ prctl_map.start_stack = mm->start_stack;
+ prctl_map.arg_start = mm->arg_start;
+ prctl_map.arg_end = mm->arg_end;
+ prctl_map.env_start = mm->env_start;
+ prctl_map.env_end = mm->env_end;
+ prctl_map.auxv = NULL;
+ prctl_map.auxv_size = 0;
+ prctl_map.exe_fd = -1;
+
+ switch (opt) {
+ case PR_SET_MM_START_CODE:
+ prctl_map.start_code = addr;
+ break;
+ case PR_SET_MM_END_CODE:
+ prctl_map.end_code = addr;
+ break;
+ case PR_SET_MM_START_DATA:
+ prctl_map.start_data = addr;
+ break;
+ case PR_SET_MM_END_DATA:
+ prctl_map.end_data = addr;
+ break;
+ case PR_SET_MM_START_STACK:
+ prctl_map.start_stack = addr;
+ break;
+ case PR_SET_MM_START_BRK:
+ prctl_map.start_brk = addr;
+ break;
+ case PR_SET_MM_BRK:
+ prctl_map.brk = addr;
+ break;
+ case PR_SET_MM_ARG_START:
+ prctl_map.arg_start = addr;
+ break;
+ case PR_SET_MM_ARG_END:
+ prctl_map.arg_end = addr;
+ break;
+ case PR_SET_MM_ENV_START:
+ prctl_map.env_start = addr;
+ break;
+ case PR_SET_MM_ENV_END:
+ prctl_map.env_end = addr;
+ break;
+ default:
+ goto out; /* unknown opt: error is still -EINVAL */
+ }
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ goto out;
+
+ switch (opt) {
+ /*
+ * If command line arguments and environment
+ * are placed somewhere else on stack, we can
+ * set them up here: ARG_START/END to set up
+ * command line arguments and ENV_START/END
+ * for the environment.
+ */
+ case PR_SET_MM_START_STACK:
+ case PR_SET_MM_ARG_START:
+ case PR_SET_MM_ARG_END:
+ case PR_SET_MM_ENV_START:
+ case PR_SET_MM_ENV_END:
+ if (!vma) { /* find_vma(mm, addr) above found nothing */
+ error = -EFAULT;
+ goto out;
+ }
+ }
+
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
+ error = 0;
+out:
+ up_write(&mm->mmap_sem);
+ return error;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+ return put_user(me->clear_child_tid, tid_addr);
+}
+#else /* !CONFIG_CHECKPOINT_RESTORE */
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+ unsigned long, arg4, unsigned long, arg5)
+{
+ struct task_struct *me = current;
+ unsigned char comm[sizeof(me->comm)];
+ long error;
+
+ error = security_task_prctl(option, arg2, arg3, arg4, arg5);
+ if (error != -ENOSYS)
+ return error; /* the hook's result is final unless it is -ENOSYS */
+
+ error = 0;
+ switch (option) {
+ case PR_SET_PDEATHSIG:
+ if (!valid_signal(arg2)) {
+ error = -EINVAL;
+ break;
+ }
+ me->pdeath_signal = arg2;
+ break;
+ case PR_GET_PDEATHSIG:
+ error = put_user(me->pdeath_signal, (int __user *)arg2);
+ break;
+ case PR_GET_DUMPABLE:
+ error = get_dumpable(me->mm);
+ break;
+ case PR_SET_DUMPABLE:
+ if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
+ error = -EINVAL;
+ break;
+ }
+ set_dumpable(me->mm, arg2);
+ break;
+
+ case PR_SET_UNALIGN:
+ error = SET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_GET_UNALIGN:
+ error = GET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_SET_FPEMU:
+ error = SET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_GET_FPEMU:
+ error = GET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_SET_FPEXC:
+ error = SET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_FPEXC:
+ error = GET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_TIMING:
+ error = PR_TIMING_STATISTICAL;
+ break;
+ case PR_SET_TIMING:
+ if (arg2 != PR_TIMING_STATISTICAL)
+ error = -EINVAL;
+ break;
+ case PR_SET_NAME:
+ comm[sizeof(me->comm) - 1] = 0; /* the copy below never writes this byte */
+ if (strncpy_from_user(comm, (char __user *)arg2,
+ sizeof(me->comm) - 1) < 0)
+ return -EFAULT;
+ set_task_comm(me, comm);
+ proc_comm_connector(me);
+ break;
+ case PR_GET_NAME:
+ get_task_comm(comm, me);
+ if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
+ return -EFAULT;
+ break;
+ case PR_GET_ENDIAN:
+ error = GET_ENDIAN(me, arg2);
+ break;
+ case PR_SET_ENDIAN:
+ error = SET_ENDIAN(me, arg2);
+ break;
+ case PR_GET_SECCOMP:
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
+ break;
+ case PR_GET_TSC:
+ error = GET_TSC_CTL(arg2);
+ break;
+ case PR_SET_TSC:
+ error = SET_TSC_CTL(arg2);
+ break;
+ case PR_TASK_PERF_EVENTS_DISABLE:
+ error = perf_event_task_disable();
+ break;
+ case PR_TASK_PERF_EVENTS_ENABLE:
+ error = perf_event_task_enable();
+ break;
+ case PR_GET_TIMERSLACK:
+ if (current->timer_slack_ns > ULONG_MAX)
+ error = ULONG_MAX;
+ else
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0) /* arg2 is unsigned long, so this matches only 0 */
+ current->timer_slack_ns =
+ current->default_timer_slack_ns;
+ else
+ current->timer_slack_ns = arg2;
+ break;
+ case PR_MCE_KILL:
+ if (arg4 | arg5)
+ return -EINVAL;
+ switch (arg2) {
+ case PR_MCE_KILL_CLEAR:
+ if (arg3 != 0)
+ return -EINVAL;
+ current->flags &= ~PF_MCE_PROCESS;
+ break;
+ case PR_MCE_KILL_SET:
+ current->flags |= PF_MCE_PROCESS;
+ if (arg3 == PR_MCE_KILL_EARLY)
+ current->flags |= PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_LATE)
+ current->flags &= ~PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_DEFAULT)
+ current->flags &=
+ ~(PF_MCE_EARLY|PF_MCE_PROCESS);
+ else
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ case PR_MCE_KILL_GET:
+ if (arg2 | arg3 | arg4 | arg5)
+ return -EINVAL;
+ if (current->flags & PF_MCE_PROCESS)
+ error = (current->flags & PF_MCE_EARLY) ?
+ PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ else
+ error = PR_MCE_KILL_DEFAULT;
+ break;
+ case PR_SET_MM:
+ error = prctl_set_mm(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_TID_ADDRESS:
+ error = prctl_get_tid_address(me, (int __user **)arg2);
+ break;
+ case PR_SET_CHILD_SUBREAPER:
+ me->signal->is_child_subreaper = !!arg2;
+ break;
+ case PR_GET_CHILD_SUBREAPER:
+ error = put_user(me->signal->is_child_subreaper,
+ (int __user *)arg2);
+ break;
+ case PR_SET_NO_NEW_PRIVS:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ task_set_no_new_privs(current);
+ break;
+ case PR_GET_NO_NEW_PRIVS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return task_no_new_privs(current) ? 1 : 0;
+ case PR_GET_THP_DISABLE:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+ break;
+ case PR_SET_THP_DISABLE:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ if (down_write_killable(&me->mm->mmap_sem))
+ return -EINTR;
+ if (arg2)
+ me->mm->def_flags |= VM_NOHUGEPAGE;
+ else
+ me->mm->def_flags &= ~VM_NOHUGEPAGE;
+ up_write(&me->mm->mmap_sem);
+ break;
+ case PR_MPX_ENABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = MPX_ENABLE_MANAGEMENT();
+ break;
+ case PR_MPX_DISABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = MPX_DISABLE_MANAGEMENT();
+ break;
+ case PR_SET_FP_MODE:
+ error = SET_FP_MODE(me, arg2);
+ break;
+ case PR_GET_FP_MODE:
+ error = GET_FP_MODE(me);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+ return error;
+}
+
diff --git a/kernel/sys.c b/kernel/sys.c
index 89d5be4..bda75ca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -9,7 +9,6 @@
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
-#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/kmod.h>
@@ -61,49 +60,6 @@
#include <asm/io.h>
#include <asm/unistd.h>
-#ifndef SET_UNALIGN_CTL
-# define SET_UNALIGN_CTL(a, b) (-EINVAL)
-#endif
-#ifndef GET_UNALIGN_CTL
-# define GET_UNALIGN_CTL(a, b) (-EINVAL)
-#endif
-#ifndef SET_FPEMU_CTL
-# define SET_FPEMU_CTL(a, b) (-EINVAL)
-#endif
-#ifndef GET_FPEMU_CTL
-# define GET_FPEMU_CTL(a, b) (-EINVAL)
-#endif
-#ifndef SET_FPEXC_CTL
-# define SET_FPEXC_CTL(a, b) (-EINVAL)
-#endif
-#ifndef GET_FPEXC_CTL
-# define GET_FPEXC_CTL(a, b) (-EINVAL)
-#endif
-#ifndef GET_ENDIAN
-# define GET_ENDIAN(a, b) (-EINVAL)
-#endif
-#ifndef SET_ENDIAN
-# define SET_ENDIAN(a, b) (-EINVAL)
-#endif
-#ifndef GET_TSC_CTL
-# define GET_TSC_CTL(a) (-EINVAL)
-#endif
-#ifndef SET_TSC_CTL
-# define SET_TSC_CTL(a) (-EINVAL)
-#endif
-#ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT() (-EINVAL)
-#endif
-#ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT() (-EINVAL)
-#endif
-#ifndef GET_FP_MODE
-# define GET_FP_MODE(a) (-EINVAL)
-#endif
-#ifndef SET_FP_MODE
-# define SET_FP_MODE(a,b) (-EINVAL)
-#endif
-
/*
* this is where the system-wide overflow UID and GID are defined, for
* architectures that now have 32-bit UID/GID but didn't in the past
@@ -1649,634 +1605,6 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
-static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
-{
- struct fd exe;
- struct file *old_exe, *exe_file;
- struct inode *inode;
- int err;
-
- exe = fdget(fd);
- if (!exe.file)
- return -EBADF;
-
- inode = file_inode(exe.file);
-
- /*
- * Because the original mm->exe_file points to executable file, make
- * sure that this one is executable as well, to avoid breaking an
- * overall picture.
- */
- err = -EACCES;
- if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
- goto exit;
-
- err = inode_permission(inode, MAY_EXEC);
- if (err)
- goto exit;
-
- /*
- * Forbid mm->exe_file change if old file still mapped.
- */
- exe_file = get_mm_exe_file(mm);
- err = -EBUSY;
- if (exe_file) {
- struct vm_area_struct *vma;
-
- down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (path_equal(&vma->vm_file->f_path,
- &exe_file->f_path))
- goto exit_err;
- }
-
- up_read(&mm->mmap_sem);
- fput(exe_file);
- }
-
- /*
- * The symlink can be changed only once, just to disallow arbitrary
- * transitions malicious software might bring in. This means one
- * could make a snapshot over all processes running and monitor
- * /proc/pid/exe changes to notice unusual activity if needed.
- */
- err = -EPERM;
- if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
- goto exit;
-
- err = 0;
- /* set the new file, lockless */
- get_file(exe.file);
- old_exe = xchg(&mm->exe_file, exe.file);
- if (old_exe)
- fput(old_exe);
-exit:
- fdput(exe);
- return err;
-exit_err:
- up_read(&mm->mmap_sem);
- fput(exe_file);
- goto exit;
-}
-
-/*
- * WARNING: we don't require any capability here so be very careful
- * in what is allowed for modification from userspace.
- */
-static int validate_prctl_map(struct prctl_mm_map *prctl_map)
-{
- unsigned long mmap_max_addr = TASK_SIZE;
- struct mm_struct *mm = current->mm;
- int error = -EINVAL, i;
-
- static const unsigned char offsets[] = {
- offsetof(struct prctl_mm_map, start_code),
- offsetof(struct prctl_mm_map, end_code),
- offsetof(struct prctl_mm_map, start_data),
- offsetof(struct prctl_mm_map, end_data),
- offsetof(struct prctl_mm_map, start_brk),
- offsetof(struct prctl_mm_map, brk),
- offsetof(struct prctl_mm_map, start_stack),
- offsetof(struct prctl_mm_map, arg_start),
- offsetof(struct prctl_mm_map, arg_end),
- offsetof(struct prctl_mm_map, env_start),
- offsetof(struct prctl_mm_map, env_end),
- };
-
- /*
- * Make sure the members are not somewhere outside
- * of allowed address space.
- */
- for (i = 0; i < ARRAY_SIZE(offsets); i++) {
- u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
-
- if ((unsigned long)val >= mmap_max_addr ||
- (unsigned long)val < mmap_min_addr)
- goto out;
- }
-
- /*
- * Make sure the pairs are ordered.
- */
-#define __prctl_check_order(__m1, __op, __m2) \
- ((unsigned long)prctl_map->__m1 __op \
- (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
- error = __prctl_check_order(start_code, <, end_code);
- error |= __prctl_check_order(start_data, <, end_data);
- error |= __prctl_check_order(start_brk, <=, brk);
- error |= __prctl_check_order(arg_start, <=, arg_end);
- error |= __prctl_check_order(env_start, <=, env_end);
- if (error)
- goto out;
-#undef __prctl_check_order
-
- error = -EINVAL;
-
- /*
- * @brk should be after @end_data in traditional maps.
- */
- if (prctl_map->start_brk <= prctl_map->end_data ||
- prctl_map->brk <= prctl_map->end_data)
- goto out;
-
- /*
- * Neither we should allow to override limits if they set.
- */
- if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
- prctl_map->start_brk, prctl_map->end_data,
- prctl_map->start_data))
- goto out;
-
- /*
- * Someone is trying to cheat the auxv vector.
- */
- if (prctl_map->auxv_size) {
- if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
- goto out;
- }
-
- /*
- * Finally, make sure the caller has the rights to
- * change /proc/pid/exe link: only local root should
- * be allowed to.
- */
- if (prctl_map->exe_fd != (u32)-1) {
- struct user_namespace *ns = current_user_ns();
- const struct cred *cred = current_cred();
-
- if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
- !gid_eq(cred->gid, make_kgid(ns, 0)))
- goto out;
- }
-
- error = 0;
-out:
- return error;
-}
-
-#ifdef CONFIG_CHECKPOINT_RESTORE
-static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
-{
- struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
- unsigned long user_auxv[AT_VECTOR_SIZE];
- struct mm_struct *mm = current->mm;
- int error;
-
- BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
- BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
-
- if (opt == PR_SET_MM_MAP_SIZE)
- return put_user((unsigned int)sizeof(prctl_map),
- (unsigned int __user *)addr);
-
- if (data_size != sizeof(prctl_map))
- return -EINVAL;
-
- if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
- return -EFAULT;
-
- error = validate_prctl_map(&prctl_map);
- if (error)
- return error;
-
- if (prctl_map.auxv_size) {
- memset(user_auxv, 0, sizeof(user_auxv));
- if (copy_from_user(user_auxv,
- (const void __user *)prctl_map.auxv,
- prctl_map.auxv_size))
- return -EFAULT;
-
- /* Last entry must be AT_NULL as specification requires */
- user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
- user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
- }
-
- if (prctl_map.exe_fd != (u32)-1) {
- error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
- if (error)
- return error;
- }
-
- down_write(&mm->mmap_sem);
-
- /*
- * We don't validate if these members are pointing to
- * real present VMAs because application may have correspond
- * VMAs already unmapped and kernel uses these members for statistics
- * output in procfs mostly, except
- *
- * - @start_brk/@brk which are used in do_brk but kernel lookups
- * for VMAs when updating these memvers so anything wrong written
- * here cause kernel to swear at userspace program but won't lead
- * to any problem in kernel itself
- */
-
- mm->start_code = prctl_map.start_code;
- mm->end_code = prctl_map.end_code;
- mm->start_data = prctl_map.start_data;
- mm->end_data = prctl_map.end_data;
- mm->start_brk = prctl_map.start_brk;
- mm->brk = prctl_map.brk;
- mm->start_stack = prctl_map.start_stack;
- mm->arg_start = prctl_map.arg_start;
- mm->arg_end = prctl_map.arg_end;
- mm->env_start = prctl_map.env_start;
- mm->env_end = prctl_map.env_end;
-
- /*
- * Note this update of @saved_auxv is lockless thus
- * if someone reads this member in procfs while we're
- * updating -- it may get partly updated results. It's
- * known and acceptable trade off: we leave it as is to
- * not introduce additional locks here making the kernel
- * more complex.
- */
- if (prctl_map.auxv_size)
- memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
-
- up_write(&mm->mmap_sem);
- return 0;
-}
-#endif /* CONFIG_CHECKPOINT_RESTORE */
-
-static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
- unsigned long len)
-{
- /*
- * This doesn't move the auxiliary vector itself since it's pinned to
- * mm_struct, but it permits filling the vector with new values. It's
- * up to the caller to provide sane values here, otherwise userspace
- * tools which use this vector might be unhappy.
- */
- unsigned long user_auxv[AT_VECTOR_SIZE];
-
- if (len > sizeof(user_auxv))
- return -EINVAL;
-
- if (copy_from_user(user_auxv, (const void __user *)addr, len))
- return -EFAULT;
-
- /* Make sure the last entry is always AT_NULL */
- user_auxv[AT_VECTOR_SIZE - 2] = 0;
- user_auxv[AT_VECTOR_SIZE - 1] = 0;
-
- BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
-
- task_lock(current);
- memcpy(mm->saved_auxv, user_auxv, len);
- task_unlock(current);
-
- return 0;
-}
-
-static int prctl_set_mm(int opt, unsigned long addr,
- unsigned long arg4, unsigned long arg5)
-{
- struct mm_struct *mm = current->mm;
- struct prctl_mm_map prctl_map;
- struct vm_area_struct *vma;
- int error;
-
- if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
- opt != PR_SET_MM_MAP &&
- opt != PR_SET_MM_MAP_SIZE)))
- return -EINVAL;
-
-#ifdef CONFIG_CHECKPOINT_RESTORE
- if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
- return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
-#endif
-
- if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- if (opt == PR_SET_MM_EXE_FILE)
- return prctl_set_mm_exe_file(mm, (unsigned int)addr);
-
- if (opt == PR_SET_MM_AUXV)
- return prctl_set_auxv(mm, addr, arg4);
-
- if (addr >= TASK_SIZE || addr < mmap_min_addr)
- return -EINVAL;
-
- error = -EINVAL;
-
- down_write(&mm->mmap_sem);
- vma = find_vma(mm, addr);
-
- prctl_map.start_code = mm->start_code;
- prctl_map.end_code = mm->end_code;
- prctl_map.start_data = mm->start_data;
- prctl_map.end_data = mm->end_data;
- prctl_map.start_brk = mm->start_brk;
- prctl_map.brk = mm->brk;
- prctl_map.start_stack = mm->start_stack;
- prctl_map.arg_start = mm->arg_start;
- prctl_map.arg_end = mm->arg_end;
- prctl_map.env_start = mm->env_start;
- prctl_map.env_end = mm->env_end;
- prctl_map.auxv = NULL;
- prctl_map.auxv_size = 0;
- prctl_map.exe_fd = -1;
-
- switch (opt) {
- case PR_SET_MM_START_CODE:
- prctl_map.start_code = addr;
- break;
- case PR_SET_MM_END_CODE:
- prctl_map.end_code = addr;
- break;
- case PR_SET_MM_START_DATA:
- prctl_map.start_data = addr;
- break;
- case PR_SET_MM_END_DATA:
- prctl_map.end_data = addr;
- break;
- case PR_SET_MM_START_STACK:
- prctl_map.start_stack = addr;
- break;
- case PR_SET_MM_START_BRK:
- prctl_map.start_brk = addr;
- break;
- case PR_SET_MM_BRK:
- prctl_map.brk = addr;
- break;
- case PR_SET_MM_ARG_START:
- prctl_map.arg_start = addr;
- break;
- case PR_SET_MM_ARG_END:
- prctl_map.arg_end = addr;
- break;
- case PR_SET_MM_ENV_START:
- prctl_map.env_start = addr;
- break;
- case PR_SET_MM_ENV_END:
- prctl_map.env_end = addr;
- break;
- default:
- goto out;
- }
-
- error = validate_prctl_map(&prctl_map);
- if (error)
- goto out;
-
- switch (opt) {
- /*
- * If command line arguments and environment
- * are placed somewhere else on stack, we can
- * set them up here, ARG_START/END to setup
- * command line argumets and ENV_START/END
- * for environment.
- */
- case PR_SET_MM_START_STACK:
- case PR_SET_MM_ARG_START:
- case PR_SET_MM_ARG_END:
- case PR_SET_MM_ENV_START:
- case PR_SET_MM_ENV_END:
- if (!vma) {
- error = -EFAULT;
- goto out;
- }
- }
-
- mm->start_code = prctl_map.start_code;
- mm->end_code = prctl_map.end_code;
- mm->start_data = prctl_map.start_data;
- mm->end_data = prctl_map.end_data;
- mm->start_brk = prctl_map.start_brk;
- mm->brk = prctl_map.brk;
- mm->start_stack = prctl_map.start_stack;
- mm->arg_start = prctl_map.arg_start;
- mm->arg_end = prctl_map.arg_end;
- mm->env_start = prctl_map.env_start;
- mm->env_end = prctl_map.env_end;
-
- error = 0;
-out:
- up_write(&mm->mmap_sem);
- return error;
-}
-
-#ifdef CONFIG_CHECKPOINT_RESTORE
-static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
-{
- return put_user(me->clear_child_tid, tid_addr);
-}
-#else
-static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
-{
- return -EINVAL;
-}
-#endif
-
-SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
- unsigned long, arg4, unsigned long, arg5)
-{
- struct task_struct *me = current;
- unsigned char comm[sizeof(me->comm)];
- long error;
-
- error = security_task_prctl(option, arg2, arg3, arg4, arg5);
- if (error != -ENOSYS)
- return error;
-
- error = 0;
- switch (option) {
- case PR_SET_PDEATHSIG:
- if (!valid_signal(arg2)) {
- error = -EINVAL;
- break;
- }
- me->pdeath_signal = arg2;
- break;
- case PR_GET_PDEATHSIG:
- error = put_user(me->pdeath_signal, (int __user *)arg2);
- break;
- case PR_GET_DUMPABLE:
- error = get_dumpable(me->mm);
- break;
- case PR_SET_DUMPABLE:
- if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
- error = -EINVAL;
- break;
- }
- set_dumpable(me->mm, arg2);
- break;
-
- case PR_SET_UNALIGN:
- error = SET_UNALIGN_CTL(me, arg2);
- break;
- case PR_GET_UNALIGN:
- error = GET_UNALIGN_CTL(me, arg2);
- break;
- case PR_SET_FPEMU:
- error = SET_FPEMU_CTL(me, arg2);
- break;
- case PR_GET_FPEMU:
- error = GET_FPEMU_CTL(me, arg2);
- break;
- case PR_SET_FPEXC:
- error = SET_FPEXC_CTL(me, arg2);
- break;
- case PR_GET_FPEXC:
- error = GET_FPEXC_CTL(me, arg2);
- break;
- case PR_GET_TIMING:
- error = PR_TIMING_STATISTICAL;
- break;
- case PR_SET_TIMING:
- if (arg2 != PR_TIMING_STATISTICAL)
- error = -EINVAL;
- break;
- case PR_SET_NAME:
- comm[sizeof(me->comm) - 1] = 0;
- if (strncpy_from_user(comm, (char __user *)arg2,
- sizeof(me->comm) - 1) < 0)
- return -EFAULT;
- set_task_comm(me, comm);
- proc_comm_connector(me);
- break;
- case PR_GET_NAME:
- get_task_comm(comm, me);
- if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
- return -EFAULT;
- break;
- case PR_GET_ENDIAN:
- error = GET_ENDIAN(me, arg2);
- break;
- case PR_SET_ENDIAN:
- error = SET_ENDIAN(me, arg2);
- break;
- case PR_GET_SECCOMP:
- error = prctl_get_seccomp();
- break;
- case PR_SET_SECCOMP:
- error = prctl_set_seccomp(arg2, (char __user *)arg3);
- break;
- case PR_GET_TSC:
- error = GET_TSC_CTL(arg2);
- break;
- case PR_SET_TSC:
- error = SET_TSC_CTL(arg2);
- break;
- case PR_TASK_PERF_EVENTS_DISABLE:
- error = perf_event_task_disable();
- break;
- case PR_TASK_PERF_EVENTS_ENABLE:
- error = perf_event_task_enable();
- break;
- case PR_GET_TIMERSLACK:
- if (current->timer_slack_ns > ULONG_MAX)
- error = ULONG_MAX;
- else
- error = current->timer_slack_ns;
- break;
- case PR_SET_TIMERSLACK:
- if (arg2 <= 0)
- current->timer_slack_ns =
- current->default_timer_slack_ns;
- else
- current->timer_slack_ns = arg2;
- break;
- case PR_MCE_KILL:
- if (arg4 | arg5)
- return -EINVAL;
- switch (arg2) {
- case PR_MCE_KILL_CLEAR:
- if (arg3 != 0)
- return -EINVAL;
- current->flags &= ~PF_MCE_PROCESS;
- break;
- case PR_MCE_KILL_SET:
- current->flags |= PF_MCE_PROCESS;
- if (arg3 == PR_MCE_KILL_EARLY)
- current->flags |= PF_MCE_EARLY;
- else if (arg3 == PR_MCE_KILL_LATE)
- current->flags &= ~PF_MCE_EARLY;
- else if (arg3 == PR_MCE_KILL_DEFAULT)
- current->flags &=
- ~(PF_MCE_EARLY|PF_MCE_PROCESS);
- else
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- break;
- case PR_MCE_KILL_GET:
- if (arg2 | arg3 | arg4 | arg5)
- return -EINVAL;
- if (current->flags & PF_MCE_PROCESS)
- error = (current->flags & PF_MCE_EARLY) ?
- PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
- else
- error = PR_MCE_KILL_DEFAULT;
- break;
- case PR_SET_MM:
- error = prctl_set_mm(arg2, arg3, arg4, arg5);
- break;
- case PR_GET_TID_ADDRESS:
- error = prctl_get_tid_address(me, (int __user **)arg2);
- break;
- case PR_SET_CHILD_SUBREAPER:
- me->signal->is_child_subreaper = !!arg2;
- break;
- case PR_GET_CHILD_SUBREAPER:
- error = put_user(me->signal->is_child_subreaper,
- (int __user *)arg2);
- break;
- case PR_SET_NO_NEW_PRIVS:
- if (arg2 != 1 || arg3 || arg4 || arg5)
- return -EINVAL;
-
- task_set_no_new_privs(current);
- break;
- case PR_GET_NO_NEW_PRIVS:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- return task_no_new_privs(current) ? 1 : 0;
- case PR_GET_THP_DISABLE:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
- break;
- case PR_SET_THP_DISABLE:
- if (arg3 || arg4 || arg5)
- return -EINVAL;
- if (down_write_killable(&me->mm->mmap_sem))
- return -EINTR;
- if (arg2)
- me->mm->def_flags |= VM_NOHUGEPAGE;
- else
- me->mm->def_flags &= ~VM_NOHUGEPAGE;
- up_write(&me->mm->mmap_sem);
- break;
- case PR_MPX_ENABLE_MANAGEMENT:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = MPX_ENABLE_MANAGEMENT();
- break;
- case PR_MPX_DISABLE_MANAGEMENT:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = MPX_DISABLE_MANAGEMENT();
- break;
- case PR_SET_FP_MODE:
- error = SET_FP_MODE(me, arg2);
- break;
- case PR_GET_FP_MODE:
- error = GET_FP_MODE(me);
- break;
- default:
- error = -EINVAL;
- break;
- }
- return error;
-}
-
SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
struct getcpu_cache __user *, unused)
{
--
git-series 0.8.11
Some embedded systems can do without the prctl syscall, saving some
space.
This also avoids regular increases in tinyconfig size as people add more
non-optional functionality to prctl (observed via the 0-day kernel
infrastructure).
bloat-o-meter results:
add/remove: 0/3 grow/shrink: 0/1 up/down: 0/-2143 (-2143)
function old new delta
offsets 23 12 -11
prctl_set_auxv 97 - -97
sys_prctl 794 - -794
prctl_set_mm 1241 - -1241
Total: Before=1902583, After=1900440, chg -0.11%
Signed-off-by: Josh Triplett <[email protected]>
---
init/Kconfig | 12 ++++++++++++
kernel/Makefile | 3 ++-
kernel/sys_ni.c | 1 +
3 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/init/Kconfig b/init/Kconfig
index 34407f1..1dd671c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1408,6 +1408,18 @@ config MULTIUSER
If unsure, say Y here.
+config PRCTL
+ bool "prctl syscall" if EXPERT
+ default y
+ help
+ This option enables the prctl syscall, used for a variety of
+ operations on the current process.
+
+ If building an embedded system where no applications or libraries use
+ prctl, you can disable this option to save space.
+
+ If unsure, say Y here.
+
config SGETMASK_SYSCALL
bool "sgetmask/ssetmask syscalls support" if EXPERT
def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH
diff --git a/kernel/Makefile b/kernel/Makefile
index 37c6d4c..43fb4ca 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,9 +9,10 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o prctl.o
+ async.o range.o smpboot.o ucount.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_PRCTL) += prctl.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace internal ftrace files
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 635482e..84fd646 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,6 +175,7 @@ cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
cond_syscall(sys_copy_file_range);
+cond_syscall(sys_prctl);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
git-series 0.8.11
On Tue, Nov 8, 2016 at 4:18 PM, Josh Triplett <[email protected]> wrote:
> This prepares for making prctl optional.
I like this.
On Tuesday, November 8, 2016 4:17:47 PM CET Josh Triplett wrote:
> Some embedded systems can do without the prctl syscall, saving some space.
>
> This also avoids regular increases in tinyconfig size as people add more
> non-optional functionality to prctl (observed via the 0-day kernel
> infrastructure).
>
> The first patch moves prctl and all its helpers into a separate source file,
> kernel/prctl.c, to allow making it optional via the Makefile without any
> ifdefs. The second patch adds the Kconfig option to make prctl optional.
>
> Josh Triplett (2):
> kernel: Move prctl and helpers from kernel/sys.c to new kernel/prctl.c
> kernel: Support compiling out the prctl syscall
>
Both of these look really nice to me. Another syscall that comes to
mind as a candidate to be made optional is ptrace, and I've added
Nicolas Pitre to Cc here, as he has looked into this before.
Another area to look at may be syscalls that are always defined
but that are no longer part of the default syscall table in
include/uapi/asm-generic/unistd.h. Maybe we can have a single
conditional for those.
Arnd
On Tue, 8 Nov 2016, Josh Triplett wrote:
> Some embedded systems can do without the prctl syscall, saving some space.
>
> This also avoids regular increases in tinyconfig size as people add more
> non-optional functionality to prctl (observed via the 0-day kernel
> infrastructure).
>
> The first patch moves prctl and all its helpers into a separate source file,
> kernel/prctl.c, to allow making it optional via the Makefile without any
> ifdefs. The second patch adds the Kconfig option to make prctl optional.
Excellent. For both patches:
Acked-by: Nicolas Pitre <[email protected]>
>
> Josh Triplett (2):
> kernel: Move prctl and helpers from kernel/sys.c to new kernel/prctl.c
> kernel: Support compiling out the prctl syscall
>
> init/Kconfig | 12 +-
> kernel/Makefile | 1 +-
> kernel/prctl.c | 684 +++++++++++++++++++++++++++++++++++++++++++++++++-
> kernel/sys.c | 672 +------------------------------------------------
> kernel/sys_ni.c | 1 +-
> 5 files changed, 698 insertions(+), 672 deletions(-)
> create mode 100644 kernel/prctl.c
>
> base-commit: e3a00f68e426df24a5fb98956a1bd1b23943aa1e
> --
> git-series 0.8.11
>
On Tue, Nov 8, 2016 at 4:18 PM, Josh Triplett <[email protected]> wrote:
> Some embedded systems can do without the prctl syscall, saving some
> space.
>
> This also avoids regular increases in tinyconfig size as people add more
> non-optional functionality to prctl (observed via the 0-day kernel
> infrastructure).
>
> bloat-o-meter results:
>
> add/remove: 0/3 grow/shrink: 0/1 up/down: 0/-2143 (-2143)
> function old new delta
> offsets 23 12 -11
> prctl_set_auxv 97 - -97
> sys_prctl 794 - -794
> prctl_set_mm 1241 - -1241
> Total: Before=1902583, After=1900440, chg -0.11%
>
> Signed-off-by: Josh Triplett <[email protected]>
I'm absolutely a fan of doing this, but I wonder how this interacts
with the LSMs that define prctl hooks, etc. I wouldn't expect a system
that didn't want prctl to want an LSM, but maybe the LSMs all need to
depend on CONFIG_PRCTL now?
-Kees
> ---
> init/Kconfig | 12 ++++++++++++
> kernel/Makefile | 3 ++-
> kernel/sys_ni.c | 1 +
> 3 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/init/Kconfig b/init/Kconfig
> index 34407f1..1dd671c 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1408,6 +1408,18 @@ config MULTIUSER
>
> If unsure, say Y here.
>
> +config PRCTL
> + bool "prctl syscall" if EXPERT
> + default y
> + help
> + This option enables the prctl syscall, used for a variety of
> + operations on the current process.
> +
> + If building an embedded system where no applications or libraries use
> + prctl, you can disable this option to save space.
> +
> + If unsure, say Y here.
> +
> config SGETMASK_SYSCALL
> bool "sgetmask/ssetmask syscalls support" if EXPERT
> def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 37c6d4c..43fb4ca 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -9,9 +9,10 @@ obj-y = fork.o exec_domain.o panic.o \
> extable.o params.o \
> kthread.o sys_ni.o nsproxy.o \
> notifier.o ksysfs.o cred.o reboot.o \
> - async.o range.o smpboot.o ucount.o prctl.o
> + async.o range.o smpboot.o ucount.o
>
> obj-$(CONFIG_MULTIUSER) += groups.o
> +obj-$(CONFIG_PRCTL) += prctl.o
>
> ifdef CONFIG_FUNCTION_TRACER
> # Do not trace internal ftrace files
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 635482e..84fd646 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -175,6 +175,7 @@ cond_syscall(sys_setfsgid);
> cond_syscall(sys_capget);
> cond_syscall(sys_capset);
> cond_syscall(sys_copy_file_range);
> +cond_syscall(sys_prctl);
>
> /* arch-specific weak syscall entries */
> cond_syscall(sys_pciconfig_read);
> --
> git-series 0.8.11
--
Kees Cook
Nexus Security
On Tue, Nov 08, 2016 at 04:40:02PM -0800, Kees Cook wrote:
> On Tue, Nov 8, 2016 at 4:18 PM, Josh Triplett <[email protected]> wrote:
> > Some embedded systems can do without the prctl syscall, saving some
> > space.
> >
> > This also avoids regular increases in tinyconfig size as people add more
> > non-optional functionality to prctl (observed via the 0-day kernel
> > infrastructure).
> >
> > bloat-o-meter results:
> >
> > add/remove: 0/3 grow/shrink: 0/1 up/down: 0/-2143 (-2143)
> > function old new delta
> > offsets 23 12 -11
> > prctl_set_auxv 97 - -97
> > sys_prctl 794 - -794
> > prctl_set_mm 1241 - -1241
> > Total: Before=1902583, After=1900440, chg -0.11%
> >
> > Signed-off-by: Josh Triplett <[email protected]>
>
> I'm absolutely a fan of doing this, but I wonder how this interacts
> with the LSMs that define prctl hooks, etc. I wouldn't expect a system
> that didn't want prctl to want an LSM, but maybe the LSMs all need to
> depend on CONFIG_PRCTL now?
I did think about that (as well as SECCOMP), but I did confirm that the
kernel builds fine with allyesconfig minus CONFIG_PRCTL. An LSM that
wants to restrict access to some prctls should be fine with no process
having any access to prctl. :) Beyond that, anything wanting
configuration via LSM (such as SECCOMP) still exists and functions, even
if you can't access it from outside the kernel.
On Tue, Nov 8, 2016 at 4:47 PM, Josh Triplett <[email protected]> wrote:
> On Tue, Nov 08, 2016 at 04:40:02PM -0800, Kees Cook wrote:
>> On Tue, Nov 8, 2016 at 4:18 PM, Josh Triplett <[email protected]> wrote:
>> > Some embedded systems can do without the prctl syscall, saving some
>> > space.
>> >
>> > This also avoids regular increases in tinyconfig size as people add more
>> > non-optional functionality to prctl (observed via the 0-day kernel
>> > infrastructure).
>> >
>> > bloat-o-meter results:
>> >
>> > add/remove: 0/3 grow/shrink: 0/1 up/down: 0/-2143 (-2143)
>> > function old new delta
>> > offsets 23 12 -11
>> > prctl_set_auxv 97 - -97
>> > sys_prctl 794 - -794
>> > prctl_set_mm 1241 - -1241
>> > Total: Before=1902583, After=1900440, chg -0.11%
>> >
>> > Signed-off-by: Josh Triplett <[email protected]>
>>
>> I'm absolutely a fan of doing this, but I wonder how this interacts
>> with the LSMs that define prctl hooks, etc. I wouldn't expect a system
>> that didn't want prctl to want an LSM, but maybe the LSMs all need to
>> depend on CONFIG_PRCTL now?
>
> I did think about that (as well as SECCOMP), but I did confirm that the
> kernel builds fine with allyesconfig minus CONFIG_PRCTL. An LSM that
> wants to restrict access to some prctls should be fine with no process
> having any access to prctl. :) Beyond that, anything wanting
> configuration via LSM (such as SECCOMP) still exists and functions, even
> if you can't access it from outside the kernel.
Okay, testing that is good, thanks.
Seccomp can use the seccomp() syscall, so missing prctl isn't a big deal there.
Things like Yama, though, are almost useless in the !PRCTL case. I
think a "depends on PRCTL" should be added at least to Yama. All the
other LSMs are configured in other ways, and they'll just have some
dead code around their prctl hooks; no big deal.
This does also beg the question about how to configure some process
behaviors by default if PRCTL is disabled, but if people want those
things, they can write patches, I would think. :)
-Kees
--
Kees Cook
Nexus Security
On November 8, 2016 4:56:54 PM PST, Kees Cook <[email protected]> wrote:
>On Tue, Nov 8, 2016 at 4:47 PM, Josh Triplett <[email protected]>
>wrote:
>> On Tue, Nov 08, 2016 at 04:40:02PM -0800, Kees Cook wrote:
>>> On Tue, Nov 8, 2016 at 4:18 PM, Josh Triplett
><[email protected]> wrote:
>>> > Some embedded systems can do without the prctl syscall, saving
>some
>>> > space.
>>> >
>>> > This also avoids regular increases in tinyconfig size as people
>add more
>>> > non-optional functionality to prctl (observed via the 0-day kernel
>>> > infrastructure).
>>> >
>>> > bloat-o-meter results:
>>> >
>>> > add/remove: 0/3 grow/shrink: 0/1 up/down: 0/-2143 (-2143)
>>> > function old new delta
>>> > offsets 23 12 -11
>>> > prctl_set_auxv 97 - -97
>>> > sys_prctl 794 - -794
>>> > prctl_set_mm 1241 - -1241
>>> > Total: Before=1902583, After=1900440, chg -0.11%
>>> >
>>> > Signed-off-by: Josh Triplett <[email protected]>
>>>
>>> I'm absolutely a fan of doing this, but I wonder how this interacts
>>> with the LSMs that define prctl hooks, etc. I wouldn't expect a
>system
>>> that didn't want prctl to want an LSM, but maybe the LSMs all need
>to
>>> depend on CONFIG_PRCTL now?
>>
>> I did think about that (as well as SECCOMP), but I did confirm that
>the
>> kernel builds fine with allyesconfig minus CONFIG_PRCTL. An LSM that
>> wants to restrict access to some prctls should be fine with no
>process
>> having any access to prctl. :) Beyond that, anything wanting
>> configuration via LSM (such as SECCOMP) still exists and functions,
>even
>> if you can't access it from outside the kernel.
>
>Okay, testing that is good, thanks.
>
>Seccomp can use the seccomp() syscall, so missing prctl isn't a big
>deal there.
>
>Things like Yama, though, are almost useless in the !PRCTL case. I
>think a "depends on PRCTL" should be added at least to Yama. All the
>other LSMs are configured in other ways, and they'll just have some
>dead code around their prctl hooks; no big deal.
OK, I'll add that dependency to Yama in v2.
>This does also beg the question about how to configure some process
>behaviors by default if PRCTL is disabled, but if people want those
>things, they can write patches, I would think. :)
Agreed.
On Wed, Nov 09, 2016 at 01:26:53AM +0100, Arnd Bergmann wrote:
> On Tuesday, November 8, 2016 4:17:47 PM CET Josh Triplett wrote:
> > Some embedded systems can do without the prctl syscall, saving some space.
> >
> > This also avoids regular increases in tinyconfig size as people add more
> > non-optional functionality to prctl (observed via the 0-day kernel
> > infrastructure).
> >
> > The first patch moves prctl and all its helpers into a separate source file,
> > kernel/prctl.c, to allow making it optional via the Makefile without any
> > ifdefs. The second patch adds the Kconfig option to make prctl optional.
> >
> > Josh Triplett (2):
> > kernel: Move prctl and helpers from kernel/sys.c to new kernel/prctl.c
> > kernel: Support compiling out the prctl syscall
> >
>
> Both of these look really nice to me. Another syscall that comes to
> mind as a candidate to be made optional is ptrace, and I've added
> Nicolas Pitre to Cc here, as he has looked into this before.
I've looked at that as well; it's a rather more complex patch, as
several other bits of the kernel hook into ptrace or otherwise depend on
ptrace. (And ptrace_may_access will need to stick around even when
ptrace doesn't.)
- Josh Triplett
On Tue, Nov 08, 2016 at 04:18:13PM -0800, Josh Triplett wrote:
> This prepares for making prctl optional.
>
> Signed-off-by: Josh Triplett <[email protected]>
> +
...
> +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
> +{
> + struct fd exe;
> + struct file *old_exe, *exe_file;
> + struct inode *inode;
> + int err;
> +
> + exe = fdget(fd);
> + if (!exe.file)
> + return -EBADF;
> +
> + inode = file_inode(exe.file);
> +
> + /*
> + * Because the original mm->exe_file points to executable file, make
> + * sure that this one is executable as well, to avoid breaking an
> + * overall picture.
> + */
> + err = -EACCES;
> + if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
> + goto exit;
> +
> + err = inode_permission(inode, MAY_EXEC);
> + if (err)
> + goto exit;
> +
> + /*
> + * Forbid mm->exe_file change if old file still mapped.
> + */
> + exe_file = get_mm_exe_file(mm);
> + err = -EBUSY;
> + if (exe_file) {
> + struct vm_area_struct *vma;
> +
> + down_read(&mm->mmap_sem);
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + if (!vma->vm_file)
> + continue;
> + if (path_equal(&vma->vm_file->f_path,
> + &exe_file->f_path))
> + goto exit_err;
> + }
> +
> + up_read(&mm->mmap_sem);
> + fput(exe_file);
> + }
> +
> + /*
> + * The symlink can be changed only once, just to disallow arbitrary
> + * transitions malicious software might bring in. This means one
> + * could make a snapshot over all processes running and monitor
> + * /proc/pid/exe changes to notice unusual activity if needed.
> + */
> + err = -EPERM;
> + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
> + goto exit;
IIRC this snippet has been dropped in the linux-next tree. Stas CC'ed.
The rest looks good to me. Thanks!
Reviewed-by: Cyrill Gorcunov <[email protected]>