LinuxLists.cc - [PATCHv4 1/3] fs: Move core dump functionality into its own file

2012-08-10 08:29:34

Subject: [PATCHv4 1/3] fs: Move core dump functionality into its own file

This prepares for making core dump functionality optional.

The variable "suid_dumpable" and associated functions are left in fs/exec.c
because they're used elsewhere, such as in ptrace.

Signed-off-by: Alex Kelly <[email protected]>
Reviewed-by: Josh Triplett <[email protected]>
---
v2: This patch set is a second revision that follows some suggestions from
Ingo Molnar and Josh Triplett. Specifically, authorship of commits is
revised for consistency, and two additional patches are added that clean up
artifacts and make the headers more sane.

v3: This version fixes a few more authorship issues and some problems caused
by a bad git send-email config. Sorry about the extra mails.

v4: This version fixes some ordering issues pointed out by Kees Cook and Josh
Triplett, such that the order of the functions moved to fs/coredump.c is now
consistent with their original order in fs/exec.c. v4 also drops some extra
blank lines unintentionally introduced in fs/coredump.c, to avoid the need to
clean them up later. That left the cleanup patch just reformatting a comment,
so I dropped that patch. Some of the functions moved to coredump.c need a lot
of cleaning up, but I'm not sure that those formatting changes should be
folded into this patch series.

fs/Makefile | 2 +-
fs/coredump.c | 689 ++++++++++++++++++++++++++++++++++++++++++++++++++
fs/exec.c | 647 +----------------------------------------------
include/linux/sched.h | 1 +
4 files changed, 692 insertions(+), 647 deletions(-)
create mode 100644 fs/coredump.c

diff --git a/fs/Makefile b/fs/Makefile
index 2fb9779..8938f82 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o
+ stack.o fs_struct.o statfs.o coredump.o

ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/coredump.c b/fs/coredump.c
new file mode 100644
index 0000000..9692329
--- /dev/null
+++ b/fs/coredump.c
@@ -0,0 +1,689 @@
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/swap.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/perf_event.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/key.h>
+#include <linux/personality.h>
+#include <linux/binfmts.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/tsacct_kern.h>
+#include <linux/cn_proc.h>
+#include <linux/audit.h>
+#include <linux/tracehook.h>
+#include <linux/kmod.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
+#include <linux/compat.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/tlb.h>
+#include <asm/exec.h>
+
+#include <trace/events/task.h>
+#include "internal.h"
+
+#include <trace/events/sched.h>
+
+int core_uses_pid;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+unsigned int core_pipe_limit;
+
+struct core_name {
+ char *corename;
+ int used, size;
+};
+static atomic_t call_count = ATOMIC_INIT(1);
+
+/* The maximal length of core_pattern is also specified in sysctl.c */
+
+static int expand_corename(struct core_name *cn)
+{
+ char *old_corename = cn->corename;
+
+ cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+ cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+
+ if (!cn->corename) {
+ kfree(old_corename);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+ char *cur;
+ int need;
+ int ret;
+ va_list arg;
+
+ va_start(arg, fmt);
+ need = vsnprintf(NULL, 0, fmt, arg);
+ va_end(arg);
+
+ if (likely(need < cn->size - cn->used - 1))
+ goto out_printf;
+
+ ret = expand_corename(cn);
+ if (ret)
+ goto expand_fail;
+
+out_printf:
+ cur = cn->corename + cn->used;
+ va_start(arg, fmt);
+ vsnprintf(cur, need + 1, fmt, arg);
+ va_end(arg);
+ cn->used += need;
+ return 0;
+
+expand_fail:
+ return ret;
+}
+
+static void cn_escape(char *str)
+{
+ for (; *str; str++)
+ if (*str == '/')
+ *str = '!';
+}
+
+static int cn_print_exe_file(struct core_name *cn)
+{
+ struct file *exe_file;
+ char *pathbuf, *path;
+ int ret;
+
+ exe_file = get_mm_exe_file(current->mm);
+ if (!exe_file) {
+ char *commstart = cn->corename + cn->used;
+ ret = cn_printf(cn, "%s (path unknown)", current->comm);
+ cn_escape(commstart);
+ return ret;
+ }
+
+ pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+ if (!pathbuf) {
+ ret = -ENOMEM;
+ goto put_exe_file;
+ }
+
+ path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
+ goto free_buf;
+ }
+
+ cn_escape(path);
+
+ ret = cn_printf(cn, "%s", path);
+
+free_buf:
+ kfree(pathbuf);
+put_exe_file:
+ fput(exe_file);
+ return ret;
+}
+
+/* format_corename will inspect the pattern parameter, and output a
+ * name into corename, which must have space for at least
+ * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
+ */
+static int format_corename(struct core_name *cn, long signr)
+{
+ const struct cred *cred = current_cred();
+ const char *pat_ptr = core_pattern;
+ int ispipe = (*pat_ptr == '|');
+ int pid_in_pattern = 0;
+ int err = 0;
+
+ cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
+ cn->corename = kmalloc(cn->size, GFP_KERNEL);
+ cn->used = 0;
+
+ if (!cn->corename)
+ return -ENOMEM;
+
+ /* Repeat as long as we have more pattern to process and more output
+ space */
+ while (*pat_ptr) {
+ if (*pat_ptr != '%') {
+ if (*pat_ptr == 0)
+ goto out;
+ err = cn_printf(cn, "%c", *pat_ptr++);
+ } else {
+ switch (*++pat_ptr) {
+ /* single % at the end, drop that */
+ case 0:
+ goto out;
+ /* Double percent, output one percent */
+ case '%':
+ err = cn_printf(cn, "%c", '%');
+ break;
+ /* pid */
+ case 'p':
+ pid_in_pattern = 1;
+ err = cn_printf(cn, "%d",
+ task_tgid_vnr(current));
+ break;
+ /* uid */
+ case 'u':
+ err = cn_printf(cn, "%d", cred->uid);
+ break;
+ /* gid */
+ case 'g':
+ err = cn_printf(cn, "%d", cred->gid);
+ break;
+ /* signal that caused the coredump */
+ case 's':
+ err = cn_printf(cn, "%ld", signr);
+ break;
+ /* UNIX time of coredump */
+ case 't': {
+ struct timeval tv;
+ do_gettimeofday(&tv);
+ err = cn_printf(cn, "%lu", tv.tv_sec);
+ break;
+ }
+ /* hostname */
+ case 'h': {
+ char *namestart = cn->corename + cn->used;
+ down_read(&uts_sem);
+ err = cn_printf(cn, "%s",
+ utsname()->nodename);
+ up_read(&uts_sem);
+ cn_escape(namestart);
+ break;
+ }
+ /* executable */
+ case 'e': {
+ char *commstart = cn->corename + cn->used;
+ err = cn_printf(cn, "%s", current->comm);
+ cn_escape(commstart);
+ break;
+ }
+ case 'E':
+ err = cn_print_exe_file(cn);
+ break;
+ /* core limit size */
+ case 'c':
+ err = cn_printf(cn, "%lu",
+ rlimit(RLIMIT_CORE));
+ break;
+ default:
+ break;
+ }
+ ++pat_ptr;
+ }
+
+ if (err)
+ return err;
+ }
+
+ /* Backward compatibility with core_uses_pid:
+ *
+ * If core_pattern does not include a %p (as is the default)
+ * and core_uses_pid is set, then .%pid will be appended to
+ * the filename. Do not do this for piped commands. */
+ if (!ispipe && !pid_in_pattern && core_uses_pid) {
+ err = cn_printf(cn, ".%d", task_tgid_vnr(current));
+ if (err)
+ return err;
+ }
+out:
+ return ispipe;
+}
+
+static int zap_process(struct task_struct *start, int exit_code)
+{
+ struct task_struct *t;
+ int nr = 0;
+
+ start->signal->flags = SIGNAL_GROUP_EXIT;
+ start->signal->group_exit_code = exit_code;
+ start->signal->group_stop_count = 0;
+
+ t = start;
+ do {
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
+ if (t != current && t->mm) {
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
+ nr++;
+ }
+ } while_each_thread(start, t);
+
+ return nr;
+}
+
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+ struct core_state *core_state, int exit_code)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+ int nr = -EAGAIN;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (!signal_group_exit(tsk->signal)) {
+ mm->core_state = core_state;
+ nr = zap_process(tsk, exit_code);
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (unlikely(nr < 0))
+ return nr;
+
+ if (atomic_read(&mm->mm_users) == nr + 1)
+ goto done;
+ /*
+ * We should find and kill all tasks which use this mm, and we should
+ * count them correctly into ->nr_threads. We don't take tasklist
+ * lock, but this is safe wrt:
+ *
+ * fork:
+ * None of sub-threads can fork after zap_process(leader). All
+ * processes which were created before this point should be
+ * visible to zap_threads() because copy_process() adds the new
+ * process to the tail of init_task.tasks list, and lock/unlock
+ * of ->siglock provides a memory barrier.
+ *
+ * do_exit:
+ * The caller holds mm->mmap_sem. This means that the task which
+ * uses this mm can't pass exit_mm(), so it can't exit or clear
+ * its ->mm.
+ *
+ * de_thread:
+ * It does list_replace_rcu(&leader->tasks, &current->tasks),
+ * we must see either old or new leader, this does not matter.
+ * However, it can change p->sighand, so lock_task_sighand(p)
+ * must be used. Since p->mm != NULL and we hold ->mmap_sem
+ * it can't fail.
+ *
+ * Note also that "g" can be the old leader with ->mm == NULL
+ * and already unhashed and thus removed from ->thread_group.
+ * This is OK, __unhash_process()->list_del_rcu() does not
+ * clear the ->next pointer, we will find the new leader via
+ * next_thread().
+ */
+ rcu_read_lock();
+ for_each_process(g) {
+ if (g == tsk->group_leader)
+ continue;
+ if (g->flags & PF_KTHREAD)
+ continue;
+ p = g;
+ do {
+ if (p->mm) {
+ if (unlikely(p->mm == mm)) {
+ lock_task_sighand(p, &flags);
+ nr += zap_process(p, exit_code);
+ unlock_task_sighand(p, &flags);
+ }
+ break;
+ }
+ } while_each_thread(g, p);
+ }
+ rcu_read_unlock();
+done:
+ atomic_set(&core_state->nr_threads, nr);
+ return nr;
+}
+
+static int coredump_wait(int exit_code, struct core_state *core_state)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *mm = tsk->mm;
+ int core_waiters = -EBUSY;
+
+ init_completion(&core_state->startup);
+ core_state->dumper.task = tsk;
+ core_state->dumper.next = NULL;
+
+ down_write(&mm->mmap_sem);
+ if (!mm->core_state)
+ core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+ up_write(&mm->mmap_sem);
+
+ if (core_waiters > 0) {
+ struct core_thread *ptr;
+
+ wait_for_completion(&core_state->startup);
+ /*
+ * Wait for all the threads to become inactive, so that
+ * all the thread context (extended register state, like
+ * fpu etc) gets copied to the memory.
+ */
+ ptr = core_state->dumper.next;
+ while (ptr != NULL) {
+ wait_task_inactive(ptr->task, 0);
+ ptr = ptr->next;
+ }
+ }
+
+ return core_waiters;
+}
+
+static void coredump_finish(struct mm_struct *mm)
+{
+ struct core_thread *curr, *next;
+ struct task_struct *task;
+
+ next = mm->core_state->dumper.next;
+ while ((curr = next) != NULL) {
+ next = curr->next;
+ task = curr->task;
+ /*
+ * see exit_mm(), curr->task must not see
+ * ->task == NULL before we read ->next.
+ */
+ smp_mb();
+ curr->task = NULL;
+ wake_up_process(task);
+ }
+
+ mm->core_state = NULL;
+}
+
+static void wait_for_dump_helpers(struct file *file)
+{
+ struct pipe_inode_info *pipe;
+
+ pipe = file->f_path.dentry->d_inode->i_pipe;
+
+ pipe_lock(pipe);
+ pipe->readers++;
+ pipe->writers--;
+
+ while ((pipe->readers > 1) && (!signal_pending(current))) {
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ pipe_wait(pipe);
+ }
+
+ pipe->readers--;
+ pipe->writers++;
+ pipe_unlock(pipe);
+
+}
+
+
+/*
+ * umh_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace. Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process. Returns 0 on success, or
+ * PTR_ERR on failure.
+ * Note that it also sets the core limit to 1. This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+ struct file *files[2];
+ struct fdtable *fdt;
+ struct coredump_params *cp = (struct coredump_params *)info->data;
+ struct files_struct *cf = current->files;
+ int err = create_pipe_files(files, 0);
+ if (err)
+ return err;
+
+ cp->file = files[1];
+
+ sys_close(0);
+ fd_install(0, files[0]);
+ spin_lock(&cf->file_lock);
+ fdt = files_fdtable(cf);
+ __set_open_fd(0, fdt);
+ __clear_close_on_exec(0, fdt);
+ spin_unlock(&cf->file_lock);
+
+ /* and disallow core files too */
+ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+ return 0;
+}
+
+void do_coredump(long signr, int exit_code, struct pt_regs *regs)
+{
+ struct core_state core_state;
+ struct core_name cn;
+ struct mm_struct *mm = current->mm;
+ struct linux_binfmt * binfmt;
+ const struct cred *old_cred;
+ struct cred *cred;
+ int retval = 0;
+ int flag = 0;
+ int ispipe;
+ bool need_nonrelative = false;
+ static atomic_t core_dump_count = ATOMIC_INIT(0);
+ struct coredump_params cprm = {
+ .signr = signr,
+ .regs = regs,
+ .limit = rlimit(RLIMIT_CORE),
+ /*
+ * We must use the same mm->flags while dumping core to avoid
+ * inconsistency of bit flags, since this flag is not protected
+ * by any locks.
+ */
+ .mm_flags = mm->flags,
+ };
+
+ audit_core_dumps(signr);
+
+ binfmt = mm->binfmt;
+ if (!binfmt || !binfmt->core_dump)
+ goto fail;
+ if (!__get_dumpable(cprm.mm_flags))
+ goto fail;
+
+ cred = prepare_creds();
+ if (!cred)
+ goto fail;
+ /*
+ * We cannot trust fsuid as being the "true" uid of the process
+ * nor do we know its entire history. We only know it was tainted
+ * so we dump it as root in mode 2, and only into a controlled
+ * environment (pipe handler or fully qualified path).
+ */
+ if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
+ /* Setuid core dump mode */
+ flag = O_EXCL; /* Stop rewrite attacks */
+ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
+ need_nonrelative = true;
+ }
+
+ retval = coredump_wait(exit_code, &core_state);
+ if (retval < 0)
+ goto fail_creds;
+
+ old_cred = override_creds(cred);
+
+ /*
+ * Clear any false indication of pending signals that might
+ * be seen by the filesystem code called to write the core file.
+ */
+ clear_thread_flag(TIF_SIGPENDING);
+
+ ispipe = format_corename(&cn, signr);
+
+ if (ispipe) {
+ int dump_count;
+ char **helper_argv;
+
+ if (ispipe < 0) {
+ printk(KERN_WARNING "format_corename failed\n");
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_corename;
+ }
+
+ if (cprm.limit == 1) {
+ /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
+ *
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+ * cprm.limit of 1 here as a speacial value, this is a
+ * consistent way to catch recursive crashes.
+ * We can still crash if the core_pattern binary sets
+ * RLIM_CORE = !1, but it runs as root, and can do
+ * lots of stupid things.
+ *
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ printk(KERN_WARNING
+ "Process %d(%s) has RLIMIT_CORE set to 1\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_unlock;
+ }
+ cprm.limit = RLIM_INFINITY;
+
+ dump_count = atomic_inc_return(&core_dump_count);
+ if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+ printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Skipping core dump\n");
+ goto fail_dropcount;
+ }
+
+ helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+ if (!helper_argv) {
+ printk(KERN_WARNING "%s failed to allocate memory\n",
+ __func__);
+ goto fail_dropcount;
+ }
+
+ retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+ NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+ NULL, &cprm);
+ argv_free(helper_argv);
+ if (retval) {
+ printk(KERN_INFO "Core dump to %s pipe failed\n",
+ cn.corename);
+ goto close_fail;
+ }
+ } else {
+ struct inode *inode;
+
+ if (cprm.limit < binfmt->min_coredump)
+ goto fail_unlock;
+
+ if (need_nonrelative && cn.corename[0] != '/') {
+ printk(KERN_WARNING "Pid %d(%s) can only dump core "\
+ "to fully qualified path!\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Skipping core dump\n");
+ goto fail_unlock;
+ }
+
+ cprm.file = filp_open(cn.corename,
+ O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+ 0600);
+ if (IS_ERR(cprm.file))
+ goto fail_unlock;
+
+ inode = cprm.file->f_path.dentry->d_inode;
+ if (inode->i_nlink > 1)
+ goto close_fail;
+ if (d_unhashed(cprm.file->f_path.dentry))
+ goto close_fail;
+ /*
+ * AK: actually i see no reason to not allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ goto close_fail;
+ /*
+ * Dont allow local users get cute and trick others to coredump
+ * into their pre-created files.
+ */
+ if (!uid_eq(inode->i_uid, current_fsuid()))
+ goto close_fail;
+ if (!cprm.file->f_op || !cprm.file->f_op->write)
+ goto close_fail;
+ if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ goto close_fail;
+ }
+
+ retval = binfmt->core_dump(&cprm);
+ if (retval)
+ current->signal->group_exit_code |= 0x80;
+
+ if (ispipe && core_pipe_limit)
+ wait_for_dump_helpers(cprm.file);
+close_fail:
+ if (cprm.file)
+ filp_close(cprm.file, NULL);
+fail_dropcount:
+ if (ispipe)
+ atomic_dec(&core_dump_count);
+fail_unlock:
+ kfree(cn.corename);
+fail_corename:
+ coredump_finish(mm);
+ revert_creds(old_cred);
+fail_creds:
+ put_cred(cred);
+fail:
+ return;
+}
+
+/*
+ * Core dumping helper functions. These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_write(struct file *file, const void *addr, int nr)
+{
+ return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+EXPORT_SYMBOL(dump_write);
+
+int dump_seek(struct file *file, loff_t off)
+{
+ int ret = 1;
+
+ if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+ if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+ return 0;
+ } else {
+ char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+
+ if (!buf)
+ return 0;
+ while (off > 0) {
+ unsigned long n = off;
+
+ if (n > PAGE_SIZE)
+ n = PAGE_SIZE;
+ if (!dump_write(file, buf, n)) {
+ ret = 0;
+ break;
+ }
+ off -= n;
+ }
+ free_page((unsigned long)buf);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(dump_seek);
diff --git a/fs/exec.c b/fs/exec.c
index 574cf4d..b604050 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -66,19 +66,8 @@

#include <trace/events/sched.h>

-int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
-unsigned int core_pipe_limit;
int suid_dumpable = 0;

-struct core_name {
- char *corename;
- int used, size;
-};
-static atomic_t call_count = ATOMIC_INIT(1);
-
-/* The maximal length of core_pattern is also specified in sysctl.c */
-
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);

@@ -1632,353 +1621,6 @@ void set_binfmt(struct linux_binfmt *new)

EXPORT_SYMBOL(set_binfmt);

-static int expand_corename(struct core_name *cn)
-{
- char *old_corename = cn->corename;
-
- cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
- cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
-
- if (!cn->corename) {
- kfree(old_corename);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static int cn_printf(struct core_name *cn, const char *fmt, ...)
-{
- char *cur;
- int need;
- int ret;
- va_list arg;
-
- va_start(arg, fmt);
- need = vsnprintf(NULL, 0, fmt, arg);
- va_end(arg);
-
- if (likely(need < cn->size - cn->used - 1))
- goto out_printf;
-
- ret = expand_corename(cn);
- if (ret)
- goto expand_fail;
-
-out_printf:
- cur = cn->corename + cn->used;
- va_start(arg, fmt);
- vsnprintf(cur, need + 1, fmt, arg);
- va_end(arg);
- cn->used += need;
- return 0;
-
-expand_fail:
- return ret;
-}
-
-static void cn_escape(char *str)
-{
- for (; *str; str++)
- if (*str == '/')
- *str = '!';
-}
-
-static int cn_print_exe_file(struct core_name *cn)
-{
- struct file *exe_file;
- char *pathbuf, *path;
- int ret;
-
- exe_file = get_mm_exe_file(current->mm);
- if (!exe_file) {
- char *commstart = cn->corename + cn->used;
- ret = cn_printf(cn, "%s (path unknown)", current->comm);
- cn_escape(commstart);
- return ret;
- }
-
- pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
- if (!pathbuf) {
- ret = -ENOMEM;
- goto put_exe_file;
- }
-
- path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
- if (IS_ERR(path)) {
- ret = PTR_ERR(path);
- goto free_buf;
- }
-
- cn_escape(path);
-
- ret = cn_printf(cn, "%s", path);
-
-free_buf:
- kfree(pathbuf);
-put_exe_file:
- fput(exe_file);
- return ret;
-}
-
-/* format_corename will inspect the pattern parameter, and output a
- * name into corename, which must have space for at least
- * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
- */
-static int format_corename(struct core_name *cn, long signr)
-{
- const struct cred *cred = current_cred();
- const char *pat_ptr = core_pattern;
- int ispipe = (*pat_ptr == '|');
- int pid_in_pattern = 0;
- int err = 0;
-
- cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
- cn->corename = kmalloc(cn->size, GFP_KERNEL);
- cn->used = 0;
-
- if (!cn->corename)
- return -ENOMEM;
-
- /* Repeat as long as we have more pattern to process and more output
- space */
- while (*pat_ptr) {
- if (*pat_ptr != '%') {
- if (*pat_ptr == 0)
- goto out;
- err = cn_printf(cn, "%c", *pat_ptr++);
- } else {
- switch (*++pat_ptr) {
- /* single % at the end, drop that */
- case 0:
- goto out;
- /* Double percent, output one percent */
- case '%':
- err = cn_printf(cn, "%c", '%');
- break;
- /* pid */
- case 'p':
- pid_in_pattern = 1;
- err = cn_printf(cn, "%d",
- task_tgid_vnr(current));
- break;
- /* uid */
- case 'u':
- err = cn_printf(cn, "%d", cred->uid);
- break;
- /* gid */
- case 'g':
- err = cn_printf(cn, "%d", cred->gid);
- break;
- /* signal that caused the coredump */
- case 's':
- err = cn_printf(cn, "%ld", signr);
- break;
- /* UNIX time of coredump */
- case 't': {
- struct timeval tv;
- do_gettimeofday(&tv);
- err = cn_printf(cn, "%lu", tv.tv_sec);
- break;
- }
- /* hostname */
- case 'h': {
- char *namestart = cn->corename + cn->used;
- down_read(&uts_sem);
- err = cn_printf(cn, "%s",
- utsname()->nodename);
- up_read(&uts_sem);
- cn_escape(namestart);
- break;
- }
- /* executable */
- case 'e': {
- char *commstart = cn->corename + cn->used;
- err = cn_printf(cn, "%s", current->comm);
- cn_escape(commstart);
- break;
- }
- case 'E':
- err = cn_print_exe_file(cn);
- break;
- /* core limit size */
- case 'c':
- err = cn_printf(cn, "%lu",
- rlimit(RLIMIT_CORE));
- break;
- default:
- break;
- }
- ++pat_ptr;
- }
-
- if (err)
- return err;
- }
-
- /* Backward compatibility with core_uses_pid:
- *
- * If core_pattern does not include a %p (as is the default)
- * and core_uses_pid is set, then .%pid will be appended to
- * the filename. Do not do this for piped commands. */
- if (!ispipe && !pid_in_pattern && core_uses_pid) {
- err = cn_printf(cn, ".%d", task_tgid_vnr(current));
- if (err)
- return err;
- }
-out:
- return ispipe;
-}
-
-static int zap_process(struct task_struct *start, int exit_code)
-{
- struct task_struct *t;
- int nr = 0;
-
- start->signal->flags = SIGNAL_GROUP_EXIT;
- start->signal->group_exit_code = exit_code;
- start->signal->group_stop_count = 0;
-
- t = start;
- do {
- task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- if (t != current && t->mm) {
- sigaddset(&t->pending.signal, SIGKILL);
- signal_wake_up(t, 1);
- nr++;
- }
- } while_each_thread(start, t);
-
- return nr;
-}
-
-static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
- struct core_state *core_state, int exit_code)
-{
- struct task_struct *g, *p;
- unsigned long flags;
- int nr = -EAGAIN;
-
- spin_lock_irq(&tsk->sighand->siglock);
- if (!signal_group_exit(tsk->signal)) {
- mm->core_state = core_state;
- nr = zap_process(tsk, exit_code);
- }
- spin_unlock_irq(&tsk->sighand->siglock);
- if (unlikely(nr < 0))
- return nr;
-
- if (atomic_read(&mm->mm_users) == nr + 1)
- goto done;
- /*
- * We should find and kill all tasks which use this mm, and we should
- * count them correctly into ->nr_threads. We don't take tasklist
- * lock, but this is safe wrt:
- *
- * fork:
- * None of sub-threads can fork after zap_process(leader). All
- * processes which were created before this point should be
- * visible to zap_threads() because copy_process() adds the new
- * process to the tail of init_task.tasks list, and lock/unlock
- * of ->siglock provides a memory barrier.
- *
- * do_exit:
- * The caller holds mm->mmap_sem. This means that the task which
- * uses this mm can't pass exit_mm(), so it can't exit or clear
- * its ->mm.
- *
- * de_thread:
- * It does list_replace_rcu(&leader->tasks, &current->tasks),
- * we must see either old or new leader, this does not matter.
- * However, it can change p->sighand, so lock_task_sighand(p)
- * must be used. Since p->mm != NULL and we hold ->mmap_sem
- * it can't fail.
- *
- * Note also that "g" can be the old leader with ->mm == NULL
- * and already unhashed and thus removed from ->thread_group.
- * This is OK, __unhash_process()->list_del_rcu() does not
- * clear the ->next pointer, we will find the new leader via
- * next_thread().
- */
- rcu_read_lock();
- for_each_process(g) {
- if (g == tsk->group_leader)
- continue;
- if (g->flags & PF_KTHREAD)
- continue;
- p = g;
- do {
- if (p->mm) {
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code);
- unlock_task_sighand(p, &flags);
- }
- break;
- }
- } while_each_thread(g, p);
- }
- rcu_read_unlock();
-done:
- atomic_set(&core_state->nr_threads, nr);
- return nr;
-}
-
-static int coredump_wait(int exit_code, struct core_state *core_state)
-{
- struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
- int core_waiters = -EBUSY;
-
- init_completion(&core_state->startup);
- core_state->dumper.task = tsk;
- core_state->dumper.next = NULL;
-
- down_write(&mm->mmap_sem);
- if (!mm->core_state)
- core_waiters = zap_threads(tsk, mm, core_state, exit_code);
- up_write(&mm->mmap_sem);
-
- if (core_waiters > 0) {
- struct core_thread *ptr;
-
- wait_for_completion(&core_state->startup);
- /*
- * Wait for all the threads to become inactive, so that
- * all the thread context (extended register state, like
- * fpu etc) gets copied to the memory.
- */
- ptr = core_state->dumper.next;
- while (ptr != NULL) {
- wait_task_inactive(ptr->task, 0);
- ptr = ptr->next;
- }
- }
-
- return core_waiters;
-}
-
-static void coredump_finish(struct mm_struct *mm)
-{
- struct core_thread *curr, *next;
- struct task_struct *task;
-
- next = mm->core_state->dumper.next;
- while ((curr = next) != NULL) {
- next = curr->next;
- task = curr->task;
- /*
- * see exit_mm(), curr->task must not see
- * ->task == NULL before we read ->next.
- */
- smp_mb();
- curr->task = NULL;
- wake_up_process(task);
- }
-
- mm->core_state = NULL;
-}
-
/*
* set_dumpable converts traditional three-value dumpable to two flags and
* stores them into mm->flags. It modifies lower two bits of mm->flags, but
@@ -2020,7 +1662,7 @@ void set_dumpable(struct mm_struct *mm, int value)
}
}

-static int __get_dumpable(unsigned long mm_flags)
+int __get_dumpable(unsigned long mm_flags)
{
int ret;

@@ -2032,290 +1674,3 @@ int get_dumpable(struct mm_struct *mm)
{
return __get_dumpable(mm->flags);
}
-
-static void wait_for_dump_helpers(struct file *file)
-{
- struct pipe_inode_info *pipe;
-
- pipe = file->f_path.dentry->d_inode->i_pipe;
-
- pipe_lock(pipe);
- pipe->readers++;
- pipe->writers--;
-
- while ((pipe->readers > 1) && (!signal_pending(current))) {
- wake_up_interruptible_sync(&pipe->wait);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- pipe_wait(pipe);
- }
-
- pipe->readers--;
- pipe->writers++;
- pipe_unlock(pipe);
-
-}
-
-
-/*
- * umh_pipe_setup
- * helper function to customize the process used
- * to collect the core in userspace. Specifically
- * it sets up a pipe and installs it as fd 0 (stdin)
- * for the process. Returns 0 on success, or
- * PTR_ERR on failure.
- * Note that it also sets the core limit to 1. This
- * is a special value that we use to trap recursive
- * core dumps
- */
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
-{
- struct file *files[2];
- struct fdtable *fdt;
- struct coredump_params *cp = (struct coredump_params *)info->data;
- struct files_struct *cf = current->files;
- int err = create_pipe_files(files, 0);
- if (err)
- return err;
-
- cp->file = files[1];
-
- sys_close(0);
- fd_install(0, files[0]);
- spin_lock(&cf->file_lock);
- fdt = files_fdtable(cf);
- __set_open_fd(0, fdt);
- __clear_close_on_exec(0, fdt);
- spin_unlock(&cf->file_lock);
-
- /* and disallow core files too */
- current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
-
- return 0;
-}
-
-void do_coredump(long signr, int exit_code, struct pt_regs *regs)
-{
- struct core_state core_state;
- struct core_name cn;
- struct mm_struct *mm = current->mm;
- struct linux_binfmt * binfmt;
- const struct cred *old_cred;
- struct cred *cred;
- int retval = 0;
- int flag = 0;
- int ispipe;
- bool need_nonrelative = false;
- static atomic_t core_dump_count = ATOMIC_INIT(0);
- struct coredump_params cprm = {
- .signr = signr,
- .regs = regs,
- .limit = rlimit(RLIMIT_CORE),
- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency of bit flags, since this flag is not protected
- * by any locks.
- */
- .mm_flags = mm->flags,
- };
-
- audit_core_dumps(signr);
-
- binfmt = mm->binfmt;
- if (!binfmt || !binfmt->core_dump)
- goto fail;
- if (!__get_dumpable(cprm.mm_flags))
- goto fail;
-
- cred = prepare_creds();
- if (!cred)
- goto fail;
- /*
- * We cannot trust fsuid as being the "true" uid of the process
- * nor do we know its entire history. We only know it was tainted
- * so we dump it as root in mode 2, and only into a controlled
- * environment (pipe handler or fully qualified path).
- */
- if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
- /* Setuid core dump mode */
- flag = O_EXCL; /* Stop rewrite attacks */
- cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
- need_nonrelative = true;
- }
-
- retval = coredump_wait(exit_code, &core_state);
- if (retval < 0)
- goto fail_creds;
-
- old_cred = override_creds(cred);
-
- /*
- * Clear any false indication of pending signals that might
- * be seen by the filesystem code called to write the core file.
- */
- clear_thread_flag(TIF_SIGPENDING);
-
- ispipe = format_corename(&cn, signr);
-
- if (ispipe) {
- int dump_count;
- char **helper_argv;
-
- if (ispipe < 0) {
- printk(KERN_WARNING "format_corename failed\n");
- printk(KERN_WARNING "Aborting core\n");
- goto fail_corename;
- }
-
- if (cprm.limit == 1) {
- /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
- *
- * Normally core limits are irrelevant to pipes, since
- * we're not writing to the file system, but we use
- * cprm.limit of 1 here as a speacial value, this is a
- * consistent way to catch recursive crashes.
- * We can still crash if the core_pattern binary sets
- * RLIM_CORE = !1, but it runs as root, and can do
- * lots of stupid things.
- *
- * Note that we use task_tgid_vnr here to grab the pid
- * of the process group leader. That way we get the
- * right pid if a thread in a multi-threaded
- * core_pattern process dies.
- */
- printk(KERN_WARNING
- "Process %d(%s) has RLIMIT_CORE set to 1\n",
- task_tgid_vnr(current), current->comm);
- printk(KERN_WARNING "Aborting core\n");
- goto fail_unlock;
- }
- cprm.limit = RLIM_INFINITY;
-
- dump_count = atomic_inc_return(&core_dump_count);
- if (core_pipe_limit && (core_pipe_limit < dump_count)) {
- printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
- task_tgid_vnr(current), current->comm);
- printk(KERN_WARNING "Skipping core dump\n");
- goto fail_dropcount;
- }
-
- helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
- if (!helper_argv) {
- printk(KERN_WARNING "%s failed to allocate memory\n",
- __func__);
- goto fail_dropcount;
- }
-
- retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
- NULL, UMH_WAIT_EXEC, umh_pipe_setup,
- NULL, &cprm);
- argv_free(helper_argv);
- if (retval) {
- printk(KERN_INFO "Core dump to %s pipe failed\n",
- cn.corename);
- goto close_fail;
- }
- } else {
- struct inode *inode;
-
- if (cprm.limit < binfmt->min_coredump)
- goto fail_unlock;
-
- if (need_nonrelative && cn.corename[0] != '/') {
- printk(KERN_WARNING "Pid %d(%s) can only dump core "\
- "to fully qualified path!\n",
- task_tgid_vnr(current), current->comm);
- printk(KERN_WARNING "Skipping core dump\n");
- goto fail_unlock;
- }
-
- cprm.file = filp_open(cn.corename,
- O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
- 0600);
- if (IS_ERR(cprm.file))
- goto fail_unlock;
-
- inode = cprm.file->f_path.dentry->d_inode;
- if (inode->i_nlink > 1)
- goto close_fail;
- if (d_unhashed(cprm.file->f_path.dentry))
- goto close_fail;
- /*
- * AK: actually i see no reason to not allow this for named
- * pipes etc, but keep the previous behaviour for now.
- */
- if (!S_ISREG(inode->i_mode))
- goto close_fail;
- /*
- * Dont allow local users get cute and trick others to coredump
- * into their pre-created files.
- */
- if (!uid_eq(inode->i_uid, current_fsuid()))
- goto close_fail;
- if (!cprm.file->f_op || !cprm.file->f_op->write)
- goto close_fail;
- if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
- goto close_fail;
- }
-
- retval = binfmt->core_dump(&cprm);
- if (retval)
- current->signal->group_exit_code |= 0x80;
-
- if (ispipe && core_pipe_limit)
- wait_for_dump_helpers(cprm.file);
-close_fail:
- if (cprm.file)
- filp_close(cprm.file, NULL);
-fail_dropcount:
- if (ispipe)
- atomic_dec(&core_dump_count);
-fail_unlock:
- kfree(cn.corename);
-fail_corename:
- coredump_finish(mm);
- revert_creds(old_cred);
-fail_creds:
- put_cred(cred);
-fail:
- return;
-}
-
-/*
- * Core dumping helper functions. These are the only things you should
- * do on a core-file: use only these functions to write out all the
- * necessary info.
- */
-int dump_write(struct file *file, const void *addr, int nr)
-{
- return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-EXPORT_SYMBOL(dump_write);
-
-int dump_seek(struct file *file, loff_t off)
-{
- int ret = 1;
-
- if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
- if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
- return 0;
- } else {
- char *buf = (char *)get_zeroed_page(GFP_KERNEL);
-
- if (!buf)
- return 0;
- while (off > 0) {
- unsigned long n = off;
-
- if (n > PAGE_SIZE)
- n = PAGE_SIZE;
- if (!dump_write(file, buf, n)) {
- ret = 0;
- break;
- }
- off -= n;
- }
- free_page((unsigned long)buf);
- }
- return ret;
-}
-EXPORT_SYMBOL(dump_seek);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c147e70..7bb5047 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -413,6 +413,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}

extern void set_dumpable(struct mm_struct *mm, int value);
extern int get_dumpable(struct mm_struct *mm);
+extern int __get_dumpable(unsigned long mm_flags);

/* get/set_dumpable() values */
#define SUID_DUMPABLE_DISABLED 0
--
1.7.11.2

2012-08-10 08:30:49

by Alex Kelly

[permalink] [raw]

Subject: [PATCHv4 3/3] fs: Update coredump-related headers

This patch creates a new header file, fs/coredump.h, which declares
functions used only by the new coredump.c. It also moves the declaration
of do_coredump to the include/linux/coredump.h header file, for consistency.

Signed-off-by: Alex Kelly <[email protected]>
Reviewed-by: Josh Triplett <[email protected]>
---
fs/coredump.c | 2 ++
fs/coredump.h | 6 ++++++
fs/exec.c | 1 +
include/linux/binfmts.h | 5 -----
include/linux/coredump.h | 5 +++++
include/linux/sched.h | 1 -
kernel/signal.c | 1 +
7 files changed, 15 insertions(+), 6 deletions(-)
create mode 100644 fs/coredump.h

diff --git a/fs/coredump.c b/fs/coredump.c
index 9692329..1935b4d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -14,6 +14,7 @@
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
+#include <linux/coredump.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
@@ -39,6 +40,7 @@

#include <trace/events/task.h>
#include "internal.h"
+#include "coredump.h"

#include <trace/events/sched.h>

diff --git a/fs/coredump.h b/fs/coredump.h
new file mode 100644
index 0000000..e39ff07
--- /dev/null
+++ b/fs/coredump.h
@@ -0,0 +1,6 @@
+#ifndef _FS_COREDUMP_H
+#define _FS_COREDUMP_H
+
+extern int __get_dumpable(unsigned long mm_flags);
+
+#endif
diff --git a/fs/exec.c b/fs/exec.c
index b604050..a0ad3a2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -63,6 +63,7 @@

#include <trace/events/task.h>
#include "internal.h"
+#include "coredump.h"

#include <trace/events/sched.h>

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 00e2e89..c7b16ee 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -132,11 +132,6 @@ extern int copy_strings_kernel(int argc, const char *const *argv,
struct linux_binprm *bprm);
extern int prepare_bprm_creds(struct linux_binprm *bprm);
extern void install_exec_creds(struct linux_binprm *bprm);
-#ifdef CONFIG_COREDUMP
-extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
-#else
-static inline void do_coredump(long signr, int exit_code, struct pt_regs *regs) {}
-#endif
extern void set_binfmt(struct linux_binfmt *new);
extern void free_bprm(struct linux_binprm *);

diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index ba4b85a..42f9752 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -11,5 +11,10 @@
*/
extern int dump_write(struct file *file, const void *addr, int nr);
extern int dump_seek(struct file *file, loff_t off);
+#ifdef CONFIG_COREDUMP
+extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
+#else
+static inline void do_coredump(long signr, int exit_code, struct pt_regs *regs) {}
+#endif

#endif /* _LINUX_COREDUMP_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7bb5047..c147e70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -413,7 +413,6 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}

extern void set_dumpable(struct mm_struct *mm, int value);
extern int get_dumpable(struct mm_struct *mm);
-extern int __get_dumpable(unsigned long mm_flags);

/* get/set_dumpable() values */
#define SUID_DUMPABLE_DISABLED 0
diff --git a/kernel/signal.c b/kernel/signal.c
index be4f856..fb4fd72 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
+#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
--
1.7.11.2

2012-08-10 08:30:43

by Alex Kelly

[permalink] [raw]

Subject: [PATCHv4 2/3] fs: Make core dump functionality optional

Adds an expert Kconfig option, CONFIG_COREDUMP, which allows disabling of core dump.
This saves approximately 2.6k in the compiled kernel, and complements CONFIG_ELF_CORE,
which now depends on it.

Disabling CONFIG_COREDUMP also removes the coredump-related sysctls, except for
suid_dumpable and related functions, which are necessary for ptrace.

Signed-off-by: Alex Kelly <[email protected]>
Reviewed-by: Josh Triplett <[email protected]>
---
fs/Kconfig.binfmt | 8 ++++++++
fs/Makefile | 3 ++-
include/linux/binfmts.h | 4 ++++
init/Kconfig | 1 +
kernel/sysctl.c | 6 +++++-
5 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 0225742..0efd152 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -164,3 +164,11 @@ config BINFMT_MISC
You may say M here for module support and later load the module when
you have use for it; the module is called binfmt_misc. If you
don't know what to answer at this point, say Y.
+
+config COREDUMP
+ bool "Enable core dump support" if EXPERT
+ default y
+ help
+ This option enables support for performing core dumps. You almost
+ certainly want to say Y here. Not necessary on systems that never
+ need debugging or only ever run flawless code.
diff --git a/fs/Makefile b/fs/Makefile
index 8938f82..1d7af79 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o coredump.o
+ stack.o fs_struct.o statfs.o

ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
+obj-$(CONFIG_COREDUMP) += coredump.o

obj-$(CONFIG_FHANDLE) += fhandle.o

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 366422b..00e2e89 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -132,7 +132,11 @@ extern int copy_strings_kernel(int argc, const char *const *argv,
struct linux_binprm *bprm);
extern int prepare_bprm_creds(struct linux_binprm *bprm);
extern void install_exec_creds(struct linux_binprm *bprm);
+#ifdef CONFIG_COREDUMP
extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
+#else
+static inline void do_coredump(long signr, int exit_code, struct pt_regs *regs) {}
+#endif
extern void set_binfmt(struct linux_binfmt *new);
extern void free_bprm(struct linux_binprm *);

diff --git a/init/Kconfig b/init/Kconfig
index af6c7f8..0e75056 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1230,6 +1230,7 @@ config BUG
Just say Y.

config ELF_CORE
+ depends on COREDUMP
default y
bool "Enable ELF core dumps" if EXPERT
help
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87174ef..af57e84 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,10 +97,12 @@
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern int max_threads;
-extern int core_uses_pid;
extern int suid_dumpable;
+#ifdef CONFIG_COREDUMP
+extern int core_uses_pid;
extern char core_pattern[];
extern unsigned int core_pipe_limit;
+#endif
extern int pid_max;
extern int min_free_kbytes;
extern int pid_max_min, pid_max_max;
@@ -404,6 +406,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_COREDUMP
{
.procname = "core_uses_pid",
.data = &core_uses_pid,
@@ -425,6 +428,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
--
1.7.11.2

2012-08-10 13:23:41

by Serge Hallyn

[permalink] [raw]

Subject: Re: [PATCHv4 2/3] fs: Make core dump functionality optional

Quoting Alex Kelly ([email protected]):
> Adds an expert Kconfig option, CONFIG_COREDUMP, which allows disabling of core dump.
> This saves approximately 2.6k in the compiled kernel, and complements CONFIG_ELF_CORE,
> which now depends on it.

Is there another reason than the 2.6k to do this? My kernels range
between 4.8 and 5M, so that's .05% size savings?

I'm not saying nack (it's not my area anyway), just seem to be missing
the point.

> CONFIG_COREDUMP also disables coredump-related sysctls, except for suid_dumpable and
> related functions, which are necessary for ptrace.
>
> Signed-off-by: Alex Kelly <[email protected]>
> Reviewed-by: Josh Triplett <[email protected]>
> ---
> fs/Kconfig.binfmt | 8 ++++++++
> fs/Makefile | 3 ++-
> include/linux/binfmts.h | 4 ++++
> init/Kconfig | 1 +
> kernel/sysctl.c | 6 +++++-
> 5 files changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
> index 0225742..0efd152 100644
> --- a/fs/Kconfig.binfmt
> +++ b/fs/Kconfig.binfmt
> @@ -164,3 +164,11 @@ config BINFMT_MISC
> You may say M here for module support and later load the module when
> you have use for it; the module is called binfmt_misc. If you
> don't know what to answer at this point, say Y.
> +
> +config COREDUMP
> + bool "Enable core dump support" if EXPERT
> + default y
> + help
> + This option enables support for performing core dumps. You almost
> + certainly want to say Y here. Not necessary on systems that never
> + need debugging or only ever run flawless code.
> diff --git a/fs/Makefile b/fs/Makefile
> index 8938f82..1d7af79 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
> attr.o bad_inode.o file.o filesystems.o namespace.o \
> seq_file.o xattr.o libfs.o fs-writeback.o \
> pnode.o drop_caches.o splice.o sync.o utimes.o \
> - stack.o fs_struct.o statfs.o coredump.o
> + stack.o fs_struct.o statfs.o
>
> ifeq ($(CONFIG_BLOCK),y)
> obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
> @@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o
> obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
> obj-$(CONFIG_NFS_COMMON) += nfs_common/
> obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
> +obj-$(CONFIG_COREDUMP) += coredump.o
>
> obj-$(CONFIG_FHANDLE) += fhandle.o
>
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 366422b..00e2e89 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -132,7 +132,11 @@ extern int copy_strings_kernel(int argc, const char *const *argv,
> struct linux_binprm *bprm);
> extern int prepare_bprm_creds(struct linux_binprm *bprm);
> extern void install_exec_creds(struct linux_binprm *bprm);
> +#ifdef CONFIG_COREDUMP
> extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
> +#else
> +static inline void do_coredump(long signr, int exit_code, struct pt_regs *regs) {}
> +#endif
> extern void set_binfmt(struct linux_binfmt *new);
> extern void free_bprm(struct linux_binprm *);
>
> diff --git a/init/Kconfig b/init/Kconfig
> index af6c7f8..0e75056 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1230,6 +1230,7 @@ config BUG
> Just say Y.
>
> config ELF_CORE
> + depends on COREDUMP
> default y
> bool "Enable ELF core dumps" if EXPERT
> help
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 87174ef..af57e84 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -97,10 +97,12 @@
> extern int sysctl_overcommit_memory;
> extern int sysctl_overcommit_ratio;
> extern int max_threads;
> -extern int core_uses_pid;
> extern int suid_dumpable;
> +#ifdef CONFIG_COREDUMP
> +extern int core_uses_pid;
> extern char core_pattern[];
> extern unsigned int core_pipe_limit;
> +#endif
> extern int pid_max;
> extern int min_free_kbytes;
> extern int pid_max_min, pid_max_max;
> @@ -404,6 +406,7 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec,
> },
> +#ifdef CONFIG_COREDUMP
> {
> .procname = "core_uses_pid",
> .data = &core_uses_pid,
> @@ -425,6 +428,7 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec,
> },
> +#endif
> #ifdef CONFIG_PROC_SYSCTL
> {
> .procname = "tainted",
> --
> 1.7.11.2
>

2012-08-10 15:02:09

by Josh Triplett

[permalink] [raw]

Subject: Re: [PATCHv4 2/3] fs: Make core dump functionality optional

On Fri, Aug 10, 2012 at 08:23:23AM -0500, Serge Hallyn wrote:
> Quoting Alex Kelly ([email protected]):
> > Adds an expert Kconfig option, CONFIG_COREDUMP, which allows disabling of core dump.
> > This saves approximately 2.6k in the compiled kernel, and complements CONFIG_ELF_CORE,
> > which now depends on it.
>
> Is there another reason than the 2.6k to do this? My kernels range
> between 4.8 and 5M, so that's .05% size savings?

A kitchen-sink kernel might take up that much space, but you can build a
minimal embedded kernel that only takes up ~200k, at which point 2.6k
represents a >1% decrease. Add a few more changes like this, and those
decreases start to add up. At this point, no one thing you can chop out
of the kernel will give you a 100k decrease by itself; you need a pile
of changes like this one to do that.

- Josh Triplett

2012-08-10 15:26:22

by Serge Hallyn

[permalink] [raw]

Subject: Re: [PATCHv4 2/3] fs: Make core dump functionality optional

Quoting Josh Triplett ([email protected]):
> On Fri, Aug 10, 2012 at 08:23:23AM -0500, Serge Hallyn wrote:
> > Quoting Alex Kelly ([email protected]):
> > > Adds an expert Kconfig option, CONFIG_COREDUMP, which allows disabling of core dump.
> > > This saves approximately 2.6k in the compiled kernel, and complements CONFIG_ELF_CORE,
> > > which now depends on it.
> >
> > Is there another reason than the 2.6k to do this? My kernels range
> > between 4.8 and 5M, so that's .05% size savings?
>
> A kitchen-sink kernel might take up that much space, but you can build a
> minimal embedded kernel that only takes up ~200k, at which point 2.6k
> represents a >1% decrease. Add a few more changes like this, and those
> decreases start to add up. At this point, no one thing you can chop out
> of the kernel will give you a 100k decrease by itself; you need a pile
> of changes like this one to do that.
>
> - Josh Triplett

I see. That's an order of magnitude smaller than what I figured you'd
get with a reasonable kernel :)

2012-08-10 15:31:45

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [PATCHv4 1/3] fs: Move core dump functionality into its own file

Quoting Alex Kelly ([email protected]):
> This prepares for making core dump functionality optional.
>
> The variable "suid_dumpable" and associated functions are left in fs/exec.c
> because they're used elsewhere, such as in ptrace.
>
> Signed-off-by: Alex Kelly <[email protected]>
> Reviewed-by: Josh Triplett <[email protected]>

Acked-by: Serge Hallyn <[email protected]>

> ---
> v2: This patch set is a second revision that follows some suggestions from
> Ingo Molnar and Josh Triplett. Specifically, authorship of commits is
> revised for consistency, and an additional two patches cleaning up artifacts
> and making headers more sane are added.
>
> v3: This version fixes a few more authorship issues and some problems caused
> by a bad git send-email config. Sorry about the extra mails
>
> v4: This version fixes some ordering issues pointed out by Kees Cook and Josh
> Triplett, such that the order of the functions moved to fs/coredump.c is now
> consistent with their original order in fs/exec.c. v4 also drops some extra
> blank lines unintentionally introduced in fs/coredump.c, to avoid the need to
> clean them up later. That left the cleanup patch just reformatting a comment,
> so I dropped that patch. Some of the functions moved to coredump.c need a lot
> of cleaning up, but I'm not sure that those formatting changes should be
> folded into this patch series.
>
> fs/Makefile | 2 +-
> fs/coredump.c | 689 ++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/exec.c | 647 +----------------------------------------------
> include/linux/sched.h | 1 +
> 4 files changed, 692 insertions(+), 647 deletions(-)
> create mode 100644 fs/coredump.c
>
> diff --git a/fs/Makefile b/fs/Makefile
> index 2fb9779..8938f82 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
> attr.o bad_inode.o file.o filesystems.o namespace.o \
> seq_file.o xattr.o libfs.o fs-writeback.o \
> pnode.o drop_caches.o splice.o sync.o utimes.o \
> - stack.o fs_struct.o statfs.o
> + stack.o fs_struct.o statfs.o coredump.o
>
> ifeq ($(CONFIG_BLOCK),y)
> obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
> diff --git a/fs/coredump.c b/fs/coredump.c
> new file mode 100644
> index 0000000..9692329
> --- /dev/null
> +++ b/fs/coredump.c
> @@ -0,0 +1,689 @@
> +#include <linux/slab.h>
> +#include <linux/file.h>
> +#include <linux/fdtable.h>
> +#include <linux/mm.h>
> +#include <linux/stat.h>
> +#include <linux/fcntl.h>
> +#include <linux/swap.h>
> +#include <linux/string.h>
> +#include <linux/init.h>
> +#include <linux/pagemap.h>
> +#include <linux/perf_event.h>
> +#include <linux/highmem.h>
> +#include <linux/spinlock.h>
> +#include <linux/key.h>
> +#include <linux/personality.h>
> +#include <linux/binfmts.h>
> +#include <linux/utsname.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/module.h>
> +#include <linux/namei.h>
> +#include <linux/mount.h>
> +#include <linux/security.h>
> +#include <linux/syscalls.h>
> +#include <linux/tsacct_kern.h>
> +#include <linux/cn_proc.h>
> +#include <linux/audit.h>
> +#include <linux/tracehook.h>
> +#include <linux/kmod.h>
> +#include <linux/fsnotify.h>
> +#include <linux/fs_struct.h>
> +#include <linux/pipe_fs_i.h>
> +#include <linux/oom.h>
> +#include <linux/compat.h>
> +
> +#include <asm/uaccess.h>
> +#include <asm/mmu_context.h>
> +#include <asm/tlb.h>
> +#include <asm/exec.h>
> +
> +#include <trace/events/task.h>
> +#include "internal.h"
> +
> +#include <trace/events/sched.h>
> +
> +int core_uses_pid;
> +char core_pattern[CORENAME_MAX_SIZE] = "core";
> +unsigned int core_pipe_limit;
> +
> +struct core_name {
> + char *corename;
> + int used, size;
> +};
> +static atomic_t call_count = ATOMIC_INIT(1);
> +
> +/* The maximal length of core_pattern is also specified in sysctl.c */
> +
> +static int expand_corename(struct core_name *cn)
> +{
> + char *old_corename = cn->corename;
> +
> + cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
> + cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
> +
> + if (!cn->corename) {
> + kfree(old_corename);
> + return -ENOMEM;
> + }
> +
> + return 0;
> +}
> +
> +static int cn_printf(struct core_name *cn, const char *fmt, ...)
> +{
> + char *cur;
> + int need;
> + int ret;
> + va_list arg;
> +
> + va_start(arg, fmt);
> + need = vsnprintf(NULL, 0, fmt, arg);
> + va_end(arg);
> +
> + if (likely(need < cn->size - cn->used - 1))
> + goto out_printf;
> +
> + ret = expand_corename(cn);
> + if (ret)
> + goto expand_fail;
> +
> +out_printf:
> + cur = cn->corename + cn->used;
> + va_start(arg, fmt);
> + vsnprintf(cur, need + 1, fmt, arg);
> + va_end(arg);
> + cn->used += need;
> + return 0;
> +
> +expand_fail:
> + return ret;
> +}
> +
> +static void cn_escape(char *str)
> +{
> + for (; *str; str++)
> + if (*str == '/')
> + *str = '!';
> +}
> +
> +static int cn_print_exe_file(struct core_name *cn)
> +{
> + struct file *exe_file;
> + char *pathbuf, *path;
> + int ret;
> +
> + exe_file = get_mm_exe_file(current->mm);
> + if (!exe_file) {
> + char *commstart = cn->corename + cn->used;
> + ret = cn_printf(cn, "%s (path unknown)", current->comm);
> + cn_escape(commstart);
> + return ret;
> + }
> +
> + pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
> + if (!pathbuf) {
> + ret = -ENOMEM;
> + goto put_exe_file;
> + }
> +
> + path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
> + if (IS_ERR(path)) {
> + ret = PTR_ERR(path);
> + goto free_buf;
> + }
> +
> + cn_escape(path);
> +
> + ret = cn_printf(cn, "%s", path);
> +
> +free_buf:
> + kfree(pathbuf);
> +put_exe_file:
> + fput(exe_file);
> + return ret;
> +}
> +
> +/* format_corename will inspect the pattern parameter, and output a
> + * name into corename, which must have space for at least
> + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
> + */
> +static int format_corename(struct core_name *cn, long signr)
> +{
> + const struct cred *cred = current_cred();
> + const char *pat_ptr = core_pattern;
> + int ispipe = (*pat_ptr == '|');
> + int pid_in_pattern = 0;
> + int err = 0;
> +
> + cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
> + cn->corename = kmalloc(cn->size, GFP_KERNEL);
> + cn->used = 0;
> +
> + if (!cn->corename)
> + return -ENOMEM;
> +
> + /* Repeat as long as we have more pattern to process and more output
> + space */
> + while (*pat_ptr) {
> + if (*pat_ptr != '%') {
> + if (*pat_ptr == 0)
> + goto out;
> + err = cn_printf(cn, "%c", *pat_ptr++);
> + } else {
> + switch (*++pat_ptr) {
> + /* single % at the end, drop that */
> + case 0:
> + goto out;
> + /* Double percent, output one percent */
> + case '%':
> + err = cn_printf(cn, "%c", '%');
> + break;
> + /* pid */
> + case 'p':
> + pid_in_pattern = 1;
> + err = cn_printf(cn, "%d",
> + task_tgid_vnr(current));
> + break;
> + /* uid */
> + case 'u':
> + err = cn_printf(cn, "%d", cred->uid);
> + break;
> + /* gid */
> + case 'g':
> + err = cn_printf(cn, "%d", cred->gid);
> + break;
> + /* signal that caused the coredump */
> + case 's':
> + err = cn_printf(cn, "%ld", signr);
> + break;
> + /* UNIX time of coredump */
> + case 't': {
> + struct timeval tv;
> + do_gettimeofday(&tv);
> + err = cn_printf(cn, "%lu", tv.tv_sec);
> + break;
> + }
> + /* hostname */
> + case 'h': {
> + char *namestart = cn->corename + cn->used;
> + down_read(&uts_sem);
> + err = cn_printf(cn, "%s",
> + utsname()->nodename);
> + up_read(&uts_sem);
> + cn_escape(namestart);
> + break;
> + }
> + /* executable */
> + case 'e': {
> + char *commstart = cn->corename + cn->used;
> + err = cn_printf(cn, "%s", current->comm);
> + cn_escape(commstart);
> + break;
> + }
> + case 'E':
> + err = cn_print_exe_file(cn);
> + break;
> + /* core limit size */
> + case 'c':
> + err = cn_printf(cn, "%lu",
> + rlimit(RLIMIT_CORE));
> + break;
> + default:
> + break;
> + }
> + ++pat_ptr;
> + }
> +
> + if (err)
> + return err;
> + }
> +
> + /* Backward compatibility with core_uses_pid:
> + *
> + * If core_pattern does not include a %p (as is the default)
> + * and core_uses_pid is set, then .%pid will be appended to
> + * the filename. Do not do this for piped commands. */
> + if (!ispipe && !pid_in_pattern && core_uses_pid) {
> + err = cn_printf(cn, ".%d", task_tgid_vnr(current));
> + if (err)
> + return err;
> + }
> +out:
> + return ispipe;
> +}
> +
> +static int zap_process(struct task_struct *start, int exit_code)
> +{
> + struct task_struct *t;
> + int nr = 0;
> +
> + start->signal->flags = SIGNAL_GROUP_EXIT;
> + start->signal->group_exit_code = exit_code;
> + start->signal->group_stop_count = 0;
> +
> + t = start;
> + do {
> + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
> + if (t != current && t->mm) {
> + sigaddset(&t->pending.signal, SIGKILL);
> + signal_wake_up(t, 1);
> + nr++;
> + }
> + } while_each_thread(start, t);
> +
> + return nr;
> +}
> +
> +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
> + struct core_state *core_state, int exit_code)
> +{
> + struct task_struct *g, *p;
> + unsigned long flags;
> + int nr = -EAGAIN;
> +
> + spin_lock_irq(&tsk->sighand->siglock);
> + if (!signal_group_exit(tsk->signal)) {
> + mm->core_state = core_state;
> + nr = zap_process(tsk, exit_code);
> + }
> + spin_unlock_irq(&tsk->sighand->siglock);
> + if (unlikely(nr < 0))
> + return nr;
> +
> + if (atomic_read(&mm->mm_users) == nr + 1)
> + goto done;
> + /*
> + * We should find and kill all tasks which use this mm, and we should
> + * count them correctly into ->nr_threads. We don't take tasklist
> + * lock, but this is safe wrt:
> + *
> + * fork:
> + * None of sub-threads can fork after zap_process(leader). All
> + * processes which were created before this point should be
> + * visible to zap_threads() because copy_process() adds the new
> + * process to the tail of init_task.tasks list, and lock/unlock
> + * of ->siglock provides a memory barrier.
> + *
> + * do_exit:
> + * The caller holds mm->mmap_sem. This means that the task which
> + * uses this mm can't pass exit_mm(), so it can't exit or clear
> + * its ->mm.
> + *
> + * de_thread:
> + * It does list_replace_rcu(&leader->tasks, &current->tasks),
> + * we must see either old or new leader, this does not matter.
> + * However, it can change p->sighand, so lock_task_sighand(p)
> + * must be used. Since p->mm != NULL and we hold ->mmap_sem
> + * it can't fail.
> + *
> + * Note also that "g" can be the old leader with ->mm == NULL
> + * and already unhashed and thus removed from ->thread_group.
> + * This is OK, __unhash_process()->list_del_rcu() does not
> + * clear the ->next pointer, we will find the new leader via
> + * next_thread().
> + */
> + rcu_read_lock();
> + for_each_process(g) {
> + if (g == tsk->group_leader)
> + continue;
> + if (g->flags & PF_KTHREAD)
> + continue;
> + p = g;
> + do {
> + if (p->mm) {
> + if (unlikely(p->mm == mm)) {
> + lock_task_sighand(p, &flags);
> + nr += zap_process(p, exit_code);
> + unlock_task_sighand(p, &flags);
> + }
> + break;
> + }
> + } while_each_thread(g, p);
> + }
> + rcu_read_unlock();
> +done:
> + atomic_set(&core_state->nr_threads, nr);
> + return nr;
> +}
> +
> +static int coredump_wait(int exit_code, struct core_state *core_state)
> +{
> + struct task_struct *tsk = current;
> + struct mm_struct *mm = tsk->mm;
> + int core_waiters = -EBUSY;
> +
> + init_completion(&core_state->startup);
> + core_state->dumper.task = tsk;
> + core_state->dumper.next = NULL;
> +
> + down_write(&mm->mmap_sem);
> + if (!mm->core_state)
> + core_waiters = zap_threads(tsk, mm, core_state, exit_code);
> + up_write(&mm->mmap_sem);
> +
> + if (core_waiters > 0) {
> + struct core_thread *ptr;
> +
> + wait_for_completion(&core_state->startup);
> + /*
> + * Wait for all the threads to become inactive, so that
> + * all the thread context (extended register state, like
> + * fpu etc) gets copied to the memory.
> + */
> + ptr = core_state->dumper.next;
> + while (ptr != NULL) {
> + wait_task_inactive(ptr->task, 0);
> + ptr = ptr->next;
> + }
> + }
> +
> + return core_waiters;
> +}
> +
> +static void coredump_finish(struct mm_struct *mm)
> +{
> + struct core_thread *curr, *next;
> + struct task_struct *task;
> +
> + next = mm->core_state->dumper.next;
> + while ((curr = next) != NULL) {
> + next = curr->next;
> + task = curr->task;
> + /*
> + * see exit_mm(), curr->task must not see
> + * ->task == NULL before we read ->next.
> + */
> + smp_mb();
> + curr->task = NULL;
> + wake_up_process(task);
> + }
> +
> + mm->core_state = NULL;
> +}
> +
> +static void wait_for_dump_helpers(struct file *file)
> +{
> + struct pipe_inode_info *pipe;
> +
> + pipe = file->f_path.dentry->d_inode->i_pipe;
> +
> + pipe_lock(pipe);
> + pipe->readers++;
> + pipe->writers--;
> +
> + while ((pipe->readers > 1) && (!signal_pending(current))) {
> + wake_up_interruptible_sync(&pipe->wait);
> + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
> + pipe_wait(pipe);
> + }
> +
> + pipe->readers--;
> + pipe->writers++;
> + pipe_unlock(pipe);
> +
> +}
> +
> +
> +/*
> + * umh_pipe_setup
> + * helper function to customize the process used
> + * to collect the core in userspace. Specifically
> + * it sets up a pipe and installs it as fd 0 (stdin)
> + * for the process. Returns 0 on success, or
> + * PTR_ERR on failure.
> + * Note that it also sets the core limit to 1. This
> + * is a special value that we use to trap recursive
> + * core dumps
> + */
> +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
> +{
> + struct file *files[2];
> + struct fdtable *fdt;
> + struct coredump_params *cp = (struct coredump_params *)info->data;
> + struct files_struct *cf = current->files;
> + int err = create_pipe_files(files, 0);
> + if (err)
> + return err;
> +
> + cp->file = files[1];
> +
> + sys_close(0);
> + fd_install(0, files[0]);
> + spin_lock(&cf->file_lock);
> + fdt = files_fdtable(cf);
> + __set_open_fd(0, fdt);
> + __clear_close_on_exec(0, fdt);
> + spin_unlock(&cf->file_lock);
> +
> + /* and disallow core files too */
> + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
> +
> + return 0;
> +}
> +
> +void do_coredump(long signr, int exit_code, struct pt_regs *regs)
> +{
> + struct core_state core_state;
> + struct core_name cn;
> + struct mm_struct *mm = current->mm;
> + struct linux_binfmt * binfmt;
> + const struct cred *old_cred;
> + struct cred *cred;
> + int retval = 0;
> + int flag = 0;
> + int ispipe;
> + bool need_nonrelative = false;
> + static atomic_t core_dump_count = ATOMIC_INIT(0);
> + struct coredump_params cprm = {
> + .signr = signr,
> + .regs = regs,
> + .limit = rlimit(RLIMIT_CORE),
> + /*
> + * We must use the same mm->flags while dumping core to avoid
> + * inconsistency of bit flags, since this flag is not protected
> + * by any locks.
> + */
> + .mm_flags = mm->flags,
> + };
> +
> + audit_core_dumps(signr);
> +
> + binfmt = mm->binfmt;
> + if (!binfmt || !binfmt->core_dump)
> + goto fail;
> + if (!__get_dumpable(cprm.mm_flags))
> + goto fail;
> +
> + cred = prepare_creds();
> + if (!cred)
> + goto fail;
> + /*
> + * We cannot trust fsuid as being the "true" uid of the process
> + * nor do we know its entire history. We only know it was tainted
> + * so we dump it as root in mode 2, and only into a controlled
> + * environment (pipe handler or fully qualified path).
> + */
> + if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
> + /* Setuid core dump mode */
> + flag = O_EXCL; /* Stop rewrite attacks */
> + cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
> + need_nonrelative = true;
> + }
> +
> + retval = coredump_wait(exit_code, &core_state);
> + if (retval < 0)
> + goto fail_creds;
> +
> + old_cred = override_creds(cred);
> +
> + /*
> + * Clear any false indication of pending signals that might
> + * be seen by the filesystem code called to write the core file.
> + */
> + clear_thread_flag(TIF_SIGPENDING);
> +
> + ispipe = format_corename(&cn, signr);
> +
> + if (ispipe) {
> + int dump_count;
> + char **helper_argv;
> +
> + if (ispipe < 0) {
> + printk(KERN_WARNING "format_corename failed\n");
> + printk(KERN_WARNING "Aborting core\n");
> + goto fail_corename;
> + }
> +
> + if (cprm.limit == 1) {
> + /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
> + *
> + * Normally core limits are irrelevant to pipes, since
> + * we're not writing to the file system, but we use
> + * cprm.limit of 1 here as a speacial value, this is a
> + * consistent way to catch recursive crashes.
> + * We can still crash if the core_pattern binary sets
> + * RLIM_CORE = !1, but it runs as root, and can do
> + * lots of stupid things.
> + *
> + * Note that we use task_tgid_vnr here to grab the pid
> + * of the process group leader. That way we get the
> + * right pid if a thread in a multi-threaded
> + * core_pattern process dies.
> + */
> + printk(KERN_WARNING
> + "Process %d(%s) has RLIMIT_CORE set to 1\n",
> + task_tgid_vnr(current), current->comm);
> + printk(KERN_WARNING "Aborting core\n");
> + goto fail_unlock;
> + }
> + cprm.limit = RLIM_INFINITY;
> +
> + dump_count = atomic_inc_return(&core_dump_count);
> + if (core_pipe_limit && (core_pipe_limit < dump_count)) {
> + printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
> + task_tgid_vnr(current), current->comm);
> + printk(KERN_WARNING "Skipping core dump\n");
> + goto fail_dropcount;
> + }
> +
> + helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
> + if (!helper_argv) {
> + printk(KERN_WARNING "%s failed to allocate memory\n",
> + __func__);
> + goto fail_dropcount;
> + }
> +
> + retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
> + NULL, UMH_WAIT_EXEC, umh_pipe_setup,
> + NULL, &cprm);
> + argv_free(helper_argv);
> + if (retval) {
> + printk(KERN_INFO "Core dump to %s pipe failed\n",
> + cn.corename);
> + goto close_fail;
> + }
> + } else {
> + struct inode *inode;
> +
> + if (cprm.limit < binfmt->min_coredump)
> + goto fail_unlock;
> +
> + if (need_nonrelative && cn.corename[0] != '/') {
> + printk(KERN_WARNING "Pid %d(%s) can only dump core "\
> + "to fully qualified path!\n",
> + task_tgid_vnr(current), current->comm);
> + printk(KERN_WARNING "Skipping core dump\n");
> + goto fail_unlock;
> + }
> +
> + cprm.file = filp_open(cn.corename,
> + O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
> + 0600);
> + if (IS_ERR(cprm.file))
> + goto fail_unlock;
> +
> + inode = cprm.file->f_path.dentry->d_inode;
> + if (inode->i_nlink > 1)
> + goto close_fail;
> + if (d_unhashed(cprm.file->f_path.dentry))
> + goto close_fail;
> + /*
> + * AK: actually i see no reason to not allow this for named
> + * pipes etc, but keep the previous behaviour for now.
> + */
> + if (!S_ISREG(inode->i_mode))
> + goto close_fail;
> + /*
> + * Dont allow local users get cute and trick others to coredump
> + * into their pre-created files.
> + */
> + if (!uid_eq(inode->i_uid, current_fsuid()))
> + goto close_fail;
> + if (!cprm.file->f_op || !cprm.file->f_op->write)
> + goto close_fail;
> + if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
> + goto close_fail;
> + }
> +
> + retval = binfmt->core_dump(&cprm);
> + if (retval)
> + current->signal->group_exit_code |= 0x80;
> +
> + if (ispipe && core_pipe_limit)
> + wait_for_dump_helpers(cprm.file);
> +close_fail:
> + if (cprm.file)
> + filp_close(cprm.file, NULL);
> +fail_dropcount:
> + if (ispipe)
> + atomic_dec(&core_dump_count);
> +fail_unlock:
> + kfree(cn.corename);
> +fail_corename:
> + coredump_finish(mm);
> + revert_creds(old_cred);
> +fail_creds:
> + put_cred(cred);
> +fail:
> + return;
> +}
> +
> +/*
> + * Core dumping helper functions. These are the only things you should
> + * do on a core-file: use only these functions to write out all the
> + * necessary info.
> + */
> +int dump_write(struct file *file, const void *addr, int nr)
> +{
> + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
> +}
> +EXPORT_SYMBOL(dump_write);
> +
> +int dump_seek(struct file *file, loff_t off)
> +{
> + int ret = 1;
> +
> + if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
> + if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
> + return 0;
> + } else {
> + char *buf = (char *)get_zeroed_page(GFP_KERNEL);
> +
> + if (!buf)
> + return 0;
> + while (off > 0) {
> + unsigned long n = off;
> +
> + if (n > PAGE_SIZE)
> + n = PAGE_SIZE;
> + if (!dump_write(file, buf, n)) {
> + ret = 0;
> + break;
> + }
> + off -= n;
> + }
> + free_page((unsigned long)buf);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(dump_seek);
> diff --git a/fs/exec.c b/fs/exec.c
> index 574cf4d..b604050 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -66,19 +66,8 @@
>
> #include <trace/events/sched.h>
>
> -int core_uses_pid;
> -char core_pattern[CORENAME_MAX_SIZE] = "core";
> -unsigned int core_pipe_limit;
> int suid_dumpable = 0;
>
> -struct core_name {
> - char *corename;
> - int used, size;
> -};
> -static atomic_t call_count = ATOMIC_INIT(1);
> -
> -/* The maximal length of core_pattern is also specified in sysctl.c */
> -
> static LIST_HEAD(formats);
> static DEFINE_RWLOCK(binfmt_lock);
>
> @@ -1632,353 +1621,6 @@ void set_binfmt(struct linux_binfmt *new)
>
> EXPORT_SYMBOL(set_binfmt);
>
> -static int expand_corename(struct core_name *cn)
> -{
> - char *old_corename = cn->corename;
> -
> - cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
> - cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
> -
> - if (!cn->corename) {
> - kfree(old_corename);
> - return -ENOMEM;
> - }
> -
> - return 0;
> -}
> -
> -static int cn_printf(struct core_name *cn, const char *fmt, ...)
> -{
> - char *cur;
> - int need;
> - int ret;
> - va_list arg;
> -
> - va_start(arg, fmt);
> - need = vsnprintf(NULL, 0, fmt, arg);
> - va_end(arg);
> -
> - if (likely(need < cn->size - cn->used - 1))
> - goto out_printf;
> -
> - ret = expand_corename(cn);
> - if (ret)
> - goto expand_fail;
> -
> -out_printf:
> - cur = cn->corename + cn->used;
> - va_start(arg, fmt);
> - vsnprintf(cur, need + 1, fmt, arg);
> - va_end(arg);
> - cn->used += need;
> - return 0;
> -
> -expand_fail:
> - return ret;
> -}
> -
> -static void cn_escape(char *str)
> -{
> - for (; *str; str++)
> - if (*str == '/')
> - *str = '!';
> -}
> -
> -static int cn_print_exe_file(struct core_name *cn)
> -{
> - struct file *exe_file;
> - char *pathbuf, *path;
> - int ret;
> -
> - exe_file = get_mm_exe_file(current->mm);
> - if (!exe_file) {
> - char *commstart = cn->corename + cn->used;
> - ret = cn_printf(cn, "%s (path unknown)", current->comm);
> - cn_escape(commstart);
> - return ret;
> - }
> -
> - pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
> - if (!pathbuf) {
> - ret = -ENOMEM;
> - goto put_exe_file;
> - }
> -
> - path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
> - if (IS_ERR(path)) {
> - ret = PTR_ERR(path);
> - goto free_buf;
> - }
> -
> - cn_escape(path);
> -
> - ret = cn_printf(cn, "%s", path);
> -
> -free_buf:
> - kfree(pathbuf);
> -put_exe_file:
> - fput(exe_file);
> - return ret;
> -}
> -
> -/* format_corename will inspect the pattern parameter, and output a
> - * name into corename, which must have space for at least
> - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
> - */
> -static int format_corename(struct core_name *cn, long signr)
> -{
> - const struct cred *cred = current_cred();
> - const char *pat_ptr = core_pattern;
> - int ispipe = (*pat_ptr == '|');
> - int pid_in_pattern = 0;
> - int err = 0;
> -
> - cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
> - cn->corename = kmalloc(cn->size, GFP_KERNEL);
> - cn->used = 0;
> -
> - if (!cn->corename)
> - return -ENOMEM;
> -
> - /* Repeat as long as we have more pattern to process and more output
> - space */
> - while (*pat_ptr) {
> - if (*pat_ptr != '%') {
> - if (*pat_ptr == 0)
> - goto out;
> - err = cn_printf(cn, "%c", *pat_ptr++);
> - } else {
> - switch (*++pat_ptr) {
> - /* single % at the end, drop that */
> - case 0:
> - goto out;
> - /* Double percent, output one percent */
> - case '%':
> - err = cn_printf(cn, "%c", '%');
> - break;
> - /* pid */
> - case 'p':
> - pid_in_pattern = 1;
> - err = cn_printf(cn, "%d",
> - task_tgid_vnr(current));
> - break;
> - /* uid */
> - case 'u':
> - err = cn_printf(cn, "%d", cred->uid);
> - break;
> - /* gid */
> - case 'g':
> - err = cn_printf(cn, "%d", cred->gid);
> - break;
> - /* signal that caused the coredump */
> - case 's':
> - err = cn_printf(cn, "%ld", signr);
> - break;
> - /* UNIX time of coredump */
> - case 't': {
> - struct timeval tv;
> - do_gettimeofday(&tv);
> - err = cn_printf(cn, "%lu", tv.tv_sec);
> - break;
> - }
> - /* hostname */
> - case 'h': {
> - char *namestart = cn->corename + cn->used;
> - down_read(&uts_sem);
> - err = cn_printf(cn, "%s",
> - utsname()->nodename);
> - up_read(&uts_sem);
> - cn_escape(namestart);
> - break;
> - }
> - /* executable */
> - case 'e': {
> - char *commstart = cn->corename + cn->used;
> - err = cn_printf(cn, "%s", current->comm);
> - cn_escape(commstart);
> - break;
> - }
> - case 'E':
> - err = cn_print_exe_file(cn);
> - break;
> - /* core limit size */
> - case 'c':
> - err = cn_printf(cn, "%lu",
> - rlimit(RLIMIT_CORE));
> - break;
> - default:
> - break;
> - }
> - ++pat_ptr;
> - }
> -
> - if (err)
> - return err;
> - }
> -
> - /* Backward compatibility with core_uses_pid:
> - *
> - * If core_pattern does not include a %p (as is the default)
> - * and core_uses_pid is set, then .%pid will be appended to
> - * the filename. Do not do this for piped commands. */
> - if (!ispipe && !pid_in_pattern && core_uses_pid) {
> - err = cn_printf(cn, ".%d", task_tgid_vnr(current));
> - if (err)
> - return err;
> - }
> -out:
> - return ispipe;
> -}
> -
> -static int zap_process(struct task_struct *start, int exit_code)
> -{
> - struct task_struct *t;
> - int nr = 0;
> -
> - start->signal->flags = SIGNAL_GROUP_EXIT;
> - start->signal->group_exit_code = exit_code;
> - start->signal->group_stop_count = 0;
> -
> - t = start;
> - do {
> - task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
> - if (t != current && t->mm) {
> - sigaddset(&t->pending.signal, SIGKILL);
> - signal_wake_up(t, 1);
> - nr++;
> - }
> - } while_each_thread(start, t);
> -
> - return nr;
> -}
> -
> -static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
> - struct core_state *core_state, int exit_code)
> -{
> - struct task_struct *g, *p;
> - unsigned long flags;
> - int nr = -EAGAIN;
> -
> - spin_lock_irq(&tsk->sighand->siglock);
> - if (!signal_group_exit(tsk->signal)) {
> - mm->core_state = core_state;
> - nr = zap_process(tsk, exit_code);
> - }
> - spin_unlock_irq(&tsk->sighand->siglock);
> - if (unlikely(nr < 0))
> - return nr;
> -
> - if (atomic_read(&mm->mm_users) == nr + 1)
> - goto done;
> - /*
> - * We should find and kill all tasks which use this mm, and we should
> - * count them correctly into ->nr_threads. We don't take tasklist
> - * lock, but this is safe wrt:
> - *
> - * fork:
> - * None of sub-threads can fork after zap_process(leader). All
> - * processes which were created before this point should be
> - * visible to zap_threads() because copy_process() adds the new
> - * process to the tail of init_task.tasks list, and lock/unlock
> - * of ->siglock provides a memory barrier.
> - *
> - * do_exit:
> - * The caller holds mm->mmap_sem. This means that the task which
> - * uses this mm can't pass exit_mm(), so it can't exit or clear
> - * its ->mm.
> - *
> - * de_thread:
> - * It does list_replace_rcu(&leader->tasks, &current->tasks),
> - * we must see either old or new leader, this does not matter.
> - * However, it can change p->sighand, so lock_task_sighand(p)
> - * must be used. Since p->mm != NULL and we hold ->mmap_sem
> - * it can't fail.
> - *
> - * Note also that "g" can be the old leader with ->mm == NULL
> - * and already unhashed and thus removed from ->thread_group.
> - * This is OK, __unhash_process()->list_del_rcu() does not
> - * clear the ->next pointer, we will find the new leader via
> - * next_thread().
> - */
> - rcu_read_lock();
> - for_each_process(g) {
> - if (g == tsk->group_leader)
> - continue;
> - if (g->flags & PF_KTHREAD)
> - continue;
> - p = g;
> - do {
> - if (p->mm) {
> - if (unlikely(p->mm == mm)) {
> - lock_task_sighand(p, &flags);
> - nr += zap_process(p, exit_code);
> - unlock_task_sighand(p, &flags);
> - }
> - break;
> - }
> - } while_each_thread(g, p);
> - }
> - rcu_read_unlock();
> -done:
> - atomic_set(&core_state->nr_threads, nr);
> - return nr;
> -}
> -
> -static int coredump_wait(int exit_code, struct core_state *core_state)
> -{
> - struct task_struct *tsk = current;
> - struct mm_struct *mm = tsk->mm;
> - int core_waiters = -EBUSY;
> -
> - init_completion(&core_state->startup);
> - core_state->dumper.task = tsk;
> - core_state->dumper.next = NULL;
> -
> - down_write(&mm->mmap_sem);
> - if (!mm->core_state)
> - core_waiters = zap_threads(tsk, mm, core_state, exit_code);
> - up_write(&mm->mmap_sem);
> -
> - if (core_waiters > 0) {
> - struct core_thread *ptr;
> -
> - wait_for_completion(&core_state->startup);
> - /*
> - * Wait for all the threads to become inactive, so that
> - * all the thread context (extended register state, like
> - * fpu etc) gets copied to the memory.
> - */
> - ptr = core_state->dumper.next;
> - while (ptr != NULL) {
> - wait_task_inactive(ptr->task, 0);
> - ptr = ptr->next;
> - }
> - }
> -
> - return core_waiters;
> -}
> -
> -static void coredump_finish(struct mm_struct *mm)
> -{
> - struct core_thread *curr, *next;
> - struct task_struct *task;
> -
> - next = mm->core_state->dumper.next;
> - while ((curr = next) != NULL) {
> - next = curr->next;
> - task = curr->task;
> - /*
> - * see exit_mm(), curr->task must not see
> - * ->task == NULL before we read ->next.
> - */
> - smp_mb();
> - curr->task = NULL;
> - wake_up_process(task);
> - }
> -
> - mm->core_state = NULL;
> -}
> -
> /*
> * set_dumpable converts traditional three-value dumpable to two flags and
> * stores them into mm->flags. It modifies lower two bits of mm->flags, but
> @@ -2020,7 +1662,7 @@ void set_dumpable(struct mm_struct *mm, int value)
> }
> }
>
> -static int __get_dumpable(unsigned long mm_flags)
> +int __get_dumpable(unsigned long mm_flags)
> {
> int ret;
>
> @@ -2032,290 +1674,3 @@ int get_dumpable(struct mm_struct *mm)
> {
> return __get_dumpable(mm->flags);
> }
> -
> -static void wait_for_dump_helpers(struct file *file)
> -{
> - struct pipe_inode_info *pipe;
> -
> - pipe = file->f_path.dentry->d_inode->i_pipe;
> -
> - pipe_lock(pipe);
> - pipe->readers++;
> - pipe->writers--;
> -
> - while ((pipe->readers > 1) && (!signal_pending(current))) {
> - wake_up_interruptible_sync(&pipe->wait);
> - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
> - pipe_wait(pipe);
> - }
> -
> - pipe->readers--;
> - pipe->writers++;
> - pipe_unlock(pipe);
> -
> -}
> -
> -
> -/*
> - * umh_pipe_setup
> - * helper function to customize the process used
> - * to collect the core in userspace. Specifically
> - * it sets up a pipe and installs it as fd 0 (stdin)
> - * for the process. Returns 0 on success, or
> - * PTR_ERR on failure.
> - * Note that it also sets the core limit to 1. This
> - * is a special value that we use to trap recursive
> - * core dumps
> - */
> -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
> -{
> - struct file *files[2];
> - struct fdtable *fdt;
> - struct coredump_params *cp = (struct coredump_params *)info->data;
> - struct files_struct *cf = current->files;
> - int err = create_pipe_files(files, 0);
> - if (err)
> - return err;
> -
> - cp->file = files[1];
> -
> - sys_close(0);
> - fd_install(0, files[0]);
> - spin_lock(&cf->file_lock);
> - fdt = files_fdtable(cf);
> - __set_open_fd(0, fdt);
> - __clear_close_on_exec(0, fdt);
> - spin_unlock(&cf->file_lock);
> -
> - /* and disallow core files too */
> - current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
> -
> - return 0;
> -}
> -
> -void do_coredump(long signr, int exit_code, struct pt_regs *regs)
> -{
> - struct core_state core_state;
> - struct core_name cn;
> - struct mm_struct *mm = current->mm;
> - struct linux_binfmt * binfmt;
> - const struct cred *old_cred;
> - struct cred *cred;
> - int retval = 0;
> - int flag = 0;
> - int ispipe;
> - bool need_nonrelative = false;
> - static atomic_t core_dump_count = ATOMIC_INIT(0);
> - struct coredump_params cprm = {
> - .signr = signr,
> - .regs = regs,
> - .limit = rlimit(RLIMIT_CORE),
> - /*
> - * We must use the same mm->flags while dumping core to avoid
> - * inconsistency of bit flags, since this flag is not protected
> - * by any locks.
> - */
> - .mm_flags = mm->flags,
> - };
> -
> - audit_core_dumps(signr);
> -
> - binfmt = mm->binfmt;
> - if (!binfmt || !binfmt->core_dump)
> - goto fail;
> - if (!__get_dumpable(cprm.mm_flags))
> - goto fail;
> -
> - cred = prepare_creds();
> - if (!cred)
> - goto fail;
> - /*
> - * We cannot trust fsuid as being the "true" uid of the process
> - * nor do we know its entire history. We only know it was tainted
> - * so we dump it as root in mode 2, and only into a controlled
> - * environment (pipe handler or fully qualified path).
> - */
> - if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
> - /* Setuid core dump mode */
> - flag = O_EXCL; /* Stop rewrite attacks */
> - cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
> - need_nonrelative = true;
> - }
> -
> - retval = coredump_wait(exit_code, &core_state);
> - if (retval < 0)
> - goto fail_creds;
> -
> - old_cred = override_creds(cred);
> -
> - /*
> - * Clear any false indication of pending signals that might
> - * be seen by the filesystem code called to write the core file.
> - */
> - clear_thread_flag(TIF_SIGPENDING);
> -
> - ispipe = format_corename(&cn, signr);
> -
> - if (ispipe) {
> - int dump_count;
> - char **helper_argv;
> -
> - if (ispipe < 0) {
> - printk(KERN_WARNING "format_corename failed\n");
> - printk(KERN_WARNING "Aborting core\n");
> - goto fail_corename;
> - }
> -
> - if (cprm.limit == 1) {
> - /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
> - *
> - * Normally core limits are irrelevant to pipes, since
> - * we're not writing to the file system, but we use
> - * cprm.limit of 1 here as a speacial value, this is a
> - * consistent way to catch recursive crashes.
> - * We can still crash if the core_pattern binary sets
> - * RLIM_CORE = !1, but it runs as root, and can do
> - * lots of stupid things.
> - *
> - * Note that we use task_tgid_vnr here to grab the pid
> - * of the process group leader. That way we get the
> - * right pid if a thread in a multi-threaded
> - * core_pattern process dies.
> - */
> - printk(KERN_WARNING
> - "Process %d(%s) has RLIMIT_CORE set to 1\n",
> - task_tgid_vnr(current), current->comm);
> - printk(KERN_WARNING "Aborting core\n");
> - goto fail_unlock;
> - }
> - cprm.limit = RLIM_INFINITY;
> -
> - dump_count = atomic_inc_return(&core_dump_count);
> - if (core_pipe_limit && (core_pipe_limit < dump_count)) {
> - printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
> - task_tgid_vnr(current), current->comm);
> - printk(KERN_WARNING "Skipping core dump\n");
> - goto fail_dropcount;
> - }
> -
> - helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
> - if (!helper_argv) {
> - printk(KERN_WARNING "%s failed to allocate memory\n",
> - __func__);
> - goto fail_dropcount;
> - }
> -
> - retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
> - NULL, UMH_WAIT_EXEC, umh_pipe_setup,
> - NULL, &cprm);
> - argv_free(helper_argv);
> - if (retval) {
> - printk(KERN_INFO "Core dump to %s pipe failed\n",
> - cn.corename);
> - goto close_fail;
> - }
> - } else {
> - struct inode *inode;
> -
> - if (cprm.limit < binfmt->min_coredump)
> - goto fail_unlock;
> -
> - if (need_nonrelative && cn.corename[0] != '/') {
> - printk(KERN_WARNING "Pid %d(%s) can only dump core "\
> - "to fully qualified path!\n",
> - task_tgid_vnr(current), current->comm);
> - printk(KERN_WARNING "Skipping core dump\n");
> - goto fail_unlock;
> - }
> -
> - cprm.file = filp_open(cn.corename,
> - O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
> - 0600);
> - if (IS_ERR(cprm.file))
> - goto fail_unlock;
> -
> - inode = cprm.file->f_path.dentry->d_inode;
> - if (inode->i_nlink > 1)
> - goto close_fail;
> - if (d_unhashed(cprm.file->f_path.dentry))
> - goto close_fail;
> - /*
> - * AK: actually i see no reason to not allow this for named
> - * pipes etc, but keep the previous behaviour for now.
> - */
> - if (!S_ISREG(inode->i_mode))
> - goto close_fail;
> - /*
> - * Dont allow local users get cute and trick others to coredump
> - * into their pre-created files.
> - */
> - if (!uid_eq(inode->i_uid, current_fsuid()))
> - goto close_fail;
> - if (!cprm.file->f_op || !cprm.file->f_op->write)
> - goto close_fail;
> - if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
> - goto close_fail;
> - }
> -
> - retval = binfmt->core_dump(&cprm);
> - if (retval)
> - current->signal->group_exit_code |= 0x80;
> -
> - if (ispipe && core_pipe_limit)
> - wait_for_dump_helpers(cprm.file);
> -close_fail:
> - if (cprm.file)
> - filp_close(cprm.file, NULL);
> -fail_dropcount:
> - if (ispipe)
> - atomic_dec(&core_dump_count);
> -fail_unlock:
> - kfree(cn.corename);
> -fail_corename:
> - coredump_finish(mm);
> - revert_creds(old_cred);
> -fail_creds:
> - put_cred(cred);
> -fail:
> - return;
> -}
> -
> -/*
> - * Core dumping helper functions. These are the only things you should
> - * do on a core-file: use only these functions to write out all the
> - * necessary info.
> - */
> -int dump_write(struct file *file, const void *addr, int nr)
> -{
> - return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
> -}
> -EXPORT_SYMBOL(dump_write);
> -
> -int dump_seek(struct file *file, loff_t off)
> -{
> - int ret = 1;
> -
> - if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
> - if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
> - return 0;
> - } else {
> - char *buf = (char *)get_zeroed_page(GFP_KERNEL);
> -
> - if (!buf)
> - return 0;
> - while (off > 0) {
> - unsigned long n = off;
> -
> - if (n > PAGE_SIZE)
> - n = PAGE_SIZE;
> - if (!dump_write(file, buf, n)) {
> - ret = 0;
> - break;
> - }
> - off -= n;
> - }
> - free_page((unsigned long)buf);
> - }
> - return ret;
> -}
> -EXPORT_SYMBOL(dump_seek);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index c147e70..7bb5047 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -413,6 +413,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
>
> extern void set_dumpable(struct mm_struct *mm, int value);
> extern int get_dumpable(struct mm_struct *mm);
> +extern int __get_dumpable(unsigned long mm_flags);
>
> /* get/set_dumpable() values */
> #define SUID_DUMPABLE_DISABLED 0
> --
> 1.7.11.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2012-08-10 15:32:23

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [PATCHv4 2/3] fs: Make core dump functionality optional

Quoting Alex Kelly ([email protected]):
> Adds an expert Kconfig option, CONFIG_COREDUMP, which allows disabling of core dump.
> This saves approximately 2.6k in the compiled kernel, and complements CONFIG_ELF_CORE,
> which now depends on it.
>
> Disabling CONFIG_COREDUMP also removes the coredump-related sysctls, except for
> suid_dumpable and related functions, which are necessary for ptrace.
>
> Signed-off-by: Alex Kelly <[email protected]>
> Reviewed-by: Josh Triplett <[email protected]>

Acked-by: Serge Hallyn <[email protected]>

> ---
> fs/Kconfig.binfmt | 8 ++++++++
> fs/Makefile | 3 ++-
> include/linux/binfmts.h | 4 ++++
> init/Kconfig | 1 +
> kernel/sysctl.c | 6 +++++-
> 5 files changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
> index 0225742..0efd152 100644
> --- a/fs/Kconfig.binfmt
> +++ b/fs/Kconfig.binfmt
> @@ -164,3 +164,11 @@ config BINFMT_MISC
> You may say M here for module support and later load the module when
> you have use for it; the module is called binfmt_misc. If you
> don't know what to answer at this point, say Y.
> +
> +config COREDUMP
> + bool "Enable core dump support" if EXPERT
> + default y
> + help
> + This option enables support for performing core dumps. You almost
> + certainly want to say Y here. Not necessary on systems that never
> + need debugging or only ever run flawless code.
> diff --git a/fs/Makefile b/fs/Makefile
> index 8938f82..1d7af79 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
> attr.o bad_inode.o file.o filesystems.o namespace.o \
> seq_file.o xattr.o libfs.o fs-writeback.o \
> pnode.o drop_caches.o splice.o sync.o utimes.o \
> - stack.o fs_struct.o statfs.o coredump.o
> + stack.o fs_struct.o statfs.o
>
> ifeq ($(CONFIG_BLOCK),y)
> obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
> @@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o
> obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
> obj-$(CONFIG_NFS_COMMON) += nfs_common/
> obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
> +obj-$(CONFIG_COREDUMP) += coredump.o
>
> obj-$(CONFIG_FHANDLE) += fhandle.o
>
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 366422b..00e2e89 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -132,7 +132,11 @@ extern int copy_strings_kernel(int argc, const char *const *argv,
> struct linux_binprm *bprm);
> extern int prepare_bprm_creds(struct linux_binprm *bprm);
> extern void install_exec_creds(struct linux_binprm *bprm);
> +#ifdef CONFIG_COREDUMP
> extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
> +#else
> +static inline void do_coredump(long signr, int exit_code, struct pt_regs *regs) {}
> +#endif
> extern void set_binfmt(struct linux_binfmt *new);
> extern void free_bprm(struct linux_binprm *);
>
> diff --git a/init/Kconfig b/init/Kconfig
> index af6c7f8..0e75056 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1230,6 +1230,7 @@ config BUG
> Just say Y.
>
> config ELF_CORE
> + depends on COREDUMP
> default y
> bool "Enable ELF core dumps" if EXPERT
> help
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 87174ef..af57e84 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -97,10 +97,12 @@
> extern int sysctl_overcommit_memory;
> extern int sysctl_overcommit_ratio;
> extern int max_threads;
> -extern int core_uses_pid;
> extern int suid_dumpable;
> +#ifdef CONFIG_COREDUMP
> +extern int core_uses_pid;
> extern char core_pattern[];
> extern unsigned int core_pipe_limit;
> +#endif
> extern int pid_max;
> extern int min_free_kbytes;
> extern int pid_max_min, pid_max_max;
> @@ -404,6 +406,7 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec,
> },
> +#ifdef CONFIG_COREDUMP
> {
> .procname = "core_uses_pid",
> .data = &core_uses_pid,
> @@ -425,6 +428,7 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec,
> },
> +#endif
> #ifdef CONFIG_PROC_SYSCTL
> {
> .procname = "tainted",
> --
> 1.7.11.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2012-08-10 15:36:04

by Serge E. Hallyn

[permalink] [raw]

Subject: Re: [PATCHv4 3/3] fs: Update coredump-related headers

Quoting Alex Kelly ([email protected]):
> This patch creates a new header file, fs/coredump.h, which contains
> functions only used by the new coredump.c. It also moves do_coredump
> to the include/linux/coredump.h header file, for consistency.
>
> Signed-off-by: Alex Kelly <[email protected]>
> Reviewed-by: Josh Triplett <[email protected]>

Acked-by: Serge Hallyn <[email protected]>

> ---
> fs/coredump.c | 2 ++
> fs/coredump.h | 6 ++++++
> fs/exec.c | 1 +
> include/linux/binfmts.h | 5 -----
> include/linux/coredump.h | 5 +++++
> include/linux/sched.h | 1 -
> kernel/signal.c | 1 +
> 7 files changed, 15 insertions(+), 6 deletions(-)
> create mode 100644 fs/coredump.h
>
> diff --git a/fs/coredump.c b/fs/coredump.c
> index 9692329..1935b4d 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
> @@ -14,6 +14,7 @@
> #include <linux/key.h>
> #include <linux/personality.h>
> #include <linux/binfmts.h>
> +#include <linux/coredump.h>
> #include <linux/utsname.h>
> #include <linux/pid_namespace.h>
> #include <linux/module.h>
> @@ -39,6 +40,7 @@
>
> #include <trace/events/task.h>
> #include "internal.h"
> +#include "coredump.h"
>
> #include <trace/events/sched.h>
>
> diff --git a/fs/coredump.h b/fs/coredump.h
> new file mode 100644
> index 0000000..e39ff07
> --- /dev/null
> +++ b/fs/coredump.h
> @@ -0,0 +1,6 @@
> +#ifndef _FS_COREDUMP_H
> +#define _FS_COREDUMP_H
> +
> +extern int __get_dumpable(unsigned long mm_flags);
> +
> +#endif
> diff --git a/fs/exec.c b/fs/exec.c
> index b604050..a0ad3a2 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -63,6 +63,7 @@
>
> #include <trace/events/task.h>
> #include "internal.h"
> +#include "coredump.h"
>
> #include <trace/events/sched.h>
>
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 00e2e89..c7b16ee 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -132,11 +132,6 @@ extern int copy_strings_kernel(int argc, const char *const *argv,
> struct linux_binprm *bprm);
> extern int prepare_bprm_creds(struct linux_binprm *bprm);
> extern void install_exec_creds(struct linux_binprm *bprm);
> -#ifdef CONFIG_COREDUMP
> -extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
> -#else
> -static inline void do_coredump(long signr, int exit_code, struct pt_regs *regs) {}
> -#endif
> extern void set_binfmt(struct linux_binfmt *new);
> extern void free_bprm(struct linux_binprm *);
>
> diff --git a/include/linux/coredump.h b/include/linux/coredump.h
> index ba4b85a..42f9752 100644
> --- a/include/linux/coredump.h
> +++ b/include/linux/coredump.h
> @@ -11,5 +11,10 @@
> */
> extern int dump_write(struct file *file, const void *addr, int nr);
> extern int dump_seek(struct file *file, loff_t off);
> +#ifdef CONFIG_COREDUMP
> +extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
> +#else
> +static inline void do_coredump(long signr, int exit_code, struct pt_regs *regs) {}
> +#endif
>
> #endif /* _LINUX_COREDUMP_H */
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 7bb5047..c147e70 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -413,7 +413,6 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
>
> extern void set_dumpable(struct mm_struct *mm, int value);
> extern int get_dumpable(struct mm_struct *mm);
> -extern int __get_dumpable(unsigned long mm_flags);
>
> /* get/set_dumpable() values */
> #define SUID_DUMPABLE_DISABLED 0
> diff --git a/kernel/signal.c b/kernel/signal.c
> index be4f856..fb4fd72 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -17,6 +17,7 @@
> #include <linux/fs.h>
> #include <linux/tty.h>
> #include <linux/binfmts.h>
> +#include <linux/coredump.h>
> #include <linux/security.h>
> #include <linux/syscalls.h>
> #include <linux/ptrace.h>
> --
> 1.7.11.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2012-08-10 16:17:58

by Kees Cook

[permalink] [raw]

Subject: Re: [PATCHv4 1/3] fs: Move core dump functionality into its own file

On Fri, Aug 10, 2012 at 1:26 AM, Alex Kelly <[email protected]> wrote:
> This prepares for making core dump functionality optional.
>
> The variable "suid_dumpable" and associated functions are left in fs/exec.c
> because they're used elsewhere, such as in ptrace.
>
> Signed-off-by: Alex Kelly <[email protected]>
> Reviewed-by: Josh Triplett <[email protected]>
> ---
> v2: This patch set is a second revision that follows some suggestions from
> Ingo Molnar and Josh Triplett. Specifically, authorship of commits is
> revised for consistency, and an additional two patches cleaning up artifacts
> and making headers more sane are added.
>
> v3: This version fixes a few more authorship issues and some problems caused
> by a bad git send-email config. Sorry about the extra mails.
>
> v4: This version fixes some ordering issues pointed out by Kees Cook and Josh
> Triplett, such that the order of the functions moved to fs/coredump.c is now
> consistent with their original order in fs/exec.c. v4 also drops some extra
> blank lines unintentionally introduced in fs/coredump.c, to avoid the need to
> clean them up later. That left the cleanup patch just reformatting a comment,
> so I dropped that patch. Some of the functions moved to coredump.c need a lot
> of cleaning up, but I'm not sure that those formatting changes should be
> folded into this patch series.

Thanks for the cleanups! This looks great now.

For all three patches:
Acked-by: Kees Cook <[email protected]>

-Kees

--
Kees Cook
Chrome OS Security