Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751842Ab0KBXHB (ORCPT ); Tue, 2 Nov 2010 19:07:01 -0400 Received: from serrano.cc.columbia.edu ([128.59.29.6]:45339 "EHLO serrano.cc.columbia.edu" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750796Ab0KBXG4 (ORCPT ); Tue, 2 Nov 2010 19:06:56 -0400 X-Greylist: delayed 1777 seconds by postgrey-1.27 at vger.kernel.org; Tue, 02 Nov 2010 19:06:55 EDT Message-ID: <4CD09235.9010107@cs.columbia.edu> Date: Tue, 02 Nov 2010 18:35:33 -0400 From: Oren Laadan Organization: Columbia University User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.12) Gecko/20100915 Lightning/1.0b1 Thunderbird/3.0.8 MIME-Version: 1.0 To: ksummit-2010-discuss@lists.linux-foundation.org CC: Linux-Kernel , Linus Torvalds Subject: checkpoint-restart: naked patch Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit X-No-Spam-Score: Local Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 186471 Lines: 5686 [resending with the missing CC's] Hi, Following the discussion yesterday, here is a linux-cr diff that is limited to changes to existing code. The diff doesn't include the eclone() patches. I also tried to strip off the new c/r code (either code in new files, or new code within #ifdef CONFIG_CHECKPOINT in existing files). I left a few such snippets in, e.g. c/r syscall templates and declaration of c/r specific methods in, e.g. file_operations. The remaining changes in this patch include a new freezer state ("CHECKPOINTING"), mostly refactoring of existing code, and a few new helpers. Disclaimer: don't try to compile (or apply) - this is *only* intended to give a ballpark of how the c/r patches change existing code. Thanks, Oren. 
Documentation/cgroups/freezer-subsystem.txt | 10 ++ Documentation/credentials.txt | 14 ++ MAINTAINERS | 12 ++ Makefile | 2 +- arch/arm/Kconfig | 4 + arch/arm/include/asm/ptrace.h | 1 + arch/arm/include/asm/syscall.h | 32 ++++ arch/arm/include/asm/unistd.h | 3 + arch/arm/kernel/Makefile | 1 + arch/arm/kernel/calls.S | 3 + arch/arm/kernel/entry-common.S | 6 + arch/arm/kernel/ptrace.c | 69 +++++++++ arch/arm/kernel/signal.c | 5 + arch/arm/kernel/sys_arm.c | 13 ++ arch/powerpc/Kconfig | 3 + arch/powerpc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/elf.h | 1 + arch/powerpc/include/asm/ptrace.h | 7 + arch/powerpc/include/asm/systbl.h | 2 + arch/powerpc/include/asm/unistd.h | 4 +- arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/entry_32.S | 23 +++ arch/powerpc/kernel/entry_64.S | 16 ++ arch/powerpc/kernel/process.c | 1 + arch/powerpc/kernel/ptrace.c | 83 ++++++++--- arch/powerpc/kernel/signal.c | 6 + arch/powerpc/kernel/vdso.c | 13 ++- arch/s390/Kconfig | 4 + arch/s390/include/asm/Kbuild | 1 + arch/s390/include/asm/elf.h | 2 +- arch/s390/include/asm/thread_info.h | 2 + arch/s390/include/asm/unistd.h | 4 +- arch/s390/kernel/Makefile | 1 + arch/s390/kernel/compat_wrapper.S | 16 ++ arch/s390/kernel/process.c | 27 ++++ arch/s390/kernel/signal.c | 21 +++ arch/s390/kernel/syscalls.S | 2 + arch/s390/kernel/vdso.c | 13 ++- arch/s390/mm/Makefile | 1 + arch/sh/include/asm/elf.h | 1 + arch/sh/kernel/vsyscall/vsyscall.c | 2 +- arch/x86/Kconfig | 4 + arch/x86/ia32/ia32entry.S | 9 + arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/elf.h | 3 +- arch/x86/include/asm/ldt.h | 7 + arch/x86/include/asm/syscalls.h | 6 + arch/x86/include/asm/unistd_32.h | 4 +- arch/x86/include/asm/unistd_64.h | 4 + arch/x86/kernel/Makefile | 10 ++ arch/x86/kernel/entry_32.S | 8 + arch/x86/kernel/entry_64.S | 7 + arch/x86/kernel/signal.c | 5 + arch/x86/kernel/syscall_table_32.S | 2 + arch/x86/vdso/vdso32-setup.c | 9 +- arch/x86/vdso/vma.c | 11 +- drivers/char/pty.c | 42 +++++- 
drivers/char/tty_io.c | 35 ++++- drivers/net/loopback.c | 9 +- drivers/net/macvlan.c | 3 + drivers/net/veth.c | 3 + fs/Makefile | 1 + fs/binfmt_elf.c | 2 +- fs/devpts/inode.c | 13 ++- fs/eventfd.c | 1 + fs/eventpoll.c | 70 ++++++---- fs/exec.c | 71 ++++++++- fs/fcntl.c | 21 ++- fs/fs_struct.c | 21 +++ fs/namespace.c | 36 +++-- fs/nilfs2/dir.c | 1 - fs/notify/dnotify/dnotify.c | 18 +++ fs/open.c | 58 +++++--- fs/pipe.c | 1 + fs/read_write.c | 10 -- fs/select.c | 2 +- fs/splice.c | 78 ++++++----- fs/squashfs/dir.c | 2 +- include/linux/Kbuild | 3 + include/linux/aio.h | 2 + include/linux/compat.h | 3 +- include/linux/cred.h | 8 + include/linux/devpts_fs.h | 6 +- include/linux/dnotify.h | 6 + include/linux/eventpoll.h | 6 +- include/linux/freezer.h | 9 + include/linux/fs.h | 35 +++++- include/linux/fs_struct.h | 2 + include/linux/futex.h | 12 ++ include/linux/hrtimer.h | 8 +- include/linux/magic.h | 3 + include/linux/mm.h | 38 +++++ include/linux/net.h | 11 ++ include/linux/netdevice.h | 6 + include/linux/poll.h | 3 + include/linux/posix-timers.h | 15 ++ include/linux/resource.h | 1 + include/linux/sched.h | 10 +- include/linux/security.h | 11 ++ include/linux/sem.h | 2 + include/linux/shm.h | 7 + include/linux/signal.h | 3 + include/linux/splice.h | 9 + include/linux/tty.h | 4 + include/linux/user.h | 9 + include/linux/user_namespace.h | 8 + include/linux/utsname.h | 1 + include/net/af_unix.h | 1 + include/net/sock.h | 48 ++++++ init/Kconfig | 10 +- ipc/Makefile | 3 +- ipc/msg.c | 23 ++-- ipc/msgutil.c | 8 - ipc/namespace.c | 2 +- ipc/sem.c | 113 ++++++++++----- ipc/shm.c | 55 ++++++-- ipc/util.c | 42 ++++-- ipc/util.h | 32 ++++- kernel/Makefile | 2 + kernel/capability.c | 96 +++++++++++-- kernel/cgroup_freezer.c | 214 ++++++++++++++++++++------ kernel/compat.c | 4 +- kernel/cred.c | 116 +++++++++++++++ kernel/exit.c | 11 ++- kernel/fork.c | 10 ++ kernel/futex.c | 31 ++--- kernel/futex_compat.c | 13 ++- kernel/groups.c | 1 + kernel/nsproxy.c | 5 + 
kernel/posix-cpu-timers.c | 9 - kernel/posix-timers.c | 2 +- kernel/signal.c | 13 ++ kernel/sys.c | 170 ++++++---------------- kernel/sys_ni.c | 4 + kernel/sysctl.c | 1 + kernel/user.c | 5 + kernel/user_namespace.c | 54 +++++-- kernel/utsname.c | 3 +- kernel/utsname_sysctl.c | 7 + lib/Kconfig.debug | 13 ++ mm/Makefile | 1 + mm/filemap.c | 1 + mm/memory.c | 95 ++++++++++++- mm/mmap.c | 39 +++++- mm/shmem.c | 16 +-- net/Kconfig | 4 + net/Makefile | 3 + net/ipv4/Makefile | 1 + net/ipv4/af_inet.c | 6 + net/ipv6/sit.c | 3 + net/socket.c | 31 +--- net/unix/Makefile | 1 + security/capability.c | 1 + security/commoncap.c | 19 +-- security/selinux/include/classmap.h | 9 +- security/smack/smack.h | 1 + security/smack/smack_lsm.c | 1 + security/smack/smackfs.c | 1 + 159 files changed, 2031 insertions(+), 587 deletions(-) diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt index 41f37fe..92b68e6 100644 --- a/Documentation/cgroups/freezer-subsystem.txt +++ b/Documentation/cgroups/freezer-subsystem.txt @@ -100,3 +100,13 @@ things happens: and returns EINVAL) 3) The tasks that blocked the cgroup from entering the "FROZEN" state disappear from the cgroup's set of tasks. + +When the cgroup freezer is used to guard container checkpoint operations the +freezer.state may be "CHECKPOINTING". "CHECKPOINTING" can only be set on a +"FROZEN" cgroup using the checkpoint system call. Once in the "CHECKPOINTING" +state, the cgroup may not leave until the checkpoint system call returns the +freezer state to "FROZEN". Writing any new state to freezer.state while +checkpointing will return EBUSY. These semantics ensure that userspace cannot +unfreeze the cgroup midway through the checkpoint system call. Note that, +unlike "FROZEN" and "FREEZING", there is no corresponding "CHECKPOINTED" +state. 
diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt index df03169..55dd589 100644 --- a/Documentation/credentials.txt +++ b/Documentation/credentials.txt @@ -530,6 +530,20 @@ A typical credentials alteration function would look something like this: } +SETUID/SETGID HELPERS +--------------------- + +Helpers exist to perform the core of uid and gid alterations: + +cred_setresuid(struct cred *new, uid_t ruid, uid_t euid, uid_t suid); +cred_setresgid(struct cred *new, gid_t rgid, gid_t egid, gid_t sgid); +cred_setfsuid(struct cred *new, uid_t uid, uid_t *old_fsuid); +cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid); + +These helpers are used in kernel/sys.c for the analogous syscalls. +As can be seen in those examples, these helpers are to be wrapped +between calls to prepare_creds() and commit_creds() or abort_creds(). + MANAGING CREDENTIALS -------------------- diff --git a/MAINTAINERS b/MAINTAINERS index a0e3c3a..e4494d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1501,6 +1501,18 @@ M: Andy Whitcroft S: Supported F: scripts/checkpatch.pl +CHECKPOINT-RESTART +M: Oren Laadan +M: Serge E. Hallyn +L: containers@lists.linux-foundation.org +W: http://ckpt.wiki.kernel.org/index.php/Main_Page +S: Maintained +F: *checkpoint* +K: checkpoint +K: restore +K: ckpt +K: c/r + CISCO 10G ETHERNET DRIVER M: Scott Feldman M: Joe Eykholt diff --git a/Makefile b/Makefile index fa1db90..93be4e1 100644 --- a/Makefile +++ b/Makefile @@ -409,7 +409,7 @@ endif # of make so .config is not included in this case either (for *config). 
no-dot-config-targets := clean mrproper distclean \ - cscope TAGS tags help %docs check% \ + cscope TAGS tags help %docs checkstack \ include/linux/version.h headers_% \ kernelrelease kernelversion diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index c5408bf..14c7c84 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -100,6 +100,10 @@ config HAVE_LATENCYTOP_SUPPORT depends on !SMP default y +config CHECKPOINT_SUPPORT + bool + default y + config LOCKDEP_SUPPORT bool default y diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index 9dcb11e..9999568 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h @@ -57,6 +57,7 @@ #define PSR_C_BIT 0x20000000 #define PSR_Z_BIT 0x40000000 #define PSR_N_BIT 0x80000000 +#define PSR_GE_BITS 0x000f0000 /* * Groups of PSR bits diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h new file mode 100644 index 0000000..1a6ca68 --- /dev/null +++ b/arch/arm/include/asm/syscall.h @@ -0,0 +1,32 @@ +/* + * syscall.h - Linux syscall interfaces for ARM + * + * Copyright (c) 2010 Christoffer Dall + * + * This file is released under the GPLv2. + * See the file COPYING for more details. 
+ */ + +#ifndef _ASM_ARM_SYSCALLS_H +#define _ASM_ARM_SYSCALLS_H + +#include +#include +#include +#include + +int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->ARM_r0; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->ARM_r0; +} + +#endif /* _ASM_ARM_SYSCALLS_H */ diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index dd2bf53..89484b4 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -392,6 +392,9 @@ #define __NR_rt_tgsigqueueinfo (__NR_SYSCALL_BASE+363) #define __NR_perf_event_open (__NR_SYSCALL_BASE+364) #define __NR_recvmmsg (__NR_SYSCALL_BASE+365) +#define __NR_eclone (__NR_SYSCALL_BASE+366) +#define __NR_checkpoint (__NR_SYSCALL_BASE+367) +#define __NR_restart (__NR_SYSCALL_BASE+368) /* * The following SWIs are ARM private. diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 26d302c..bfe39d8 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_ARM_THUMBEE) += thumbee.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_ARM_UNWIND) += unwind.o obj-$(CONFIG_HAVE_TCM) += tcm.o +obj-$(CONFIG_CHECKPOINT) += checkpoint.o obj-$(CONFIG_CRUNCH) += crunch.o crunch-bits.o AFLAGS_crunch-bits.o := -Wa,-mcpu=ep9312 diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 37ae301..aa38a4e 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -375,6 +375,9 @@ CALL(sys_rt_tgsigqueueinfo) CALL(sys_perf_event_open) /* 365 */ CALL(sys_recvmmsg) + CALL(sys_eclone_wrapper) + CALL(sys_checkpoint) + CALL(sys_restart) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 2c1db77..ba365dc 100644 --- 
a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -380,6 +380,12 @@ sys_clone_wrapper: b sys_clone ENDPROC(sys_clone_wrapper) +sys_eclone_wrapper: + add ip, sp, #S_OFF + str ip, [sp, #0] + b sys_eclone +ENDPROC(sys_eclone_wrapper) + sys_sigreturn_wrapper: add r0, sp, #S_OFF b sys_sigreturn diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 3f562a7..26ac9ef 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "ptrace.h" @@ -863,3 +864,71 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) return current_thread_info()->syscall; } + +/* + * This function essentially duplicates the logic from vector_swi in + * arch/arm/kernel/entry-common.S. However, that code is in the + * critical path for system calls and is hard to factor out without + * compromising performance. + */ +int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) +{ + int ret; + int scno; + unsigned long instr; + bool config_oabi = false; + bool config_aeabi = false; + bool config_arm_thumb = false; + bool config_cpu_endian_be8 = false; + +#ifdef CONFIG_OABI_COMPAT + config_oabi = true; +#endif +#ifdef CONFIG_AEABI + config_aeabi = true; +#endif +#ifdef CONFIG_ARM_THUMB + config_arm_thumb = true; +#endif +#ifdef CONFIG_CPU_ENDIAN_BE8 + config_cpu_endian_be8 = true; +#endif +#ifdef CONFIG_CPU_ARM710 + return -1; +#endif + + if (config_aeabi && !config_oabi) { + /* Pure EABI */ + return regs->ARM_r7; + } else if (config_oabi) { + if (config_arm_thumb && (regs->ARM_cpsr & PSR_T_BIT)) + return -1; + + ret = access_process_vm(task, regs->ARM_pc - 4, &instr, + sizeof(unsigned long), 0); + if (ret != sizeof(unsigned long)) + return -1; + + if (config_cpu_endian_be8) + asm ("rev %[out], %[in]": [out] "=r" (instr): + [in] "r" (instr)); + + if ((instr & 0x00ffffff) == 0) + return regs->ARM_r7; /* EABI call */ + else + return (instr & 0x00ffffff) | 
__NR_OABI_SYSCALL_BASE; + } else { + /* Legacy ABI only */ + if (config_arm_thumb && (regs->ARM_cpsr & PSR_T_BIT)) { + /* Thumb mode ABI */ + scno = regs->ARM_r7 + __NR_SYSCALL_BASE; + } else { + ret = access_process_vm(task, regs->ARM_pc - 4, &instr, + sizeof(unsigned long), 0); + if (ret != sizeof(unsigned long)) + return -1; + scno = instr; + } + return scno & 0x00ffffff; + } +} diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 907d5a6..d37ef41 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -773,6 +773,11 @@ static void do_signal(struct pt_regs *regs, int syscall) single_step_set(current); } +int task_has_saved_sigmask(struct task_struct *task) +{ + return !!(task_thread_info(task)->flags & _TIF_RESTORE_SIGMASK); +} + asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int thread_flags, int syscall) { diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index c235018..5473ebd 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -27,6 +27,7 @@ #include #include #include +#include /* Fork a new task - this creates a new program thread. 
* This is called indirectly via a small wrapper @@ -127,3 +128,15 @@ asmlinkage long sys_arm_fadvise64_64(int fd, int advice, { return sys_fadvise64_64(fd, offset, len, advice); } + +asmlinkage long sys_checkpoint(unsigned long pid, unsigned long fd, + unsigned long flags, unsigned long logfd) +{ + return do_sys_checkpoint(pid, fd, flags, logfd); +} + +asmlinkage long sys_restart(unsigned long pid, unsigned long fd, + unsigned long flags, unsigned long logfd) +{ + return do_sys_restart(pid, fd, flags, logfd); +} diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2e19500..16416b0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -26,6 +26,9 @@ config MMU bool default y +config CHECKPOINT_SUPPORT + def_bool y + config GENERIC_CMOS_UPDATE def_bool y diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 5ab7d7f..20379f1 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -12,6 +12,7 @@ header-y += shmbuf.h header-y += socket.h header-y += termbits.h header-y += fcntl.h +header-y += checkpoint_hdr.h header-y += poll.h header-y += sockios.h header-y += ucontext.h diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index c376eda..0b06255 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -266,6 +266,7 @@ extern int ucache_bsize; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, + unsigned long start, int uses_interp); #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b); diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 9e2d84c..a88d711 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -87,6 +87,8 @@ struct pt_regs { #ifndef __ASSEMBLY__ +#include + #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) #define regs_return_value(regs) 
((regs)->gpr[3]) @@ -141,6 +143,11 @@ do { \ #define arch_has_block_step() (!cpu_has_feature(CPU_FTR_601)) #define ARCH_HAS_USER_SINGLE_STEP_INFO +/* for reprogramming DABR/DAC during restart of a checkpointed task */ +extern bool debugreg_valid(unsigned long val, unsigned int index); +extern void debugreg_update(struct task_struct *task, unsigned long val, + unsigned int index); + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index f94fc43..b5afba3 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -327,3 +327,5 @@ COMPAT_SYS_SPU(preadv) COMPAT_SYS_SPU(pwritev) COMPAT_SYS(rt_tgsigqueueinfo) PPC_SYS(eclone) +PPC_SYS(checkpoint) +PPC_SYS(restart) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 4cdbd5c..54f6ecb 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -346,10 +346,12 @@ #define __NR_pwritev 321 #define __NR_rt_tgsigqueueinfo 322 #define __NR_eclone 323 +#define __NR_checkpoint 324 +#define __NR_restart 325 #ifdef __KERNEL__ -#define __NR_syscalls 324 +#define __NR_syscalls 326 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 8773263..6d294a4 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -63,6 +63,7 @@ obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o obj-$(CONFIG_MODULES) += module.o module_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_44x) += cpu_setup_44x.o obj-$(CONFIG_FSL_BOOKE) += cpu_setup_fsl_booke.o dbell.o +obj-$(CONFIG_CHECKPOINT) += checkpoint.o extra-y := head_$(CONFIG_WORD_SIZE).o extra-$(CONFIG_PPC_BOOK3E_32) := head_new_booke.o diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 579f1da..853814b 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -594,6 +594,29 @@ 
ppc_eclone: stw r0,_TRAP(r1) /* register set saved */ b sys_eclone +/* To handle self-checkpoint we must save nvpgprs */ + .globl ppc_checkpoint +ppc_checkpoint: + SAVE_NVGPRS(r1) + lwz r0,_TRAP(r1) + rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ + stw r0,_TRAP(r1) /* register set saved */ + b sys_checkpoint + +/* The full register set must be restored upon return from restart. + * Save nvgprs unconditionally so the caller's state is + * restored correctly in case of error. + */ + .globl ppc_restart +ppc_restart: + SAVE_NVGPRS(r1) + lwz r0,_TRAP(r1) + rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ + stw r0,_TRAP(r1) /* register set saved */ + bl sys_restart + REST_NVGPRS(r1) + b ret_from_syscall + .globl ppc_swapcontext ppc_swapcontext: SAVE_NVGPRS(r1) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index b763340..228f592 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -349,6 +349,22 @@ _GLOBAL(ppc_eclone) bl .sys_eclone b syscall_exit +/* To handle self-checkpoint we must save nvpgprs */ +_GLOBAL(ppc_checkpoint) + bl .save_nvgprs + bl .sys_checkpoint + b syscall_exit + +/* The full register set must be restored upon return from restart. + * Save nvgprs unconditionally so the caller's state is + * restored correctly in case of error. 
+ */ +_GLOBAL(ppc_restart) + bl .save_nvgprs + bl .sys_restart + REST_NVGPRS(r1) + b syscall_exit + _GLOBAL(ppc32_swapcontext) bl .save_nvgprs bl .compat_sys_swapcontext diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b183287..1664586 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index ed2cfe1..972e6a1 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -763,19 +763,23 @@ void user_disable_single_step(struct task_struct *task) clear_tsk_thread_flag(task, TIF_SINGLESTEP); } -int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, - unsigned long data) +/** + * debugreg_valid() - validate the value to be written to a debug register + * @val: The prospective contents of the register. + * @index: Must be zero. + * + * Returns true if @val is an acceptable value for the register indicated by + * @index, false otherwise. + */ +bool debugreg_valid(unsigned long val, unsigned int index) { - /* For ppc64 we support one DABR and no IABR's at the moment (ppc64). - * For embedded processors we support one DAC and no IAC's at the - * moment. - */ - if (addr > 0) - return -EINVAL; + /* We support only one debug register for now */ + if (index != 0) + return false; /* The bottom 3 bits in dabr are flags */ - if ((data & ~0x7UL) >= TASK_SIZE) - return -EIO; + if ((val & ~0x7UL) >= TASK_SIZE) + return false; #ifndef CONFIG_PPC_ADV_DEBUG_REGS /* For processors using DABR (i.e. 970), the bottom 3 bits are flags. 
@@ -791,19 +795,38 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, */ /* Ensure breakpoint translation bit is set */ - if (data && !(data & DABR_TRANSLATION)) - return -EIO; - - /* Move contents to the DABR register */ - task->thread.dabr = data; -#else /* CONFIG_PPC_ADV_DEBUG_REGS */ + if (val && !(val & DABR_TRANSLATION)) + return false; +#else /* As described above, it was assumed 3 bits were passed with the data * address, but we will assume only the mode bits will be passed * as to not cause alignment restrictions for DAC-based processors. */ + /* Read or Write bits must be set, unless the DAC is being cleared */ + if ((val & ~0x3UL) && !(val & 0x3UL)) + return false; +#endif + return true; +} + +/** + * debugreg_update() - update a debug register associated with a task + * @task: The task whose register state is to be modified. + * @val: The value to be written to the debug register. + * @index: Specifies the debug register. Currently unused. + * + * Set a task's DABR/DAC to @val, which should be validated with + * debugreg_valid() beforehand. 
+ */ +void debugreg_update(struct task_struct *task, unsigned long val, + unsigned int index) +{ +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + task->thread.dabr = val; +#else /* DAC's hold the whole address without any mode flags */ - task->thread.dac1 = data & ~0x3UL; + task->thread.dac1 = val & ~0x3UL; if (task->thread.dac1 == 0) { dbcr_dac(task) &= ~(DBCR_DAC1R | DBCR_DAC1W); @@ -812,13 +835,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, task->thread.regs->msr &= ~MSR_DE; task->thread.dbcr0 &= ~DBCR0_IDM; } - return 0; + return; } - /* Read or Write bits must be set */ - - if (!(data & 0x3UL)) - return -EINVAL; /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0 register */ @@ -827,12 +846,29 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, /* Check for write and read flags and set DBCR0 accordingly */ dbcr_dac(task) &= ~(DBCR_DAC1R|DBCR_DAC1W); - if (data & 0x1UL) + if (val & 0x1UL) dbcr_dac(task) |= DBCR_DAC1R; - if (data & 0x2UL) + if (val & 0x2UL) dbcr_dac(task) |= DBCR_DAC1W; task->thread.regs->msr |= MSR_DE; #endif /* CONFIG_PPC_ADV_DEBUG_REGS */ +} + +static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, + unsigned long data) +{ + /* For ppc64 we support one DABR and no IABR's at the moment (ppc64). + * For embedded processors we support one DAC and no IAC's at the + * moment. 
+ */ + if (addr > 0) + return -EINVAL; + + if (!debugreg_valid(data, 0)) + return -EIO; + + debugreg_update(task, data, 0); + return 0; } diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index a0afb55..b3337ad 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -186,6 +186,12 @@ static int do_signal_pending(sigset_t *oldset, struct pt_regs *regs) return ret; } +int task_has_saved_sigmask(struct task_struct *task) +{ + struct thread_info *ti = task_thread_info(task); + return !!(ti->local_flags & _TLF_RESTORE_SIGMASK); +} + void do_signal(struct pt_regs *regs, unsigned long thread_info_flags) { if (thread_info_flags & _TIF_SIGPENDING) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index d84d192..74210ab 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -188,7 +188,8 @@ static void dump_vdso_pages(struct vm_area_struct * vma) * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, + unsigned long start, int uses_interp) { struct mm_struct *mm = current->mm; struct page **vdso_pagelist; @@ -220,6 +221,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vdso_base = VDSO32_MBASE; #endif + /* in case restart(2) mandates a specific location */ + if (start) + vdso_base = start; + current->mm->context.vdso_base = 0; /* vDSO has a problem and was disabled, just don't "enable" it for the @@ -249,6 +254,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) /* Add required alignment. */ vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); + /* for restart(2), double check that we got we asked for */ + if (start && vdso_base != start) { + rc = -EBUSY; + goto fail_mmapsem; + } + /* * Put vDSO base into mm struct. 
We need to do this before calling * install_special_mapping or the perf counter mmap tracking code diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0d8cd9b..b358e63 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -49,6 +49,10 @@ config GENERIC_TIME_VSYSCALL config GENERIC_CLOCKEVENTS def_bool y +config CHECKPOINT_SUPPORT + bool + default y if 64BIT + config GENERIC_BUG bool depends on BUG diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 63a2341..3282a6e 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -8,6 +8,7 @@ header-y += ucontext.h header-y += vtoc.h header-y += zcrypt.h header-y += chsc.h +header-y += checkpoint_hdr.h unifdef-y += cmb.h unifdef-y += debug.h diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 354d426..5081938 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -216,6 +216,6 @@ do { \ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -int arch_setup_additional_pages(struct linux_binprm *, int); +int arch_setup_additional_pages(struct linux_binprm *, unsigned long, int); #endif diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 34f0873..60f932e 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -99,6 +99,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 18 #define TIF_RESTORE_SIGMASK 19 /* restore signal mask in do_signal() */ #define TIF_FREEZE 20 /* thread is freezing for suspend */ +#define TIF_SIG_RESTARTBLOCK 23 /* restart must set TIF_RESTART_SVC */ #define _TIF_NOTIFY_RESUME (1< #include #include +#include #include #include #include @@ -240,6 +241,32 @@ SYSCALL_DEFINE4(clone, unsigned long, newsp, unsigned long, clone_flags, parent_tidptr, child_tidptr); } +#ifdef CONFIG_CHECKPOINT +SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd, unsigned long, flags, + int, logfd) +{ + return 
do_sys_checkpoint(pid, fd, flags, logfd); +} + +SYSCALL_DEFINE4(restart, pid_t, pid, int, fd, unsigned long, flags, + int, logfd) +{ + return do_sys_restart(pid, fd, flags, logfd); +} +#else +SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd, unsigned long, flags, + int, logfd) +{ + return -ENOSYS; +} + +SYSCALL_DEFINE4(restart, pid_t, pid, int, fd, unsigned long, flags, + int, logfd) +{ + return -ENOSYS; +} +#endif + SYSCALL_DEFINE4(eclone, unsigned int, flags_low, struct clone_args __user *, uca, int, args_size, pid_t __user *, pids) { diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 6289945..41e03d3 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -459,6 +459,16 @@ void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: regs->gprs[2] = -EINTR; + /* + * This condition is the only one which requires + * special care after handling a signr==0. So if + * we get frozen and checkpointed at the + * get_signal_to_deliver() below, then we need + * to convey this condition to sys_restart() so it + * can set the restored thread up to run the restart + * block. + */ + set_thread_flag(TIF_SIG_RESTARTBLOCK); } regs->svcnr = 0; /* Don't deal with this again. */ } @@ -467,6 +477,12 @@ void do_signal(struct pt_regs *regs) the debugger may change all our registers ... */ signr = get_signal_to_deliver(&info, &ka, regs, NULL); + /* + * we won't get frozen past this so clear the thread flag hinting + * to sys_restart that TIF_RESTART_SVC must be set. + */ + clear_thread_flag(TIF_SIG_RESTARTBLOCK); + /* Depending on the signal settings we may need to revert the decision to restart the system call. 
*/ if (signr > 0 && regs->psw.addr == restart_addr) { @@ -524,6 +540,11 @@ void do_signal(struct pt_regs *regs) } } +int task_has_saved_sigmask(struct task_struct *task) +{ + return !!(test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK)); +} + void do_notify_resume(struct pt_regs *regs) { clear_thread_flag(TIF_NOTIFY_RESUME); diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 08eab1d..9f1f28e 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -341,3 +341,5 @@ SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper) SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */ SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper) SYSCALL(sys_eclone,sys_eclone,sys_eclone_wrapper) +SYSCALL(sys_checkpoint,sys_checkpoint,sys_checkpoint_wrapper) +SYSCALL(sys_restart,sys_restart,sys_restart_wrapper) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 6bc9c19..54dad2f 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -195,7 +195,8 @@ static void vdso_init_cr5(void) * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, + unsigned long start, int uses_interp) { struct mm_struct *mm = current->mm; struct page **vdso_pagelist; @@ -226,6 +227,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vdso_pages = vdso32_pages; #endif + /* in case restart(2) mandates a specific location */ + if (start) + vdso_base = start; + /* * vDSO has a problem and was disabled, just don't "enable" it for * the process @@ -248,6 +253,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out_up; } + /* for restart(2), double check that we got we asked for */ + if (start && vdso_base != start) { 
+ rc = -EINVAL; + goto out_up; + } + /* * Put vDSO base into mm struct. We need to do this before calling * install_special_mapping or the perf counter mmap tracking code diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index eec0544..359a3bc 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -6,3 +6,4 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \ page-states.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_PAGE_STATES) += page-states.o diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h index ce830fa..4128c30 100644 --- a/arch/sh/include/asm/elf.h +++ b/arch/sh/include/asm/elf.h @@ -201,6 +201,7 @@ do { \ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, + unsigned long start, int uses_interp); extern unsigned int vdso_enabled; diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 242117c..6dbdfe1 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -58,7 +58,7 @@ int __init vsyscall_init(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, unsigned long start, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9458685..335a4b3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,10 @@ config STACKTRACE_SUPPORT config HAVE_LATENCYTOP_SUPPORT def_bool y +config CHECKPOINT_SUPPORT + bool + default y + config MMU def_bool y diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index b7f3f34..2efc4db 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -478,6 +478,13 @@ quiet_ni_syscall: PTREGSCALL stub32_vfork, sys_vfork, %rdi PTREGSCALL 
stub32_iopl, sys_iopl, %rsi PTREGSCALL stub32_eclone, sys_eclone, %r8 +#ifdef CONFIG_CHECKPOINT + PTREGSCALL stub32_checkpoint, sys_checkpoint, %r8 + PTREGSCALL stub32_restart, sys_restart, %r8 +#else + PTREGSCALL stub32_checkpoint, sys_ni_syscall, %r8 + PTREGSCALL stub32_restart, sys_ni_syscall, %r8 +#endif ENTRY(ia32_ptregs_common) popq %r11 @@ -844,4 +851,6 @@ ia32_sys_call_table: .quad sys_perf_event_open .quad compat_sys_recvmmsg .quad stub32_eclone + .quad stub32_checkpoint + .quad stub32_restart /* 340 */ ia32_syscall_end: diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 493092e..0893cfa 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -2,6 +2,7 @@ include include/asm-generic/Kbuild.asm header-y += boot.h header-y += bootparam.h +header-y += checkpoint_hdr.h header-y += debugreg.h header-y += ldt.h header-y += msr-index.h diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index f2ad216..3761be8 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -312,9 +312,10 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, + unsigned long start, int uses_interp); -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); +extern int syscall32_setup_pages(struct linux_binprm *, unsigned long start, int exstack); #define compat_arch_setup_additional_pages syscall32_setup_pages extern unsigned long arch_randomize_brk(struct mm_struct *mm); diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h index 46727eb..f2845f9 100644 --- a/arch/x86/include/asm/ldt.h +++ b/arch/x86/include/asm/ldt.h @@ -37,4 +37,11 @@ struct user_desc { #define MODIFY_LDT_CONTENTS_CODE 2 #endif /* !__ASSEMBLY__ */ + +#ifdef __KERNEL__ +#include +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount); +#endif + #endif /* _ASM_X86_LDT_H */ diff --git 
a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index d525677..538a1ef 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -29,6 +29,12 @@ long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); long sys_eclone(unsigned flags_low, struct clone_args __user *uca, int args_size, pid_t __user *pids, struct pt_regs *regs); +#ifdef CONFIG_CHECKPOINT +long sys_checkpoint(pid_t pid, int fd, unsigned long flags, + int logfd, struct pt_regs *regs); +long sys_restart(pid_t pid, int fd, unsigned long flags, + int logfd, struct pt_regs *regs); +#endif /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index e543b0e..007d7cd 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -344,10 +344,12 @@ #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 #define __NR_eclone 338 +#define __NR_checkpoint 339 +#define __NR_restart 340 #ifdef __KERNEL__ -#define NR_syscalls 339 +#define NR_syscalls 341 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 1cd16af..2b162e1 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -665,6 +665,10 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __NR_eclone 300 __SYSCALL(__NR_eclone, stub_eclone) +#define __NR_checkpoint 301 +__SYSCALL(__NR_checkpoint, stub_checkpoint) +#define __NR_restart 302 +__SYSCALL(__NR_restart, stub_restart) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4c58352..916a7e1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -117,6 +117,14 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_CHECKPOINT) += checkpoint.o + +### +# 32 bit specific files +ifeq ($(CONFIG_X86_32),y) + obj-$(CONFIG_CHECKPOINT) += checkpoint_32.o +endif + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) @@ -130,4 +138,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o + + obj-$(CONFIG_CHECKPOINT) += checkpoint_64.o endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 65e1735..49d6628 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -781,6 +781,14 @@ PTREGSCALL0(rt_sigreturn) PTREGSCALL2(vm86) PTREGSCALL1(vm86old) PTREGSCALL4(eclone) +#ifdef CONFIG_CHECKPOINT +PTREGSCALL4(checkpoint) +PTREGSCALL4(restart) +#else +/* Use the weak defs in kernel/sys_ni.c */ +#define ptregs_checkpoint sys_checkpoint +#define ptregs_restart sys_restart +#endif /* Clone is an oddball. The 4th arg is in %edi */ ALIGN; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 216681e..c2ece28 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -699,6 +699,13 @@ END(\label) PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx PTREGSCALL stub_iopl, sys_iopl, %rsi PTREGSCALL stub_eclone, sys_eclone, %r8 +#ifdef CONFIG_CHECKPOINT + PTREGSCALL stub_checkpoint, sys_checkpoint, %r8 + PTREGSCALL stub_restart, sys_restart, %r8 +#else + PTREGSCALL stub_checkpoint, sys_ni_syscall, %r8 + PTREGSCALL stub_restart, sys_ni_syscall, %r8 +#endif ENTRY(ptregscall_common) DEFAULT_FRAME 1 8 /* offset 8: return address */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4fd173c..eb63d59 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -831,6 +831,11 @@ static void do_signal(struct pt_regs *regs) } } +int task_has_saved_sigmask(struct task_struct *task) +{ + return !!(task_thread_info(task)->status & TS_RESTORE_SIGMASK); +} + /* * notification of userspace execution resumption * - 
triggered by the TIF_WORK_MASK flags diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 0c92570..2485482 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -338,3 +338,5 @@ ENTRY(sys_call_table) .long sys_perf_event_open .long sys_recvmmsg .long ptregs_eclone + .long ptregs_checkpoint + .long ptregs_restart /* 340 */ diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 02b442e..62043c1 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -310,7 +310,8 @@ int __init sysenter_setup(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, + unsigned long start, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; @@ -331,13 +332,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (compat) addr = VDSO_HIGH_BASE; else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + addr = get_unmapped_area(NULL, start, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } } + /* for restart(2), double check that we got what we asked for */ + if (start && addr != start) + goto up_fail; + current->mm->context.vdso = (void *)addr; if (compat_uses_vma || !compat) { diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ac74869..b813286 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -100,23 +100,28 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Setup a VMA at program startup for the vsyscall page.
Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, + unsigned long start, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; - int ret; + int ret = -EINVAL; if (!vdso_enabled) return 0; down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, vdso_size); + addr = start ? : vdso_addr(mm->start_stack, vdso_size); addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } + /* for restart(2), double check that we got what we asked for */ + if (start && addr != start) + goto up_fail; + current->mm->context.vdso = (void *)addr; ret = install_special_mapping(mm, addr, vdso_size, diff --git a/drivers/char/pty.c b/drivers/char/pty.c index d83a431..77c2d70 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include @@ -615,9 +617,10 @@ static const struct tty_operations pty_unix98_ops = { }; /** - * ptmx_open - open a unix 98 pty master + * __ptmx_open - open a unix 98 pty master * @inode: inode of device file * @filp: file pointer to tty + * @index: desired slave index * * Allocate a unix98 pty master device from the ptmx driver. * @@ -626,16 +629,15 @@ static const struct tty_operations pty_unix98_ops = { * allocated_ptys_lock handles the list of free pty numbers */ -static int __ptmx_open(struct inode *inode, struct file *filp) +static int __ptmx_open(struct inode *inode, struct file *filp, int index) { struct tty_struct *tty; int retval; - int index; nonseekable_open(inode, filp); /* find a device that is not in use.
*/ - index = devpts_new_index(inode); + index = devpts_new_index(inode, index); if (index < 0) return index; @@ -672,11 +674,40 @@ static int ptmx_open(struct inode *inode, struct file *filp) int ret; lock_kernel(); - ret = __ptmx_open(inode, filp); + ret = __ptmx_open(inode, filp, UNSPECIFIED_PTY_INDEX); unlock_kernel(); return ret; } +static int ptmx_release(struct inode *inode, struct file *filp) +{ + return tty_release(inode, filp); +} + +struct file *pty_open_by_index(char *ptmxpath, int index) +{ + struct file *ptmxfile; + int ret; + + /* + * We need to pick a way to specify which devpts mountpoint to + * use. For now, we'll just use whatever /dev/ptmx points to. + */ + ptmxfile = filp_open(ptmxpath, O_RDWR|O_NOCTTY, 0); + if (IS_ERR(ptmxfile)) + return ptmxfile; + + lock_kernel(); + ret = __ptmx_open(ptmxfile->f_dentry->d_inode, ptmxfile, index); + unlock_kernel(); + if (ret) { + fput(ptmxfile); + return ERR_PTR(ret); + } + + return ptmxfile; +} + static struct file_operations ptmx_fops; static void __init unix98_pty_init(void) @@ -733,6 +764,7 @@ static void __init unix98_pty_init(void) /* Now create the /dev/ptmx special device */ tty_default_fops(&ptmx_fops); ptmx_fops.open = ptmx_open; + ptmx_fops.release = ptmx_release; cdev_init(&ptmx_cdev, &ptmx_fops); if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) || diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 6da962c..3977322 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include @@ -106,6 +107,7 @@ #include #include +#include #undef TTY_DEBUG_HANGUP @@ -2162,7 +2164,7 @@ static int fionbio(struct file *file, int __user *p) * Takes ->siglock() when updating signal->tty */ -static int tiocsctty(struct tty_struct *tty, int arg) +int tiocsctty(struct tty_struct *tty, int arg) { int ret = 0; if (current->signal->leader && (task_session(current) == tty->session)) @@ -2251,10 +2253,10 @@ static int tiocgpgrp(struct 
tty_struct *tty, struct tty_struct *real_tty, pid_t } /** - * tiocspgrp - attempt to set process group + * do_tiocspgrp - attempt to set process group * @tty: tty passed by user * @real_tty: tty side device matching tty passed by user - * @p: pid pointer + * @pid: pgrp_nr * * Set the process group of the tty to the session passed. Only * permitted where the tty session is our session. @@ -2262,10 +2264,10 @@ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t * Locking: RCU, ctrl lock */ -static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +int do_tiocspgrp(struct tty_struct *tty, + struct tty_struct *real_tty, pid_t pgrp_nr) { struct pid *pgrp; - pid_t pgrp_nr; int retval = tty_check_change(real_tty); unsigned long flags; @@ -2277,8 +2279,6 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t (current->signal->tty != real_tty) || (real_tty->session != task_session(current))) return -ENOTTY; - if (get_user(pgrp_nr, p)) - return -EFAULT; if (pgrp_nr < 0) return -EINVAL; rcu_read_lock(); @@ -2300,6 +2300,27 @@ out_unlock: } /** + * tiocspgrp - attempt to set process group + * @tty: tty passed by user + * @real_tty: tty side device matching tty passed by user + * @p: pid pointer + * + * Set the process group of the tty to the session passed. Only + * permitted where the tty session is our session. 
+ * + * Locking: RCU, ctrl lock + */ + +static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +{ + pid_t pgrp_nr; + + if (get_user(pgrp_nr, p)) + return -EFAULT; + return do_tiocspgrp(tty, real_tty, pgrp_nr); +} + +/** * tiocgsid - get session id * @tty: tty passed by user * @real_tty: tty side of the tty pased by the user if a pty else the tty diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 72b7949..83c9bf7 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -156,9 +156,12 @@ static void loopback_dev_free(struct net_device *dev) } static const struct net_device_ops loopback_ops = { - .ndo_init = loopback_dev_init, - .ndo_start_xmit= loopback_xmit, - .ndo_get_stats = loopback_get_stats, + .ndo_init = loopback_dev_init, + .ndo_start_xmit = loopback_xmit, + .ndo_get_stats = loopback_get_stats, +#ifdef CONFIG_NETNS_CHECKPOINT + .ndo_checkpoint = loopback_checkpoint, +#endif }; /* diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 40faa36..8bd6be9 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -501,6 +501,9 @@ static const struct net_device_ops macvlan_netdev_ops = { .ndo_set_multicast_list = macvlan_set_multicast_list, .ndo_get_stats = macvlan_dev_get_stats, .ndo_validate_addr = eth_validate_addr, +#ifdef CONFIG_NETNS_CHECKPOINT + .ndo_checkpoint = macvlan_checkpoint, +#endif }; static void macvlan_setup(struct net_device *dev) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f9f0730..9d776c9 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -293,6 +293,9 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_change_mtu = veth_change_mtu, .ndo_get_stats = veth_get_stats, .ndo_set_mac_address = eth_mac_addr, +#ifdef CONFIG_NETNS_CHECKPOINT + .ndo_checkpoint = veth_checkpoint, +#endif }; static void veth_setup(struct net_device *dev) diff --git a/fs/Makefile b/fs/Makefile index 97f340f..aa25755 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ 
-29,6 +29,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o +obj-$(CONFIG_CHECKPOINT) += checkpoint.o nfsd-$(CONFIG_NFSD) := nfsctl.o obj-y += $(nfsd-y) $(nfsd-m) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 535e763..6434003 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -923,7 +923,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, !!elf_interpreter); + retval = arch_setup_additional_pages(bprm, 0, !!elf_interpreter); if (retval < 0) { send_sig(SIGKILL, current, 0); goto out; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0120247..75fb8c5 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -433,11 +433,11 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -int devpts_new_index(struct inode *ptmx_inode) +int devpts_new_index(struct inode *ptmx_inode, int req_idx) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct pts_fs_info *fsi = DEVPTS_SB(sb); - int index; + int index = req_idx; int ida_ret; retry: @@ -445,7 +445,9 @@ retry: return -ENOMEM; mutex_lock(&allocated_ptys_lock); - ida_ret = ida_get_new(&fsi->allocated_ptys, &index); + if (index == UNSPECIFIED_PTY_INDEX) + index = 0; + ida_ret = ida_get_new_above(&fsi->allocated_ptys, index, &index); if (ida_ret < 0) { mutex_unlock(&allocated_ptys_lock); if (ida_ret == -EAGAIN) @@ -453,6 +455,11 @@ retry: return -EIO; } + if (req_idx != UNSPECIFIED_PTY_INDEX && index != req_idx) { + ida_remove(&fsi->allocated_ptys, index); + mutex_unlock(&allocated_ptys_lock); + return -EBUSY; + } if (index >= pty_limit) { ida_remove(&fsi->allocated_ptys, index); mutex_unlock(&allocated_ptys_lock); diff --git a/fs/eventfd.c b/fs/eventfd.c index 6bd3f76..92fdbfa 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ 
-19,6 +19,7 @@ #include #include #include +#include struct eventfd_ctx { struct kref kref; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index bd056a5..95da38a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -674,7 +674,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, - .poll = ep_eventpoll_poll + .poll = ep_eventpoll_poll, }; /* Fast test to see if the file is an evenpoll file */ @@ -1226,35 +1226,18 @@ SYSCALL_DEFINE1(epoll_create, int, size) * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, - struct epoll_event __user *, event) +int do_epoll_ctl(int op, int fd, + struct file *file, struct file *tfile, + struct epoll_event *epds) { int error; - struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; - struct epoll_event epds; - - error = -EFAULT; - if (ep_op_has_event(op) && - copy_from_user(&epds, event, sizeof(struct epoll_event))) - goto error_return; - - /* Get the "struct file *" for the eventpoll file */ - error = -EBADF; - file = fget(epfd); - if (!file) - goto error_return; - - /* Get the "struct file *" for the target file */ - tfile = fget(fd); - if (!tfile) - goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; if (!tfile->f_op || !tfile->f_op->poll) - goto error_tgt_fput; + return error; /* * We have to check that the file structure underneath the file descriptor @@ -1263,7 +1246,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, */ error = -EINVAL; if (file == tfile || !is_file_epoll(file)) - goto error_tgt_fput; + return error; /* * At this point it is safe to assume that the "private_data" contains @@ -1284,8 +1267,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, switch (op) { case 
EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); + epds->events |= POLLERR | POLLHUP; + error = ep_insert(ep, epds, tfile, fd); } else error = -EEXIST; break; @@ -1297,15 +1280,46 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_modify(ep, epi, &epds); + epds->events |= POLLERR | POLLHUP; + error = ep_modify(ep, epi, epds); } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); -error_tgt_fput: + return error; +} + +/* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + int error; + struct file *file, *tfile; + struct epoll_event epds; + + error = -EFAULT; + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + goto error_return; + + /* Get the "struct file *" for the eventpoll file */ + error = -EBADF; + file = fget(epfd); + if (!file) + goto error_return; + + /* Get the "struct file *" for the target file */ + tfile = fget(fd); + if (!tfile) + goto error_fput; + + error = do_epoll_ctl(op, fd, file, tfile, &epds); fput(tfile); error_fput: fput(file); diff --git a/fs/exec.c b/fs/exec.c index 49cdaa1..06f93d8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -693,24 +693,83 @@ exit: } EXPORT_SYMBOL(open_exec); -int kernel_read(struct file *file, loff_t offset, - char *addr, unsigned long count) +static ssize_t _kernel_read(struct file *file, loff_t offset, + char __user *ubuf, size_t count) { - mm_segment_t old_fs; + ssize_t nread; + size_t nleft; loff_t pos = offset; - int result; + + for (nleft = count; nleft; nleft -= nread) { + nread = vfs_read(file, ubuf, nleft, &pos); + if (nread <= 0) { + if (nread == -EAGAIN) { + nread = 0; + continue; + } else if 
(nread == 0) + break; + else + return nread; + } + ubuf += nread; + } + return count - nleft; +} + +ssize_t kernel_read(struct file *file, loff_t offset, + char *addr, size_t count) +{ + mm_segment_t old_fs; + ssize_t result; old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - result = vfs_read(file, (void __user *)addr, count, &pos); + result = _kernel_read(file, offset, (void __user *)addr, count); set_fs(old_fs); return result; } EXPORT_SYMBOL(kernel_read); -static int exec_mmap(struct mm_struct *mm) +static ssize_t _kernel_write(struct file *file, loff_t offset, + const char __user *ubuf, size_t count) +{ + ssize_t nwrite; + size_t nleft; + loff_t pos = offset; + + for (nleft = count; nleft; nleft -= nwrite) { + nwrite = vfs_write(file, ubuf, nleft, &pos); + if (nwrite < 0) { + if (nwrite == -EAGAIN) { + nwrite = 0; + continue; + } else + return nwrite; + } + ubuf += nwrite; + } + return count - nleft; +} + +ssize_t kernel_write(struct file *file, loff_t offset, + const char *addr, size_t count) +{ + mm_segment_t old_fs; + ssize_t result; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + result = _kernel_write(file, offset, (void __user *)addr, count); + set_fs(old_fs); + return result; +} + +EXPORT_SYMBOL(kernel_write); + +int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct * old_mm, *active_mm; diff --git a/fs/fcntl.c b/fs/fcntl.c index 452d02f..2079af0 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -418,6 +418,18 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, return err; } +int vfs_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) +{ + int err; + + err = security_file_fcntl(filp, cmd, arg); + if (err) + goto out; + err = do_fcntl(fd, cmd, arg, filp); + out: + return err; +} + SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct file *filp; @@ -427,14 
+439,7 @@ SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) if (!filp) goto out; - err = security_file_fcntl(filp, cmd, arg); - if (err) { - fput(filp); - return err; - } - - err = do_fcntl(fd, cmd, arg, filp); - + err = vfs_fcntl(fd, cmd, arg, filp); fput(filp); out: return err; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index eee0590..2a4c6f5 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -6,6 +6,27 @@ #include /* + * call with owning task locked + */ +void get_fs_struct(struct fs_struct *fs) +{ + write_lock(&fs->lock); + fs->users++; + write_unlock(&fs->lock); +} + +void put_fs_struct(struct fs_struct *fs) +{ + int kill; + + write_lock(&fs->lock); + kill = !--fs->users; + write_unlock(&fs->lock); + if (kill) + free_fs_struct(fs); +} + +/* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. */ diff --git a/fs/namespace.c b/fs/namespace.c index 8174c8a..da36155 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include "pnode.h" @@ -2318,6 +2319,22 @@ static void __init init_mount_tree(void) set_fs_root(current->fs, &root); } +void put_mnt_ns(struct mnt_namespace *ns) +{ + LIST_HEAD(umount_list); + + if (!atomic_dec_and_test(&ns->count)) + return; + down_write(&namespace_sem); + spin_lock(&vfsmount_lock); + umount_tree(ns->root, 0, &umount_list); + spin_unlock(&vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); + kfree(ns); +} +EXPORT_SYMBOL(put_mnt_ns); + void __init mnt_init(void) { unsigned u; @@ -2347,20 +2364,7 @@ void __init mnt_init(void) printk(KERN_WARNING "%s: kobj create error\n", __func__); init_rootfs(); init_mount_tree(); +#ifdef CONFIG_CHECKPOINT + register_checkpoint_obj(&ckpt_obj_mntns_ops); +#endif } - -void put_mnt_ns(struct mnt_namespace *ns) -{ - LIST_HEAD(umount_list); - - if (!atomic_dec_and_test(&ns->count)) - return; - down_write(&namespace_sem); - 
spin_lock(&vfsmount_lock); - umount_tree(ns->root, 0, &umount_list); - spin_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); - kfree(ns); -} -EXPORT_SYMBOL(put_mnt_ns); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 85c89df..e251cab 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -702,5 +702,4 @@ const struct file_operations nilfs_dir_operations = { .compat_ioctl = nilfs_ioctl, #endif /* CONFIG_COMPAT */ .fsync = nilfs_sync_file, - }; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 7e54e52..0a63bf6 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -289,6 +289,24 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent return 0; } +int is_dnotify_attached(struct file *filp) +{ + struct fsnotify_mark_entry *entry; + struct inode *inode; + + inode = filp->f_path.dentry->d_inode; + if (!S_ISDIR(inode->i_mode)) + return 0; + + spin_lock(&inode->i_lock); + entry = fsnotify_find_mark_entry(dnotify_group, inode); + spin_unlock(&inode->i_lock); + if (!entry) + return 0; + fsnotify_put_mark(entry); + return 1; +} + /* * When a process calls fcntl to attach a dnotify watch to a directory it ends * up here. 
Allocate both a mark for fsnotify to add and a dnotify_struct to be diff --git a/fs/open.c b/fs/open.c index 74e5cd9..e9d5626 100644 --- a/fs/open.c +++ b/fs/open.c @@ -524,6 +524,18 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) return sys_faccessat(AT_FDCWD, filename, mode); } +int do_chdir(struct fs_struct *fs, struct path *path) +{ + int error; + + error = inode_permission(path->dentry->d_inode, MAY_EXEC | MAY_ACCESS); + if (error) + return error; + + set_fs_pwd(fs, path); + return 0; +} + SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; @@ -531,17 +543,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename) error = user_path_dir(filename, &path); if (error) - goto out; - - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); - if (error) - goto dput_and_out; - - set_fs_pwd(current->fs, &path); + return error; -dput_and_out: + error = do_chdir(current->fs, &path); path_put(&path); -out: return error; } @@ -571,31 +576,36 @@ out: return error; } -SYSCALL_DEFINE1(chroot, const char __user *, filename) +int do_chroot(struct fs_struct *fs, struct path *path) { - struct path path; int error; - error = user_path_dir(filename, &path); + error = inode_permission(path->dentry->d_inode, MAY_EXEC | MAY_ACCESS); if (error) - goto out; + return error; + + if (!capable(CAP_SYS_CHROOT)) + return -EPERM; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = security_path_chroot(path); if (error) - goto dput_and_out; + return error; - error = -EPERM; - if (!capable(CAP_SYS_CHROOT)) - goto dput_and_out; - error = security_path_chroot(&path); + set_fs_root(fs, path); + return 0; +} + +SYSCALL_DEFINE1(chroot, const char __user *, filename) +{ + struct path path; + int error; + + error = user_path_dir(filename, &path); if (error) - goto dput_and_out; + return error; - set_fs_root(current->fs, &path); - error = 0; -dput_and_out: + error = do_chroot(current->fs, &path); path_put(&path); 
-out: return error; } diff --git a/fs/pipe.c b/fs/pipe.c index 37ba29f..d1cb313 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/read_write.c b/fs/read_write.c index 113386d..67b7d83 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -361,16 +361,6 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ EXPORT_SYMBOL(vfs_write); -static inline loff_t file_pos_read(struct file *file) -{ - return file->f_pos; -} - -static inline void file_pos_write(struct file *file, loff_t pos) -{ - file->f_pos = pos; -} - SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { struct file *file; diff --git a/fs/select.c b/fs/select.c index 500a669..194c6d6 100644 --- a/fs/select.c +++ b/fs/select.c @@ -890,7 +890,7 @@ out_fds: return err; } -static long do_restart_poll(struct restart_block *restart_block) +long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; diff --git a/fs/splice.c b/fs/splice.c index 9313b61..ed91d7a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -538,21 +538,6 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, return res; } -static ssize_t kernel_write(struct file *file, const char *buf, size_t count, - loff_t pos) -{ - mm_segment_t old_fs; - ssize_t res; - - old_fs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_write(file, (const char __user *)buf, count, &pos); - set_fs(old_fs); - - return res; -} - ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1011,7 +996,7 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, return ret; data = buf->ops->map(pipe, buf, 0); - ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + 
ret = kernel_write(sd->u.file, sd->pos, data + buf->offset, sd->len); buf->ops->unmap(pipe, buf, data); return ret; @@ -1052,18 +1037,43 @@ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_splice_sendpage); /* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +static inline struct pipe_inode_info *pipe_info(struct inode *inode) +{ + if (S_ISFIFO(inode->i_mode)) + return inode->i_pipe; + + return NULL; +} + +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); + +/* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); + struct pipe_inode_info *opipe; int ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; + /* When called directly (e.g. from c/r) output may be a pipe */ + opipe = pipe_info(out->f_path.dentry->d_inode); + if (opipe) { + BUG_ON(opipe == pipe); + return splice_pipe_to_pipe(pipe, opipe, len, flags); + } + if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; @@ -1082,17 +1092,25 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. 
*/ -static long do_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); + struct pipe_inode_info *ipipe; int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; + /* When called directly (e.g. from c/r) input may be a pipe */ + ipipe = pipe_info(in->f_path.dentry->d_inode); + if (ipipe) { + BUG_ON(ipipe == pipe); + return splice_pipe_to_pipe(ipipe, pipe, len, flags); + } + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; @@ -1272,18 +1290,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -/* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. - */ -static inline struct pipe_inode_info *pipe_info(struct inode *inode) -{ - if (S_ISFIFO(inode->i_mode)) - return inode->i_pipe; - - return NULL; -} /* * Determine where to splice to/from. @@ -1888,9 +1894,9 @@ retry: /* * Link contents of ipipe to opipe.
*/ -static int link_pipe(struct pipe_inode_info *ipipe, - struct pipe_inode_info *opipe, - size_t len, unsigned int flags) +int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; int ret = 0, i = 0, nbuf; diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 12b933a..198865b 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -230,5 +230,5 @@ failed_read: const struct file_operations squashfs_dir_ops = { .read = generic_read_dir, - .readdir = squashfs_readdir + .readdir = squashfs_readdir, }; diff --git a/include/linux/Kbuild b/include/linux/Kbuild index e2ea0b2..71bb8d1 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -45,6 +45,9 @@ header-y += bsg.h header-y += can.h header-y += cciss_defs.h header-y += cdk.h +header-y += checkpoint.h +header-y += checkpoint_hdr.h +header-y += checkpoint_types.h header-y += chio.h header-y += coda_psdev.h header-y += coff.h diff --git a/include/linux/aio.h b/include/linux/aio.h index 811dbb3..e0b1808 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -212,6 +212,7 @@ extern void kick_iocb(struct kiocb *iocb); extern int aio_complete(struct kiocb *iocb, long res, long res2); struct mm_struct; extern void exit_aio(struct mm_struct *mm); +extern int check_for_outstanding_aio(struct mm_struct *mm); #else static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } static inline int aio_put_req(struct kiocb *iocb) { return 0; } @@ -219,6 +220,7 @@ static inline void kick_iocb(struct kiocb *iocb) { } static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; } struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } +static inline int check_for_outstanding_aio(struct mm_struct *mm) { return 0; } #endif /* CONFIG_AIO */ static inline struct kiocb *list_kiocb(struct list_head *h) diff --git a/include/linux/compat.h b/include/linux/compat.h index 
717c691..89125dd 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -210,7 +210,8 @@ struct compat_robust_list_head { }; extern void compat_exit_robust_list(struct task_struct *curr); - +extern long do_compat_set_robust_list(struct compat_robust_list_head __user *head, + compat_size_t len); asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len); diff --git a/include/linux/cred.h b/include/linux/cred.h index 52507c3..8558bec 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -22,6 +22,9 @@ struct user_struct; struct cred; struct inode; +/* defined in sys.c, used in cred_setresuid */ +extern int set_user(struct cred *new); + /* * COW Supplementary groups list */ @@ -396,4 +399,9 @@ do { \ *(_fsgid) = __cred->fsgid; \ } while(0) +extern int cred_setresuid(struct cred *new, uid_t ruid, uid_t euid, uid_t suid); +extern int cred_setresgid(struct cred *new, gid_t rgid, gid_t egid, gid_t sgid); +extern int cred_setfsuid(struct cred *new, uid_t uid, uid_t *old_fsuid); +extern int cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid); + #endif /* _LINUX_CRED_H */ diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 5ce0e5f..163a70e 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -15,9 +15,13 @@ #include +#define UNSPECIFIED_PTY_INDEX -1 + #ifdef CONFIG_UNIX98_PTYS -int devpts_new_index(struct inode *ptmx_inode); +struct file *pty_open_by_index(char *ptmxpath, int index); + +int devpts_new_index(struct inode *ptmx_inode, int req_idx); void devpts_kill_index(struct inode *ptmx_inode, int idx); /* mknod in devpts */ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty); diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h index ecc0628..7093052 100644 --- a/include/linux/dnotify.h +++ b/include/linux/dnotify.h @@ -29,6 +29,7 @@ struct dnotify_struct { FS_MOVED_FROM | FS_MOVED_TO) extern void dnotify_flush(struct file *, 
fl_owner_t); +extern int is_dnotify_attached(struct file *); extern int fcntl_dirnotify(int, struct file *, unsigned long); #else @@ -37,6 +38,11 @@ static inline void dnotify_flush(struct file *filp, fl_owner_t id) { } +static inline int is_dnotify_attached(struct file *filp) +{ + return 0; +} + static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) { return -EINVAL; diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f6856a5..0f7339d 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -56,6 +56,9 @@ struct file; #ifdef CONFIG_EPOLL +struct ckpt_ctx; +struct ckpt_hdr_file; + /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) @@ -95,8 +98,9 @@ static inline void eventpoll_release(struct file *file) eventpoll_release_file(file); } -#else +#else +/* !defined(CONFIG_EPOLL) */ static inline void eventpoll_init_file(struct file *file) {} static inline void eventpoll_release(struct file *file) {} diff --git a/include/linux/freezer.h b/include/linux/freezer.h index da7e52b..0cb22cb 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -65,11 +65,20 @@ extern void cancel_freezing(struct task_struct *p); #ifdef CONFIG_CGROUP_FREEZER extern int cgroup_freezing_or_frozen(struct task_struct *task); +extern int in_same_cgroup_freezer(struct task_struct *p, struct task_struct *q); +extern int cgroup_freezer_begin_checkpoint(struct task_struct *task); +extern void cgroup_freezer_end_checkpoint(struct task_struct *task); +extern int cgroup_freezer_make_frozen(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ static inline int cgroup_freezing_or_frozen(struct task_struct *task) { return 0; } +static inline int in_same_cgroup_freezer(struct task_struct *p, + struct task_struct *q) +{ + return 0; +} #endif /* !CONFIG_CGROUP_FREEZER */ /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 39d57bc..ee725ff 100644 
--- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -397,6 +397,7 @@ struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; +struct ckpt_ctx; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -1096,6 +1097,8 @@ struct file_lock { #include +extern int vfs_fcntl(int fd, unsigned cmd, unsigned long arg, struct file *fp); + extern void send_sigio(struct fown_struct *fown, int fd, int band); #ifdef CONFIG_FILE_LOCKING @@ -1120,6 +1123,7 @@ extern void locks_remove_posix(struct file *, fl_owner_t); extern void locks_remove_flock(struct file *); extern void locks_release_private(struct file_lock *); extern void posix_test_lock(struct file *, struct file_lock *); +extern int find_locks_with_owner(struct file *filp, fl_owner_t owner); extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); extern int posix_lock_file_wait(struct file *, struct file_lock *); extern int posix_unblock_lock(struct file *, struct file_lock *); @@ -1188,6 +1192,11 @@ static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) return; } +static inline int find_locks_with_owner(struct file *filp, fl_owner_t owner) +{ + return -ENOENT; +} + static inline void locks_remove_flock(struct file *filp) { return; @@ -1509,6 +1518,10 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); +#ifdef CONFIG_CHECKPOINT + int (*checkpoint)(struct ckpt_ctx *, struct file *); + int (*collect)(struct ckpt_ctx *, struct file *); +#endif }; struct inode_operations { @@ -1548,6 +1561,16 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, struct iovec *fast_pointer, struct iovec **ret_pointer); +static inline loff_t file_pos_read(struct file *file) +{ + return file->f_pos; +} + 
+static inline void file_pos_write(struct file *file, loff_t pos) +{ + file->f_pos = pos; +} + extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, @@ -1803,6 +1826,11 @@ extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +struct fs_struct; +extern int do_chdir(struct fs_struct *fs, struct path *path); +extern int do_chroot(struct fs_struct *fs, struct path *path); + + extern int current_umask(void); /* /sys/fs */ @@ -2127,7 +2155,8 @@ extern struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode); extern int may_open(struct path *, int, int); -extern int kernel_read(struct file *, loff_t, char *, unsigned long); +extern ssize_t kernel_read(struct file *, loff_t, char *, size_t); +extern ssize_t kernel_write(struct file *, loff_t, const char *, size_t); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ @@ -2305,6 +2334,10 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); +#ifdef CONFIG_CHECKPOINT +extern int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file); +#endif + extern int vfs_readdir(struct file *, filldir_t, void *); extern int vfs_stat(char __user *, struct kstat *); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 78a05bf..a73cbcb 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -20,5 +20,7 @@ extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void free_fs_struct(struct fs_struct *); extern void daemonize_fs_struct(void); extern int unshare_fs_struct(void); +extern void get_fs_struct(struct fs_struct *); +extern void 
put_fs_struct(struct fs_struct *); #endif /* _LINUX_FS_STRUCT_H */ diff --git a/include/linux/futex.h b/include/linux/futex.h index 1e5a26d..c825790 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -136,6 +136,17 @@ extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); /* + * In case we must use restart_block to restart a futex_wait, + * we encode in the 'flags' shared capability + */ +#define FLAGS_SHARED 0x01 +#define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 + +/* for c/r */ +extern long futex_wait_restart(struct restart_block *restart); + +/* * Futexes are matched on equal values of this key. * The key type depends on whether it's a shared or private mapping. * Don't rearrange members without looking at hash_futex(). @@ -174,6 +185,7 @@ union futex_key { #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } #ifdef CONFIG_FUTEX +extern long do_set_robust_list(struct robust_list_head __user *head, size_t len); extern void exit_robust_list(struct task_struct *curr); extern void exit_pi_state_list(struct task_struct *curr); extern int futex_cmpxchg_enabled; diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5d86fb2..97751ad 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -244,7 +244,13 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return ktime_sub(timer->_expires, timer->base->get_time()); + return ktime_sub(timer->_expires, timer->base->get_time()); +} + +/* @after will usually be <= now */ +static inline ktime_t hrtimer_expires_remaining_after(const struct hrtimer *timer, ktime_t after) +{ + return ktime_sub(timer->_expires, after); } #ifdef CONFIG_HIGH_RES_TIMERS diff --git a/include/linux/magic.h b/include/linux/magic.h index eb9800f..e04117a 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -58,4 +58,7 @@ #define 
DEVPTS_SUPER_MAGIC 0x1cd1 #define SOCKFS_MAGIC 0x534F434B +#define CHECKPOINT_MAGIC_HEAD 0x00feed0cc0a2d200LL +#define CHECKPOINT_MAGIC_TAIL 0x002d2a0cc0deef00LL + #endif /* __LINUX_MAGIC_H__ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 462acaf..31520e5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -20,6 +20,7 @@ struct file_ra_state; struct user_struct; struct writeback_control; struct rlimit; +struct ckpt_ctx; #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -221,6 +222,9 @@ struct vm_operations_struct { int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, const nodemask_t *to, unsigned long flags); #endif +#ifdef CONFIG_CHECKPOINT + int (*checkpoint)(struct ckpt_ctx *ctx, struct vm_area_struct *vma); +#endif }; struct mmu_gather; @@ -336,6 +340,17 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); +/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ +enum sgp_type { + SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ + SGP_WRITE, /* may exceed i_size, may allocate page */ +}; + +extern int shmem_getpage(struct inode *inode, unsigned long idx, + struct page **pagep, enum sgp_type sgp, int *type); + /* * Compound pages have a destructor function. Provide a * prototype for that function and accessor functions. 
@@ -842,6 +857,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); +struct page *__get_dirty_page(struct vm_area_struct *vma, unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); @@ -1282,9 +1298,13 @@ out: } extern int do_munmap(struct mm_struct *, unsigned long, size_t); +extern int destroy_mm(struct mm_struct *); extern unsigned long do_brk(unsigned long, unsigned long); +/* fs/exec.c */ +extern int exec_mmap(struct mm_struct *mm); + /* filemap.c */ extern unsigned long page_unuse(struct page *); extern void truncate_inode_pages(struct address_space *, loff_t); @@ -1294,10 +1314,27 @@ extern void truncate_inode_pages_range(struct address_space *, /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +#ifdef CONFIG_CHECKPOINT +/* generic vm_area_ops exported for mapped files checkpoint */ +extern int filemap_checkpoint(struct ckpt_ctx *, struct vm_area_struct *); +#endif + /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); void task_dirty_inc(struct task_struct *tsk); + +/* checkpoint/restart */ +#ifdef CONFIG_CHECKPOINT +struct ckpt_hdr_vma; +extern int filemap_restore(struct ckpt_ctx *ctx, struct mm_struct *mm, + struct ckpt_hdr_vma *hh); +extern int special_mapping_restore(struct ckpt_ctx *ctx, struct mm_struct *mm, + struct ckpt_hdr_vma *hh); +extern int shmem_restore(struct ckpt_ctx *ctx, struct mm_struct *mm, + struct ckpt_hdr_vma *hh); +#endif + /* readahead.c */ #define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ @@ -1369,6 +1406,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do 
get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_DIRTY 0x20 /* give error on non-present file mapped */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/include/linux/net.h b/include/linux/net.h index 4157b5d..6ffe827 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -153,6 +153,9 @@ struct sockaddr; struct msghdr; struct module; +struct ckpt_ctx; +struct ckpt_hdr_socket; + struct proto_ops { int family; struct module *owner; @@ -201,6 +204,12 @@ struct proto_ops { int offset, size_t size, int flags); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); + int (*checkpoint)(struct ckpt_ctx *ctx, + struct socket *sock); + int (*collect)(struct ckpt_ctx *ctx, + struct socket *sock); + int (*restore)(struct ckpt_ctx *ctx, struct socket *sock, + struct ckpt_hdr_socket *h); }; #define DECLARE_SOCKADDR(type, dst, src) \ @@ -237,6 +246,8 @@ extern int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); extern int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); +extern int sock_alloc_file(struct socket *sock, struct file **f, + int flags); extern int sock_map_fd(struct socket *sock, int flags); extern struct socket *sockfd_lookup(int fd, int *err); #define sockfd_put(sock) fput(sock->file) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa8b476..9f6de34 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -691,6 +691,12 @@ struct net_device_ops { int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); #endif +#ifdef CONFIG_CHECKPOINT + int (*ndo_collect)(struct ckpt_ctx *ctx, + struct net_device *dev); + int (*ndo_checkpoint)(struct ckpt_ctx *ctx, + struct net_device *dev); +#endif }; /* diff --git a/include/linux/poll.h 
b/include/linux/poll.h index 600cc1f..03357b8 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -136,6 +136,9 @@ extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec); +/* used by checkpoint/restart */ +extern long do_restart_poll(struct restart_block *restart_block); + #endif /* KERNEL */ #endif /* _LINUX_POLL_H */ diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 4f71bf4..7dd69c3 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -101,6 +101,10 @@ int posix_cpu_timer_create(struct k_itimer *timer); int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp); long posix_cpu_nsleep_restart(struct restart_block *restart_block); +#ifdef CONFIG_COMPAT +long compat_nanosleep_restart(struct restart_block *restart); +long compat_clock_nanosleep_restart(struct restart_block *restart); +#endif int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old); int posix_cpu_timer_del(struct k_itimer *timer); @@ -119,4 +123,15 @@ long clock_nanosleep_restart(struct restart_block *restart_block); void update_rlimit_cpu(unsigned long rlim_new); +int invalid_clockid(const clockid_t which_clock); + +static inline cputime_t prof_ticks(struct task_struct *p) +{ + return cputime_add(p->utime, p->stime); +} +static inline cputime_t virt_ticks(struct task_struct *p) +{ + return p->utime; +} + #endif diff --git a/include/linux/resource.h b/include/linux/resource.h index f1e914e..35f6163 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -73,6 +73,7 @@ struct rlimit { struct task_struct; int getrusage(struct task_struct *p, int who, struct rusage __user *ru); +int do_setrlimit(unsigned int resource, struct rlimit *rlim); #endif /* __KERNEL__ */ diff --git a/include/linux/sched.h b/include/linux/sched.h 
index 8593051..3ff96d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -641,6 +641,10 @@ struct signal_struct { #endif int oom_adj; /* OOM kill score adjustment (bit shift) */ + +#ifdef CONFIG_CHECKPOINT + atomic_t restart_count; /* threads group restart sync */ +#endif }; /* Context switch must be unlocked if interrupts are to be enabled */ @@ -1518,6 +1522,9 @@ struct task_struct { unsigned long memsw_bytes; /* uncharged mem+swap usage */ } memcg_batch; #endif +#ifdef CONFIG_CHECKPOINT + struct ckpt_ctx *checkpoint_ctx; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ @@ -1711,6 +1718,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_RESTARTING 0x00000020 /* Process is restarting (c/r) */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ @@ -2212,7 +2220,7 @@ static inline int task_detached(struct task_struct *p) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. + * ->cgroup.subsys[]. Also protects ->checkpoint_ctx in checkpoint/restart. * * Nests both inside and outside of read_lock(&tasklist_lock). 
* It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/include/linux/security.h b/include/linux/security.h index 3158dd9..7541237 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1582,6 +1582,7 @@ struct security_operations { int (*task_create) (unsigned long clone_flags); int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp); + void (*cred_free) (struct cred *cred); int (*cred_prepare)(struct cred *new, const struct cred *old, gfp_t gfp); @@ -1913,6 +1914,9 @@ void security_release_secctx(char *secdata, u32 seclen); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); + +char *security_get_lsm_name(void); + #else /* CONFIG_SECURITY */ struct security_mnt_opts { }; @@ -1935,6 +1939,12 @@ static inline int security_init(void) return 0; } +#define DEFAULT_LSM_NAME "lsm_none" +static inline char *security_get_lsm_name(void) +{ + return DEFAULT_LSM_NAME; +} + static inline int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { @@ -2682,6 +2692,7 @@ static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32 { return -EOPNOTSUPP; } + #endif /* CONFIG_SECURITY */ #ifdef CONFIG_SECURITY_NETWORK diff --git a/include/linux/sem.h b/include/linux/sem.h index 8a4adbe..8cf9636 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -127,12 +127,14 @@ struct sem_undo { short * semadj; /* array of adjustments, one per semaphore */ }; +struct ipc_namespace; /* sem_undo_list controls shared access to the list of sem_undo structures * that may be shared among all a CLONE_SYSVSEM task group. 
*/ struct sem_undo_list { atomic_t refcnt; spinlock_t lock; + struct ipc_namespace *ipc_ns; struct list_head list_proc; }; diff --git a/include/linux/shm.h b/include/linux/shm.h index eca6235..67fe5e2 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -105,6 +105,9 @@ struct shmid_kernel /* private to the kernel */ #ifdef CONFIG_SYSVIPC long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr); +long do_shmat_pgoff(int shmid, char __user *shmaddr, + int shmflg, unsigned long *addr, + unsigned long shmsize, unsigned long shmpgoff); extern int is_file_shm_hugepages(struct file *file); #else static inline long do_shmat(int shmid, char __user *shmaddr, @@ -118,6 +121,10 @@ static inline int is_file_shm_hugepages(struct file *file) } #endif +struct ipc_namespace; +extern int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, + struct shmid_ds __user *buf, int version); + #endif /* __KERNEL__ */ #endif /* _LINUX_SHM_H_ */ diff --git a/include/linux/signal.h b/include/linux/signal.h index fcd2b14..031784c 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -378,6 +378,9 @@ int unhandled_signal(struct task_struct *tsk, int sig); void signals_init(void); +/* [arch] checkpoint: should saved_sigmask be used in place of blocked */ +int task_has_saved_sigmask(struct task_struct *task); + #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNAL_H */ diff --git a/include/linux/splice.h b/include/linux/splice.h index 18e7c7c..431662c 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -82,4 +82,13 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *, extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +extern int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); +extern long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); +extern long 
do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags); + #endif diff --git a/include/linux/tty.h b/include/linux/tty.h index 4409967..b76140e 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -513,6 +513,10 @@ extern void tty_ldisc_begin(void); /* This last one is just for the tty layer internals and shouldn't be used elsewhere */ extern void tty_ldisc_enable(struct tty_struct *tty); +/* These are for checkpoint/restart */ +extern int tiocsctty(struct tty_struct *tty, int arg); +extern int do_tiocspgrp(struct tty_struct *tty, + struct tty_struct *real_tty, pid_t pgrp_nr); /* n_tty.c */ extern struct tty_ldisc_ops tty_ldisc_N_TTY; diff --git a/include/linux/user.h b/include/linux/user.h index 68daf84..c231e9c 100644 --- a/include/linux/user.h +++ b/include/linux/user.h @@ -1 +1,10 @@ +#ifndef _LINUX_USER_H +#define _LINUX_USER_H + #include +#include + +extern int may_setuid(struct user_namespace *ns, uid_t uid); +extern int may_setgid(gid_t gid); + +#endif diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index cc4f453..f6ea75d 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -20,6 +20,8 @@ extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS +struct user_namespace *new_user_ns(struct user_struct *creator, + struct user_struct **newroot); static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) @@ -38,6 +40,12 @@ static inline void put_user_ns(struct user_namespace *ns) #else +static inline struct user_namespace *new_user_ns(struct user_struct *creator, + struct user_struct **newroot) +{ + return ERR_PTR(-EINVAL); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 69f3997..774001d 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -49,6 +49,7 @@ static 
inline void get_uts_ns(struct uts_namespace *ns) kref_get(&ns->kref); } +extern struct uts_namespace *create_uts_ns(void); extern struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ns); extern void free_uts_ns(struct kref *kref); diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 1614d78..f79e72b 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -68,4 +68,5 @@ static inline int unix_sysctl_register(struct net *net) { return 0; } static inline void unix_sysctl_unregister(struct net *net) {} #endif #endif + #endif diff --git a/include/net/sock.h b/include/net/sock.h index b4603cd..3cf7de4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1645,6 +1645,54 @@ extern void sock_enable_timestamp(struct sock *sk, int flag); extern int sock_get_timestamp(struct sock *, struct timeval __user *); extern int sock_get_timestampns(struct sock *, struct timespec __user *); +/* bind() helper shared between any callers needing to perform a bind on + * behalf of userspace (syscall and restart) with the security hooks. + */ +static inline int sock_bind(struct socket *sock, + struct sockaddr *addr, + int addr_len) +{ + int err; + + err = security_socket_bind(sock, addr, addr_len); + if (err) + return err; + else + return sock->ops->bind(sock, addr, addr_len); +} + +/* getname() helper shared between any callers needing to perform a getname on + * behalf of userspace (syscall and restart) with the security hooks. + */ +static inline int sock_getname(struct socket *sock, + struct sockaddr *addr, + int *addr_len) +{ + int err; + + err = security_socket_getsockname(sock); + if (err) + return err; + else + return sock->ops->getname(sock, addr, addr_len, 0); +} + +/* getpeer() helper shared between any callers needing to perform a getpeer on + * behalf of userspace (syscall and restart) with the security hooks. 
+ */ +static inline int sock_getpeer(struct socket *sock, + struct sockaddr *addr, + int *addr_len) +{ + int err; + + err = security_socket_getpeername(sock); + if (err) + return err; + else + return sock->ops->getname(sock, addr, addr_len, 1); +} + /* * Enable debug/info messages */ diff --git a/init/Kconfig b/init/Kconfig index eb77e8c..424d5b6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -201,6 +201,12 @@ config SYSVIPC section 6.4 of the Linux Programmer's Guide, available from . +config SYSVIPC_CHECKPOINT + bool + depends on SYSVIPC + depends on CHECKPOINT + default y + config SYSVIPC_SYSCTL bool depends on SYSVIPC @@ -664,7 +670,7 @@ config RELAY If unsure, say N. -config NAMESPACES +menuconfig NAMESPACES bool "Namespaces support" if EMBEDDED default !EMBEDDED help @@ -715,6 +721,8 @@ config NET_NS Allow user space to create what appear to be multiple instances of the network stack. +source "kernel/checkpoint/Kconfig" + config BLK_DEV_INITRD bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" depends on BROKEN || !FRV diff --git a/ipc/Makefile b/ipc/Makefile index 9075e17..55e38d4 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -9,4 +9,5 @@ obj_mq-$(CONFIG_COMPAT) += compat_mq.o obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) obj-$(CONFIG_IPC_NS) += namespace.o obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o - +obj-$(CONFIG_SYSVIPC_CHECKPOINT) += checkpoint.o \ + checkpoint_shm.o checkpoint_msg.o checkpoint_sem.o diff --git a/ipc/msg.c b/ipc/msg.c index 9547cb7..9ef6a5e 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -71,8 +71,7 @@ struct msg_sender { #define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) -static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); -static int newque(struct ipc_namespace *, struct ipc_params *); +static int newque(struct ipc_namespace *, struct ipc_params *, int); #ifdef CONFIG_PROC_FS static int sysvipc_msg_proc_show(struct seq_file *s, void *it); #endif @@ -174,10 +173,12 @@ static inline 
void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) * newque - Create a new msg queue * @ns: namespace * @params: ptr to the structure that contains the key and msgflg + * @req_id: request desired id if available (-1 if don't care) * * Called with msg_ids.rw_mutex held (writer) */ -static int newque(struct ipc_namespace *ns, struct ipc_params *params) +static int +newque(struct ipc_namespace *ns, struct ipc_params *params, int req_id) { struct msg_queue *msq; int id, retval; @@ -201,7 +202,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) /* * ipc_addid() locks msq */ - id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); + id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, req_id); if (id < 0) { security_msg_queue_free(msq); ipc_rcu_putref(msq); @@ -276,7 +277,7 @@ static void expunge_all(struct msg_queue *msq, int res) * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held * before freeque() is called. msg_ids.rw_mutex remains locked on exit. 
*/ -static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) +void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct list_head *tmp; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); @@ -309,14 +310,11 @@ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) return security_msg_queue_associate(msq, msgflg); } -SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) +int do_msgget(struct ipc_namespace *ns, key_t key, int msgflg, int req_id) { - struct ipc_namespace *ns; struct ipc_ops msg_ops; struct ipc_params msg_params; - ns = current->nsproxy->ipc_ns; - msg_ops.getnew = newque; msg_ops.associate = msg_security; msg_ops.more_checks = NULL; @@ -324,7 +322,12 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) msg_params.key = key; msg_params.flg = msgflg; - return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); + return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params, req_id); +} + +SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) +{ + return do_msgget(current->nsproxy->ipc_ns, key, msgflg, -1); } static inline unsigned long diff --git a/ipc/msgutil.c b/ipc/msgutil.c index f095ee2..e119243 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -36,14 +36,6 @@ struct ipc_namespace init_ipc_ns = { atomic_t nr_ipc_ns = ATOMIC_INIT(1); -struct msg_msgseg { - struct msg_msgseg* next; - /* the next part of the message follows immediately */ -}; - -#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) -#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) - struct msg_msg *load_msg(const void __user *src, int len) { struct msg_msg *msg; diff --git a/ipc/namespace.c b/ipc/namespace.c index a1094ff..8e5ea32 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -14,7 +14,7 @@ #include "util.h" -static struct ipc_namespace *create_ipc_ns(void) +struct ipc_namespace *create_ipc_ns(void) { struct ipc_namespace *ns; int err; diff --git a/ipc/sem.c b/ipc/sem.c index dbef95b..4fca49a 100644 --- a/ipc/sem.c +++ 
b/ipc/sem.c @@ -92,8 +92,7 @@ #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) #define sem_checkid(sma, semid) ipc_checkid(&sma->sem_perm, semid) -static int newary(struct ipc_namespace *, struct ipc_params *); -static void freeary(struct ipc_namespace *, struct kern_ipc_perm *); +static int newary(struct ipc_namespace *, struct ipc_params *, int); #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #endif @@ -133,14 +132,6 @@ void sem_exit_ns(struct ipc_namespace *ns) } #endif -void __init sem_init (void) -{ - sem_init_ns(&init_ipc_ns); - ipc_init_proc_interface("sysvipc/sem", - " key semid perms nsems uid gid cuid cgid otime ctime\n", - IPC_SEM_IDS, sysvipc_sem_proc_show); -} - /* * sem_lock_(check_) routines are called in the paths where the rw_mutex * is not held. @@ -228,11 +219,13 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * newary - Create a new semaphore set * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems + * @req_id: request desired id if available (-1 if don't care) * * Called with sem_ids.rw_mutex held (as a writer) */ -static int newary(struct ipc_namespace *ns, struct ipc_params *params) +static int +newary(struct ipc_namespace *ns, struct ipc_params *params, int req_id) { int id; int retval; @@ -265,7 +258,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) return retval; } - id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); + id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, req_id); if (id < 0) { security_sem_free(sma); ipc_rcu_putref(sma); @@ -315,14 +308,12 @@ static inline int sem_more_checks(struct kern_ipc_perm *ipcp, return 0; } -SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) +int do_semget(struct ipc_namespace *ns, key_t key, int nsems, + int semflg, int req_id) { - struct ipc_namespace *ns; struct ipc_ops sem_ops; struct ipc_params sem_params; - ns = current->nsproxy->ipc_ns; 
- if (nsems < 0 || nsems > ns->sc_semmsl) return -EINVAL; @@ -334,7 +325,12 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) sem_params.flg = semflg; sem_params.u.nsems = nsems; - return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); + return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params, req_id); +} + +SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) +{ + return do_semget(current->nsproxy->ipc_ns, key, nsems, semflg, -1); } /* @@ -567,7 +563,7 @@ static void free_un(struct rcu_head *head) * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex * remains locked on exit. */ -static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) +void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct sem_undo *un, *tu; struct sem_queue *q, *tq; @@ -979,6 +975,21 @@ asmlinkage long SyS_semctl(int semid, int semnum, int cmd, union semun arg) SYSCALL_ALIAS(sys_semctl, SyS_semctl); #endif +static struct sem_undo_list *alloc_undo_list(struct ipc_namespace *ipc_ns) +{ + struct sem_undo_list *undo_list; + + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); + if (undo_list == NULL) + return NULL; + spin_lock_init(&undo_list->lock); + atomic_set(&undo_list->refcnt, 1); + INIT_LIST_HEAD(&undo_list->list_proc); + undo_list->ipc_ns = ipc_ns; + + return undo_list; +} + /* If the task doesn't already have a undo_list, then allocate one * here. We guarantee there is only one thread using this undo list, * and current is THE ONE @@ -990,19 +1001,16 @@ SYSCALL_ALIAS(sys_semctl, SyS_semctl); * * This can block, so callers must hold no locks. 
*/ -static inline int get_undo_list(struct sem_undo_list **undo_listp) +static inline int get_undo_list(struct sem_undo_list **undo_listp, + struct ipc_namespace *ipc_ns) { struct sem_undo_list *undo_list; undo_list = current->sysvsem.undo_list; if (!undo_list) { - undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); - if (undo_list == NULL) + undo_list = alloc_undo_list(ipc_ns); + if (!undo_list) return -ENOMEM; - spin_lock_init(&undo_list->lock); - atomic_set(&undo_list->refcnt, 1); - INIT_LIST_HEAD(&undo_list->list_proc); - current->sysvsem.undo_list = undo_list; } *undo_listp = undo_list; @@ -1035,7 +1043,7 @@ static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) } /** - * find_alloc_undo - Lookup (and if not present create) undo array + * __find_alloc_undo - Lookup (and if not present create) undo array * @ns: namespace * @semid: semaphore array id * @@ -1045,7 +1053,8 @@ static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) * Lifetime-rules: sem_undo is rcu-protected, on success, the function * performs a rcu_read_lock(). 
*/ -static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) +static struct sem_undo *__find_alloc_undo(struct ipc_namespace *ns, int semid, + short checkperms) { struct sem_array *sma; struct sem_undo_list *ulp; @@ -1053,7 +1062,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) int nsems; int error; - error = get_undo_list(&ulp); + error = get_undo_list(&ulp, ns); if (error) return ERR_PTR(error); @@ -1071,6 +1080,11 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) if (IS_ERR(sma)) return ERR_PTR(PTR_ERR(sma)); + if (checkperms && ipcperms(&sma->sem_perm, checkperms)) { + sem_unlock(sma); + return ERR_PTR(-EPERM); + } + nsems = sma->sem_nsems; sem_getref_and_unlock(sma); @@ -1117,6 +1131,11 @@ out: return un; } +static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) +{ + return __find_alloc_undo(ns, semid, 0); +} + SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned, nsops, const struct timespec __user *, timeout) { @@ -1324,7 +1343,7 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) int error; if (clone_flags & CLONE_SYSVSEM) { - error = get_undo_list(&undo_list); + error = get_undo_list(&undo_list, tsk->nsproxy->ipc_ns); if (error) return error; atomic_inc(&undo_list->refcnt); @@ -1347,14 +1366,8 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) * The current implementation does not do so. The POSIX standard * and SVID should be consulted to determine what behavior is mandated. 
*/ -void exit_sem(struct task_struct *tsk) +static void put_undo_list(struct sem_undo_list *ulp) { - struct sem_undo_list *ulp; - - ulp = tsk->sysvsem.undo_list; - if (!ulp) - return; - tsk->sysvsem.undo_list = NULL; if (!atomic_dec_and_test(&ulp->refcnt)) return; @@ -1377,7 +1390,7 @@ void exit_sem(struct task_struct *tsk) if (semid == -1) break; - sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid); + sma = sem_lock_check(ulp->ipc_ns, un->semid); /* exit_sem raced with IPC_RMID, nothing to do */ if (IS_ERR(sma)) @@ -1435,6 +1448,16 @@ void exit_sem(struct task_struct *tsk) kfree(ulp); } +void exit_sem(struct task_struct *tsk) +{ + struct sem_undo_list *ulp = tsk->sysvsem.undo_list; + + if (ulp) { + put_undo_list(ulp); + tsk->sysvsem.undo_list = NULL; + } +} + #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it) { @@ -1454,3 +1477,19 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) sma->sem_ctime); } #endif + +void __init sem_init (void) +{ + sem_init_ns(&init_ipc_ns); + ipc_init_proc_interface("sysvipc/sem", + " key semid perms nsems uid gid cuid cgid otime ctime\n", + IPC_SEM_IDS, sysvipc_sem_proc_show); + +#ifdef CONFIG_CHECKPOINT + /* sem_undo_list uses a short but we write a __s16 */ + CKPT_BUILD_BUG_ON_MISMATCH(*CKPT_STRUCT_MEMBER(sem_undo, semadj), + __s16); + + register_checkpoint_obj(&ckpt_obj_sem_undo_ops); +#endif +} diff --git a/ipc/shm.c b/ipc/shm.c index 1a314c8..ce41555 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -61,7 +62,7 @@ static const struct vm_operations_struct shm_vm_ops; #define shm_unlock(shp) \ ipc_unlock(&(shp)->shm_perm) -static int newseg(struct ipc_namespace *, struct ipc_params *); +static int newseg(struct ipc_namespace *, struct ipc_params *, int); static void shm_open(struct vm_area_struct *vma); static void shm_close(struct vm_area_struct *vma); static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel 
*shp); @@ -82,7 +83,7 @@ void shm_init_ns(struct ipc_namespace *ns) * Called with shm_ids.rw_mutex (writer) and the shp structure locked. * Only shm_ids.rw_mutex remains locked on exit. */ -static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) +void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); @@ -329,11 +330,13 @@ static const struct vm_operations_struct shm_vm_ops = { * newseg - Create a new shared memory segment * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg + * @req_id: request desired id if available (-1 if don't care) * * Called with shm_ids.rw_mutex held as a writer. */ -static int newseg(struct ipc_namespace *ns, struct ipc_params *params) +static int +newseg(struct ipc_namespace *ns, struct ipc_params *params, int req_id) { key_t key = params->key; int shmflg = params->flg; @@ -388,7 +391,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (IS_ERR(file)) goto no_file; - id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); + id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, req_id); if (id < 0) { error = id; goto no_id; @@ -448,14 +451,12 @@ static inline int shm_more_checks(struct kern_ipc_perm *ipcp, return 0; } -SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) +int do_shmget(struct ipc_namespace *ns, key_t key, size_t size, + int shmflg, int req_id) { - struct ipc_namespace *ns; struct ipc_ops shm_ops; struct ipc_params shm_params; - ns = current->nsproxy->ipc_ns; - shm_ops.getnew = newseg; shm_ops.associate = shm_security; shm_ops.more_checks = shm_more_checks; @@ -464,7 +465,12 @@ SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) shm_params.flg = shmflg; shm_params.u.size = size; - return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); + return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params, req_id); +} + 
+SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) +{ + return do_shmget(current->nsproxy->ipc_ns, key, size, shmflg, -1); } static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) @@ -595,8 +601,8 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, * to be held in write mode. * NOTE: no locks must be held, the rw_mutex is taken inside this function. */ -static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, - struct shmid_ds __user *buf, int version) +int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, + struct shmid_ds __user *buf, int version) { struct kern_ipc_perm *ipcp; struct shmid64_ds shmid64; @@ -810,11 +816,13 @@ out: * "raddr" thing points to kernel space, and there has to be a wrapper around * this. */ -long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) +long do_shmat_pgoff(int shmid, char __user *shmaddr, int shmflg, + ulong *raddr, ulong shmsize, ulong shmpgoff) { struct shmid_kernel *shp; unsigned long addr; unsigned long size; + unsigned long pgoff; struct file * file; int err; unsigned long flags; @@ -886,6 +894,17 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) size = i_size_read(path.dentry->d_inode); shm_unlock(shp); + pgoff = 0; + + err = -EINVAL; + if (shmsize) { + if (shmpgoff + shmsize > size || + shmpgoff + shmsize < shmpgoff) + goto out_put_dentry; + size = shmsize; + pgoff = shmpgoff; + } + err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); if (!sfd) @@ -919,7 +938,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) goto invalid; } - user_addr = do_mmap (file, addr, size, prot, flags, 0); + user_addr = do_mmap (file, addr, size, prot, flags, pgoff); *raddr = user_addr; err = 0; if (IS_ERR_VALUE(user_addr)) @@ -955,6 +974,16 @@ out_put_dentry: goto out_nattch; } +/* + * NOTE! Despite the name, this is NOT a direct system call entrypoint. 
The + * "raddr" thing points to kernel space, and there has to be a wrapper around + * this. + */ +long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) +{ + return do_shmat_pgoff(shmid, shmaddr, shmflg, raddr, 0, 0); +} + SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) { unsigned long ret; diff --git a/ipc/util.c b/ipc/util.c index 79ce84e..c4ce60d 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -247,10 +247,12 @@ int ipc_get_maxid(struct ipc_ids *ids) * Called with ipc_ids.rw_mutex held as a writer. */ -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) +int +ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size, int req_id) { uid_t euid; gid_t egid; + int lid = 0; int id, err; if (size > IPCMNI) @@ -259,28 +261,41 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) if (ids->in_use >= size) return -ENOSPC; + if (req_id >= 0) + lid = ipcid_to_idx(req_id); + spin_lock_init(&new->lock); new->deleted = 0; rcu_read_lock(); spin_lock(&new->lock); - err = idr_get_new(&ids->ipcs_idr, new, &id); + err = idr_get_new_above(&ids->ipcs_idr, new, lid, &id); if (err) { spin_unlock(&new->lock); rcu_read_unlock(); return err; } + if (req_id >= 0) { + if (id != lid) { + idr_remove(&ids->ipcs_idr, id); + spin_unlock(&new->lock); + rcu_read_unlock(); + return -EBUSY; + } + new->seq = req_id / SEQ_MULTIPLIER; + } else { + new->seq = ids->seq++; + if (ids->seq > ids->seq_max) + ids->seq = 0; + } + ids->in_use++; current_euid_egid(&euid, &egid); new->cuid = new->uid = euid; new->gid = new->cgid = egid; - new->seq = ids->seq++; - if(ids->seq > ids->seq_max) - ids->seq = 0; - new->id = ipc_buildid(id, new->seq); return id; } @@ -296,7 +311,7 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) * when the key is IPC_PRIVATE. 
*/ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, - struct ipc_ops *ops, struct ipc_params *params) + struct ipc_ops *ops, struct ipc_params *params, int req_id) { int err; retry: @@ -306,7 +321,7 @@ retry: return -ENOMEM; down_write(&ids->rw_mutex); - err = ops->getnew(ns, params); + err = ops->getnew(ns, params, req_id); up_write(&ids->rw_mutex); if (err == -EAGAIN) @@ -351,6 +366,7 @@ static int ipc_check_perms(struct kern_ipc_perm *ipcp, struct ipc_ops *ops, * @ids: IPC identifer set * @ops: the actual creation routine to call * @params: its parameters + * @req_id: request desired id if available (-1 if don't care) * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE. @@ -360,7 +376,7 @@ static int ipc_check_perms(struct kern_ipc_perm *ipcp, struct ipc_ops *ops, * On success, the ipc id is returned. */ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, - struct ipc_ops *ops, struct ipc_params *params) + struct ipc_ops *ops, struct ipc_params *params, int req_id) { struct kern_ipc_perm *ipcp; int flg = params->flg; @@ -381,7 +397,7 @@ retry: else if (!err) err = -ENOMEM; else - err = ops->getnew(ns, params); + err = ops->getnew(ns, params, req_id); } else { /* ipc object has been locked by ipc_findkey() */ @@ -742,12 +758,12 @@ struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) * Common routine called by sys_msgget(), sys_semget() and sys_shmget(). 
*/ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, - struct ipc_ops *ops, struct ipc_params *params) + struct ipc_ops *ops, struct ipc_params *params, int req_id) { if (params->key == IPC_PRIVATE) - return ipcget_new(ns, ids, ops, params); + return ipcget_new(ns, ids, ops, params, req_id); else - return ipcget_public(ns, ids, ops, params); + return ipcget_public(ns, ids, ops, params, req_id); } /** diff --git a/ipc/util.h b/ipc/util.h index 764b51a..62ea760 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -12,6 +12,7 @@ #include #include +#include #define SEQ_MULTIPLIER (IPCMNI) @@ -71,7 +72,7 @@ struct ipc_params { * . routine to call for an extra check if needed */ struct ipc_ops { - int (*getnew) (struct ipc_namespace *, struct ipc_params *); + int (*getnew) (struct ipc_namespace *, struct ipc_params *, int); int (*associate) (struct kern_ipc_perm *, int); int (*more_checks) (struct kern_ipc_perm *, struct ipc_params *); }; @@ -94,7 +95,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) /* must be called with ids->rw_mutex acquired for writing */ -int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); +int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int); /* must be called with ids->rw_mutex acquired for reading */ int ipc_get_maxid(struct ipc_ids *); @@ -140,6 +141,14 @@ extern void free_msg(struct msg_msg *msg); extern struct msg_msg *load_msg(const void __user *src, int len); extern int store_msg(void __user *dest, struct msg_msg *msg, int len); +struct msg_msgseg { + struct msg_msgseg *next; + /* the next part of the message follows immediately */ +}; + +#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) +#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) + extern void recompute_msgmni(struct ipc_namespace *); static inline int ipc_buildid(int id, int seq) @@ -171,7 +180,22 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) struct 
kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, - struct ipc_ops *ops, struct ipc_params *params); + struct ipc_ops *ops, struct ipc_params *params, int req_id); void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, - void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)); + void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)); + +struct ipc_namespace *create_ipc_ns(void); + +int do_shmget(struct ipc_namespace *ns, key_t key, size_t size, int shmflg, + int req_id); +void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp); + +int do_msgget(struct ipc_namespace *ns, key_t key, int msgflg, int req_id); +void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp); + +int do_semget(struct ipc_namespace *ns, key_t key, int nsems, int semflg, + int req_id); +void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp); + + #endif diff --git a/kernel/Makefile b/kernel/Makefile index a987aa1..67159cd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -25,6 +25,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_perf_event.o = -pg endif +obj-$(CONFIG_DEFERQUEUE) += deferqueue.o obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o @@ -105,6 +106,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o +obj-$(CONFIG_CHECKPOINT) += checkpoint/ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/capability.c b/kernel/capability.c index 9e4697e..4f868b3 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include "cred-internals.h" @@ -215,6 +217,45 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, 
cap_user_data_t, dataptr) return ret; } +static int do_capset_tocred(kernel_cap_t *effective, kernel_cap_t *inheritable, + kernel_cap_t *permitted, struct cred *new) +{ + int ret; + + ret = security_capset(new, current_cred(), + effective, inheritable, permitted); + if (ret < 0) + return ret; + + /* + * for checkpoint-restart, do we want to wait until end of restart? + * not sure we care */ + audit_log_capset(current->pid, new, current_cred()); + + return 0; +} + +static int do_capset(kernel_cap_t *effective, kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = do_capset_tocred(effective, inheritable, permitted, new); + if (ret < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return ret; +} + /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and @@ -238,7 +279,6 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; - struct cred *new; int ret; pid_t pid; @@ -272,22 +312,52 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) i++; } - new = prepare_creds(); - if (!new) - return -ENOMEM; + return do_capset(&effective, &inheritable, &permitted); - ret = security_capset(new, current_cred(), - &effective, &inheritable, &permitted); - if (ret < 0) - goto error; +} + +int apply_securebits(unsigned securebits, struct cred *new) +{ + if ((((new->securebits & SECURE_ALL_LOCKS) >> 1) + & (new->securebits ^ securebits)) /*[1]*/ + || ((new->securebits & SECURE_ALL_LOCKS & ~securebits)) /*[2]*/ + || (securebits & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ + || (cap_capable(current, current_cred(), CAP_SETPCAP, + SECURITY_CAP_AUDIT) != 0) /*[4]*/ + /* + * [1] 
no changing of bits that are locked + * [2] no unlocking of locks + * [3] no setting of unsupported bits + * [4] doing anything requires privilege (go read about + * the "sendmail capabilities bug") + */ + ) + /* cannot change a locked bit */ + return -EPERM; + new->securebits = securebits; + return 0; +} - audit_log_capset(pid, new, current_cred()); +static void do_capbset_drop(struct cred *cred, int cap) +{ + cap_lower(cred->cap_bset, cap); +} - return commit_creds(new); +static inline int restore_cap_bset(kernel_cap_t bset, struct cred *cred) +{ + int i, may_dropbcap = capable(CAP_SETPCAP); + + for (i = 0; i < CAP_LAST_CAP; i++) { + if (cap_raised(bset, i)) + continue; + if (!cap_raised(current_cred()->cap_bset, i)) + continue; + if (!may_dropbcap) + return -EPERM; + do_capbset_drop(cred, i); + } -error: - abort_creds(new); - return ret; + return 0; } /** diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index da5e139..8f923d8 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -26,6 +26,7 @@ enum freezer_state { CGROUP_THAWED = 0, CGROUP_FREEZING, CGROUP_FROZEN, + CGROUP_CHECKPOINTING, }; struct freezer { @@ -64,6 +65,44 @@ int cgroup_freezing_or_frozen(struct task_struct *task) return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); } +/* Task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); +} + +/* + * caller must hold freezer->lock + */ +static void update_freezer_state(struct cgroup *cgroup, + struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int nfrozen = 0, ntotal = 0; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + ntotal++; + if (is_task_frozen_enough(task)) + nfrozen++; + } + + /* + * Transition to FROZEN when no new tasks can be added ensures + * that we never exist in the 
FROZEN state while there are unfrozen + * tasks. + */ + if (nfrozen == ntotal) + freezer->state = CGROUP_FROZEN; + else if (nfrozen > 0) + freezer->state = CGROUP_FREEZING; + else + freezer->state = CGROUP_THAWED; + cgroup_iter_end(cgroup, &it); +} + /* * cgroups_write_string() limits the size of freezer state strings to * CGROUP_LOCAL_BUFFER_SIZE @@ -72,6 +111,7 @@ static const char *freezer_state_strs[] = { "THAWED", "FREEZING", "FROZEN", + "CHECKPOINTING", }; /* @@ -79,9 +119,9 @@ static const char *freezer_state_strs[] = { * Transitions are caused by userspace writes to the freezer.state file. * The values in parenthesis are state labels. The rest are edge labels. * - * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) - * ^ ^ | | - * | \_______THAWED_______/ | + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) --> (CHECKPOINTING) + * ^ ^ | | ^ | + * | \_______THAWED_______/ | \_____________/ * \__________________________THAWED____________/ */ @@ -89,10 +129,10 @@ struct cgroup_subsys freezer_subsys; /* Locks taken and their ordering * ------------------------------ - * css_set_lock * cgroup_mutex (AKA cgroup_lock) - * task->alloc_lock (AKA task_lock) * freezer->lock + * css_set_lock + * task->alloc_lock (AKA task_lock) * task->sighand->siglock * * cgroup code forces css_set_lock to be taken before task->alloc_lock @@ -100,33 +140,38 @@ struct cgroup_subsys freezer_subsys; * freezer_create(), freezer_destroy(): * cgroup_mutex [ by cgroup core ] * - * can_attach(): - * cgroup_mutex + * freezer_can_attach(): + * cgroup_mutex (held by caller of can_attach) * - * cgroup_frozen(): + * cgroup_freezing_or_frozen(): * task->alloc_lock (to get task's cgroup) * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * task->alloc_lock (to get task's cgroup) * freezer->lock * sighand->siglock (if the cgroup is freezing) * * freezer_read(): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * 
task->alloc_lock * read_lock css_set_lock (cgroup iterator start) * * freezer_write() (freeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock + * sighand->siglock (fake signal delivery inside freeze_task()) * * freezer_write() (unfreeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (to prevent races with freeze_task()) + * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, @@ -149,13 +194,6 @@ static void freezer_destroy(struct cgroup_subsys *ss, kfree(cgroup_freezer(cgroup)); } -/* Task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); -} - /* * The call to cgroup_lock() in the freezer.state write method prevents * a write to that file racing against an attach, and hence the @@ -225,37 +263,6 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) spin_unlock_irq(&freezer->lock); } -/* - * caller must hold freezer->lock - */ -static void update_freezer_state(struct cgroup *cgroup, - struct freezer *freezer) -{ - struct cgroup_iter it; - struct task_struct *task; - unsigned int nfrozen = 0, ntotal = 0; - - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - ntotal++; - if (is_task_frozen_enough(task)) - nfrozen++; - } - - /* - * Transition to FROZEN when no new tasks can be added ensures - * that we never exist in the FROZEN state while there are unfrozen - * tasks. 
- */ - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - else if (nfrozen > 0) - freezer->state = CGROUP_FREEZING; - else - freezer->state = CGROUP_THAWED; - cgroup_iter_end(cgroup, &it); -} - static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { @@ -326,7 +333,10 @@ static int freezer_change_state(struct cgroup *cgroup, freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); - + if (freezer->state == CGROUP_CHECKPOINTING) { + retval = -EBUSY; + goto out; + } update_freezer_state(cgroup, freezer); if (goal_state == freezer->state) goto out; @@ -394,3 +404,107 @@ struct cgroup_subsys freezer_subsys = { .fork = freezer_fork, .exit = NULL, }; + +#ifdef CONFIG_CHECKPOINT +/* + * Caller is expected to ensure that neither @p nor @q may change its + * freezer cgroup during this test in a way that may affect the result. + * E.g., when called from c/r, @p must be in CHECKPOINTING cgroup, so + * may not change cgroup, and either @q is also there, or is not there + * and may not join. + */ +int in_same_cgroup_freezer(struct task_struct *p, struct task_struct *q) +{ + struct cgroup_subsys_state *p_css, *q_css; + + task_lock(p); + p_css = task_subsys_state(p, freezer_subsys_id); + task_unlock(p); + + task_lock(q); + q_css = task_subsys_state(q, freezer_subsys_id); + task_unlock(q); + + return (p_css == q_css); +} + +/* + * cgroup freezer state changes made without the aid of the cgroup filesystem + * must go through this function to ensure proper locking is observed.
+ */ +static int freezer_checkpointing(struct task_struct *task, + enum freezer_state next_state) +{ + struct freezer *freezer; + struct cgroup_subsys_state *css; + enum freezer_state state; + + task_lock(task); + css = task_subsys_state(task, freezer_subsys_id); + css_get(css); /* make sure freezer doesn't go away */ + freezer = container_of(css, struct freezer, css); + task_unlock(task); + + if (freezer->state == CGROUP_FREEZING) { + /* May be in middle of a lazy FREEZING -> FROZEN transition */ + if (cgroup_lock_live_group(css->cgroup)) { + spin_lock_irq(&freezer->lock); + update_freezer_state(css->cgroup, freezer); + spin_unlock_irq(&freezer->lock); + cgroup_unlock(); + } + } + + spin_lock_irq(&freezer->lock); + state = freezer->state; + if ((state == CGROUP_FROZEN && next_state == CGROUP_CHECKPOINTING) || + (state == CGROUP_CHECKPOINTING && next_state == CGROUP_FROZEN)) + freezer->state = next_state; + spin_unlock_irq(&freezer->lock); + css_put(css); + return state; +} + +int cgroup_freezer_begin_checkpoint(struct task_struct *task) +{ + if (freezer_checkpointing(task, CGROUP_CHECKPOINTING) != CGROUP_FROZEN) + return -EBUSY; + return 0; +} + +void cgroup_freezer_end_checkpoint(struct task_struct *task) +{ + /* + * If we weren't in CHECKPOINTING state then userspace could have + * unfrozen a task and given us an inconsistent checkpoint image + */ + WARN_ON(freezer_checkpointing(task, CGROUP_FROZEN) != CGROUP_CHECKPOINTING); +} + +int cgroup_freezer_make_frozen(struct task_struct *task) +{ + struct freezer *freezer; + struct cgroup_subsys_state *css; + int ret = -ENODEV; + + task_lock(task); + css = task_subsys_state(task, freezer_subsys_id); + css_get(css); /* make sure freezer doesn't go away */ + freezer = container_of(css, struct freezer, css); + task_unlock(task); + + /* Never freeze the root cgroup */ + if (!test_bit(CSS_ROOT, &css->flags) && + cgroup_lock_live_group(css->cgroup)) { + /* do not freeze ourselves, ei ?! 
*/ + if (css != task_subsys_state(current, freezer_subsys_id)) + ret = freezer_change_state(css->cgroup, CGROUP_FROZEN); + else + ret = -EPERM; + cgroup_unlock(); + } + + css_put(css); + return ret; +} +#endif /* CONFIG_CHECKPOINT */ diff --git a/kernel/compat.c b/kernel/compat.c index 7f40e92..8b18f5d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -101,7 +101,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static long compat_nanosleep_restart(struct restart_block *restart) +long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; struct timespec rmt; @@ -648,7 +648,7 @@ long compat_sys_clock_getres(clockid_t which_clock, return err; } -static long compat_clock_nanosleep_restart(struct restart_block *restart) +long compat_clock_nanosleep_restart(struct restart_block *restart) { long err; mm_segment_t oldfs; diff --git a/kernel/cred.c b/kernel/cred.c index e1dbe9e..9abe8fa 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "cred-internals.h" #if 0 @@ -895,3 +896,118 @@ void validate_creds_for_do_exit(struct task_struct *tsk) } #endif /* CONFIG_DEBUG_CREDENTIALS */ + +int cred_setresuid(struct cred *new, uid_t ruid, uid_t euid, uid_t suid) +{ + int retval; + const struct cred *old; + + retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); + if (retval) + return retval; + old = current_cred(); + + if (!capable(CAP_SETUID)) { + if (ruid != (uid_t) -1 && ruid != old->uid && + ruid != old->euid && ruid != old->suid) + return -EPERM; + if (euid != (uid_t) -1 && euid != old->uid && + euid != old->euid && euid != old->suid) + return -EPERM; + if (suid != (uid_t) -1 && suid != old->uid && + suid != old->euid && suid != old->suid) + return -EPERM; + } + + if (ruid != (uid_t) -1) { + new->uid = ruid; + if (ruid != old->uid) { + retval = set_user(new); + if (retval 
< 0) + return retval; + } + } + if (euid != (uid_t) -1) + new->euid = euid; + if (suid != (uid_t) -1) + new->suid = suid; + new->fsuid = new->euid; + + return security_task_fix_setuid(new, old, LSM_SETID_RES); +} + +int cred_setresgid(struct cred *new, gid_t rgid, gid_t egid, + gid_t sgid) +{ + const struct cred *old = current_cred(); + int retval; + + retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); + if (retval) + return retval; + + if (!capable(CAP_SETGID)) { + if (rgid != (gid_t) -1 && rgid != old->gid && + rgid != old->egid && rgid != old->sgid) + return -EPERM; + if (egid != (gid_t) -1 && egid != old->gid && + egid != old->egid && egid != old->sgid) + return -EPERM; + if (sgid != (gid_t) -1 && sgid != old->gid && + sgid != old->egid && sgid != old->sgid) + return -EPERM; + } + + if (rgid != (gid_t) -1) + new->gid = rgid; + if (egid != (gid_t) -1) + new->egid = egid; + if (sgid != (gid_t) -1) + new->sgid = sgid; + new->fsgid = new->egid; + return 0; +} + +int cred_setfsuid(struct cred *new, uid_t uid, uid_t *old_fsuid) +{ + const struct cred *old; + + old = current_cred(); + *old_fsuid = old->fsuid; + + if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) + return -EPERM; + + if (uid == old->uid || uid == old->euid || + uid == old->suid || uid == old->fsuid || + capable(CAP_SETUID)) { + if (uid != *old_fsuid) { + new->fsuid = uid; + if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) + return 0; + } + } + return -EPERM; +} + +int cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid) +{ + const struct cred *old; + + old = current_cred(); + *old_fsgid = old->fsgid; + + if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) + return -EPERM; + + if (gid == old->gid || gid == old->egid || + gid == old->sgid || gid == old->fsgid || + capable(CAP_SETGID)) { + if (gid != *old_fsgid) { + new->fsgid = gid; + return 0; + } + } + return -EPERM; +} + diff --git a/kernel/exit.c b/kernel/exit.c index 7f2683a..0ef6685 
100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -303,6 +304,10 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; + /* restarting zombie doesn't trigger signals */ + if (tsk->flags & PF_RESTARTING) + return; + if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. @@ -792,7 +797,7 @@ static void forget_original_parent(struct task_struct *father) BUG_ON(task_ptrace(t)); t->parent = t->real_parent; } - if (t->pdeath_signal) + if (t->pdeath_signal && !(t->flags & PF_RESTARTING)) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); } while_each_thread(p, t); @@ -1010,6 +1015,10 @@ NORET_TYPE void do_exit(long code) if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif +#ifdef CONFIG_CHECKPOINT + if (unlikely(tsk->checkpoint_ctx)) + exit_checkpoint(tsk); +#endif /* * Make sure we are holding no locks: */ diff --git a/kernel/fork.c b/kernel/fork.c index 9d5be5c..86ced8c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -886,6 +887,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; +#ifdef CONFIG_CHECKPOINT + atomic_set(&sig->restart_count, 0); +#endif return 0; } @@ -1226,6 +1230,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! 
*/ write_lock_irq(&tasklist_lock); +#ifdef CONFIG_CHECKPOINT + /* If parent is restarting, child should be too */ + if (unlikely(current->checkpoint_ctx)) + p->checkpoint_ctx = ckpt_ctx_get(current->checkpoint_ctx); +#endif + /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; diff --git a/kernel/futex.c b/kernel/futex.c index e7a35f1..baaecb4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1593,16 +1593,6 @@ handle_fault: goto retry; } -/* - * In case we must use restart_block to restart a futex_wait, - * we encode in the 'flags' shared capability - */ -#define FLAGS_SHARED 0x01 -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 - -static long futex_wait_restart(struct restart_block *restart); - /** * fixup_owner() - Post lock pi_state and corner case management * @uaddr: user address of the futex @@ -1876,7 +1866,7 @@ out: } -static long futex_wait_restart(struct restart_block *restart) +long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; int fshared = 0; @@ -2352,13 +2342,7 @@ out: * the list. There can only be one such pending lock. 
*/ -/** - * sys_set_robust_list() - Set the robust-futex list head of a task - * @head: pointer to the list-head - * @len: length of the list-head, as userspace expects - */ -SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, - size_t, len) +long do_set_robust_list(struct robust_list_head __user *head, size_t len) { if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -2374,6 +2358,17 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, } /** + * sys_set_robust_list() - Set the robust-futex list head of a task + * @head: pointer to the list-head + * @len: length of the list-head, as userspace expects + */ +SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, + size_t, len) +{ + return do_set_robust_list(head, len); +} + +/** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] * @head_ptr: pointer to a list-head pointer, the kernel fills it in diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2..900bb2b 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -114,9 +114,9 @@ void compat_exit_robust_list(struct task_struct *curr) } } -asmlinkage long -compat_sys_set_robust_list(struct compat_robust_list_head __user *head, - compat_size_t len) +long +do_compat_set_robust_list(struct compat_robust_list_head __user *head, + compat_size_t len) { if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -130,6 +130,13 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, } asmlinkage long +compat_sys_set_robust_list(struct compat_robust_list_head __user *head, + compat_size_t len) +{ + return do_compat_set_robust_list(head, len); +} + +asmlinkage long compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr) { diff --git a/kernel/groups.c b/kernel/groups.c index 2b45b2e..9b0a176 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -6,6 +6,7 @@ #include #include 
#include +#include #include /* init to 2 - one for init_task, one to ensure it is never freed */ diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c0..5f96b1c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,6 +22,7 @@ #include #include #include +#include static struct kmem_cache *nsproxy_cachep; @@ -236,7 +237,11 @@ void exit_task_namespaces(struct task_struct *p) static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); +#ifdef CONFIG_CHECKPOINT + return checkpoint_register_nsproxy(); +#else return 0; +#endif } module_init(nsproxy_cache_init); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bc7704b..fd35ef1 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -168,15 +168,6 @@ static void bump_cpu_timer(struct k_itimer *timer, } } -static inline cputime_t prof_ticks(struct task_struct *p) -{ - return cputime_add(p->utime, p->stime); -} -static inline cputime_t virt_ticks(struct task_struct *p) -{ - return p->utime; -} - int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { int error = check_clock(which_clock); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 00d1fda..ec2e802 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -211,7 +211,7 @@ static int no_nsleep(const clockid_t which_clock, int flags, /* * Return nonzero if we know a priori this clockid_t value is bogus. 
*/ -static inline int invalid_clockid(const clockid_t which_clock) +int invalid_clockid(const clockid_t which_clock) { if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ return 0; diff --git a/kernel/signal.c b/kernel/signal.c index dbd7fe0..32dc1cd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -30,6 +30,12 @@ #include #define CREATE_TRACE_POINTS #include +#define CKPT_DFLAG CKPT_DSYS +#include +#include +#include +#include +#include #include #include @@ -1449,6 +1455,10 @@ int do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!task_ptrace(tsk) && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + /* restarting zombie doesn't notify parent */ + if (tsk->flags & PF_RESTARTING) + return ret; + info.si_signo = sig; info.si_errno = 0; /* @@ -2734,4 +2744,7 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); +#ifdef CONFIG_CHECKPOINT + checkpoint_register_signal(); +#endif } diff --git a/kernel/sys.c b/kernel/sys.c index 6d1a7e0..9a98d05 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -565,11 +565,12 @@ error: /* * change the user struct in a credentials set to match the new UID */ -static int set_user(struct cred *new) +int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current_user_ns(), new->uid); + /* is this ok? */ + new_user = alloc_uid(new->user->user_ns, new->uid); if (!new_user) return -EAGAIN; @@ -704,14 +705,12 @@ error: return retval; } - /* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). 
*/ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { - const struct cred *old; struct cred *new; int retval; @@ -719,45 +718,10 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) if (!new) return -ENOMEM; - retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); - if (retval) - goto error; - old = current_cred(); - - retval = -EPERM; - if (!capable(CAP_SETUID)) { - if (ruid != (uid_t) -1 && ruid != old->uid && - ruid != old->euid && ruid != old->suid) - goto error; - if (euid != (uid_t) -1 && euid != old->uid && - euid != old->euid && euid != old->suid) - goto error; - if (suid != (uid_t) -1 && suid != old->uid && - suid != old->euid && suid != old->suid) - goto error; - } - - if (ruid != (uid_t) -1) { - new->uid = ruid; - if (ruid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } - } - if (euid != (uid_t) -1) - new->euid = euid; - if (suid != (uid_t) -1) - new->suid = suid; - new->fsuid = new->euid; - - retval = security_task_fix_setuid(new, old, LSM_SETID_RES); - if (retval < 0) - goto error; - - return commit_creds(new); + retval = cred_setresuid(new, ruid, euid, suid); + if (retval == 0) + return commit_creds(new); -error: abort_creds(new); return retval; } @@ -779,43 +743,17 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u */ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { - const struct cred *old; struct cred *new; int retval; new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); - if (retval) - goto error; - - retval = -EPERM; - if (!capable(CAP_SETGID)) { - if (rgid != (gid_t) -1 && rgid != old->gid && - rgid != old->egid && rgid != old->sgid) - goto error; - if (egid != (gid_t) -1 && egid != old->gid && - egid != old->egid && egid != old->sgid) - goto error; - if (sgid != (gid_t) -1 && sgid != old->gid && - sgid != old->egid && sgid != old->sgid) 
- goto error; - } - - if (rgid != (gid_t) -1) - new->gid = rgid; - if (egid != (gid_t) -1) - new->egid = egid; - if (sgid != (gid_t) -1) - new->sgid = sgid; - new->fsgid = new->egid; - return commit_creds(new); + retval = cred_setresgid(new, rgid, egid, sgid); + if (retval == 0) + return commit_creds(new); -error: abort_creds(new); return retval; } @@ -832,7 +770,6 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u return retval; } - /* * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This * is used for "access()" and for the NFS daemon (letting nfsd stay at @@ -841,35 +778,20 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u */ SYSCALL_DEFINE1(setfsuid, uid_t, uid) { - const struct cred *old; struct cred *new; uid_t old_fsuid; + int retval; new = prepare_creds(); if (!new) return current_fsuid(); - old = current_cred(); - old_fsuid = old->fsuid; - - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) - goto error; - - if (uid == old->uid || uid == old->euid || - uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { - if (uid != old_fsuid) { - new->fsuid = uid; - if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) - goto change_okay; - } - } -error: - abort_creds(new); - return old_fsuid; + retval = cred_setfsuid(new, uid, &old_fsuid); + if (retval == 0) + commit_creds(new); + else + abort_creds(new); -change_okay: - commit_creds(new); return old_fsuid; } @@ -878,34 +800,20 @@ change_okay: */ SYSCALL_DEFINE1(setfsgid, gid_t, gid) { - const struct cred *old; struct cred *new; gid_t old_fsgid; + int retval; new = prepare_creds(); if (!new) return current_fsgid(); - old = current_cred(); - old_fsgid = old->fsgid; - - if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - goto error; - if (gid == old->gid || gid == old->egid || - gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { - if (gid != old_fsgid) { - new->fsgid 
= gid; - goto change_okay; - } - } - -error: - abort_creds(new); - return old_fsgid; + retval = cred_setfsgid(new, gid, &old_fsgid); + if (retval == 0) + commit_creds(new); + else + abort_creds(new); -change_okay: - commit_creds(new); return old_fsgid; } @@ -1303,40 +1211,39 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, #endif -SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +int do_setrlimit(unsigned int resource, struct rlimit *new_rlim) { - struct rlimit new_rlim, *old_rlim; + struct rlimit *old_rlim; int retval; if (resource >= RLIM_NLIMITS) return -EINVAL; - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) + if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; + old_rlim = current->signal->rlim + resource; - if ((new_rlim.rlim_max > old_rlim->rlim_max) && + if ((new_rlim->rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) + if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; - retval = security_task_setrlimit(resource, &new_rlim); + retval = security_task_setrlimit(resource, new_rlim); if (retval) return retval; - if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { /* * The caller is asking for an immediate RLIMIT_CPU * expiry. But we use the zero value to mean "it was * never set". 
So let's cheat and make it one second * instead */ - new_rlim.rlim_cur = 1; + new_rlim->rlim_cur = 1; } task_lock(current->group_leader); - *old_rlim = new_rlim; + *old_rlim = *new_rlim; task_unlock(current->group_leader); if (resource != RLIMIT_CPU) @@ -1348,14 +1255,25 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) * very long-standing error, and fixing it now risks breakage of * applications, so we live with it */ - if (new_rlim.rlim_cur == RLIM_INFINITY) + if (new_rlim->rlim_cur == RLIM_INFINITY) goto out; - update_rlimit_cpu(new_rlim.rlim_cur); + update_rlimit_cpu(new_rlim->rlim_cur); out: return 0; } +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + return do_setrlimit(resource, &new_rlim); +} + /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. 
After diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 70f2ea7..0206aca 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2); /* performance counters: */ cond_syscall(sys_perf_event_open); + +/* checkpoint/restart */ +cond_syscall(sys_checkpoint); +cond_syscall(sys_restart); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f..967fa2a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -926,6 +926,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/user.c b/kernel/user.c index 766467b..3c78366 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "cred-internals.h" struct user_namespace init_user_ns = { @@ -199,7 +200,11 @@ static int __init uid_cache_init(void) uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); spin_unlock_irq(&uidhash_lock); +#ifdef CONFIG_CHECKPOINT + return checkpoint_register_userns(); +#else return 0; +#endif } module_init(uid_cache_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 076c7c8..ca4790f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,17 +9,11 @@ #include #include #include +#include #include -/* - * Create a new user namespace, deriving the creator from the user in the - * passed credentials, and replacing that user with the new root user for the - * new namespace. - * - * This is called by copy_creds(), which will finish setting the target task's - * credentials. 
- */ -int create_user_ns(struct cred *new) +static struct user_namespace *_new_user_ns(struct user_struct *creator, + struct user_struct **newroot) { struct user_namespace *ns; struct user_struct *root_user; @@ -27,7 +21,7 @@ int create_user_ns(struct cred *new) ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); if (!ns) - return -ENOMEM; + return ERR_PTR(-ENOMEM); kref_init(&ns->kref); @@ -38,12 +32,43 @@ int create_user_ns(struct cred *new) root_user = alloc_uid(ns, 0); if (!root_user) { kfree(ns); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } /* set the new root user in the credentials under preparation */ - ns->creator = new->user; - new->user = root_user; + ns->creator = creator; + + /* alloc_uid() incremented the userns refcount. Just set it to 1 */ + kref_set(&ns->kref, 1); + + *newroot = root_user; + return ns; +} + +struct user_namespace *new_user_ns(struct user_struct *creator, + struct user_struct **newroot) +{ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + return _new_user_ns(creator, newroot); +} + +/* + * Create a new user namespace, deriving the creator from the user in the + * passed credentials, and replacing that user with the new root user for the + * new namespace. + * + * This is called by copy_creds(), which will finish setting the target task's + * credentials. + */ +int create_user_ns(struct cred *new) +{ + struct user_namespace *ns; + + ns = new_user_ns(new->user, &new->user); + if (IS_ERR(ns)) + return PTR_ERR(ns); + new->uid = new->euid = new->suid = new->fsuid = 0; new->gid = new->egid = new->sgid = new->fsgid = 0; put_group_info(new->group_info); @@ -54,9 +79,6 @@ int create_user_ns(struct cred *new) #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - /* alloc_uid() incremented the userns refcount. 
Just set it to 1 */ - kref_set(&ns->kref, 1); - return 0; } diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b..c82ed83 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,8 +14,9 @@ #include #include #include +#include -static struct uts_namespace *create_uts_ns(void) +struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e..41c837d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -14,6 +14,10 @@ #include #include +#define CKPT_DFLAG CKPT_DSYS +#include +#include + static void *get_uts(ctl_table *table, int write) { char *which = table->data; @@ -108,6 +112,9 @@ static struct ctl_table uts_root_table[] = { static int __init utsname_sysctl_init(void) { register_sysctl_table(uts_root_table); +#ifdef CONFIG_CHECKPOINT + checkpoint_register_utsname(); +#endif return 0; } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 935248b..75d413e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1086,6 +1086,19 @@ config DMA_API_DEBUG This option causes a performance degredation. Use only if you want to debug device drivers. If unsure, say N. +config CHECKPOINT_DEBUG + bool "Checkpoint/restart debugging (EXPERIMENTAL)" + depends on CHECKPOINT + default y + help + This option turns on the debugging output of checkpoint/restart. + The level of verbosity is controlled by 'ckpt_debug_level' and can + be set at boot time with "ckpt_debug=" option. + + Turning this option off will reduce the size of the c/r code. If + turned on, it is unlikely to incur visible overhead if the debug + level is set to zero. 
+ source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/mm/Makefile b/mm/Makefile index 6c2a73a..e779b69 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -38,6 +38,7 @@ obj-y += percpu.o else obj-y += percpu_up.o endif +obj-$(CONFIG_CHECKPOINT) += checkpoint.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/filemap.c b/mm/filemap.c index 140ebda..d59417a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -34,6 +34,7 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ +#include #include "internal.h" /* diff --git a/mm/memory.c b/mm/memory.c index 833952d..21de72d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1314,8 +1314,17 @@ bad_page: no_page: pte_unmap_unlock(ptep, ptl); - if (!pte_none(pte)) + if (!pte_none(pte)) { + /* + * When checkpointing we only care about dirty pages. + * If a file-backed page is missing, then return an + * error to tell __get_dirty_page() that it's clean, + * so it won't try to demand page it into memory. + */ + if ((flags & FOLL_DIRTY) && pte_file(pte)) + page = ERR_PTR(-EFAULT); return page; + } no_page_table: /* @@ -1329,6 +1338,16 @@ no_page_table: if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); + + /* + * When checkpointing we only care about dirty pages. If there + * is no page table for a non-anonymous page, we return an + * error to tell __get_dirty_page() that the page is clean, so + * it won't allocate page tables and the page unnecessarily. 
+ */ + if ((flags & FOLL_DIRTY) && vma->vm_ops) + return ERR_PTR(-EFAULT); + return page; } @@ -1586,6 +1605,80 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, return NULL; } +/** + * __get_dirty_page - return page pointer for dirty user page + * @vma - target vma + * @addr - page address + * + * Looks up the page that corresponds to the address in the vma, and + * returns the page if it was modified (and grabs a reference to it), + * or otherwise returns NULL or error. + * + * Should only be called for private vma. + * Must be called with mmap_sem held for read or write. + */ +struct page *__get_dirty_page(struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + + BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)); + + /* + * FOLL_DUMP tells follow_page() to return -EFAULT for either + * non-present anonymous pages, or memory "holes". + * FOLL_DIRTY tells follow_page() to return -EFAULT also for + * non-present file-mapped pages. + * Otherwise, follow_page() returns the page, or NULL if the + * page is swapped out. + */ + + cond_resched(); + while (!(page = follow_page(vma, addr, + FOLL_GET | FOLL_DUMP | FOLL_DIRTY))) { + int ret; + + /* the page is swapped out - bring it in (optimize ?) */ + ret = handle_mm_fault(vma->vm_mm, vma, addr, 0); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return ERR_PTR(-ENOMEM); + else if (ret & VM_FAULT_SIGBUS) + return ERR_PTR(-EFAULT); + else + BUG(); + break; + } + cond_resched(); + } + + /* -EFAULT means that the page is clean (see above) */ + if (PTR_ERR(page) == -EFAULT) + return NULL; + else if (IS_ERR(page)) + return page; + + /* + * Only care about dirty pages: either anonymous non-zero pages, + * or file-backed COW (copy-on-write) pages that were modified. + * A clean COW page is not interesting because its contents are + * identical to the backing file; ignore such pages. 
+ * A file-backed broken COW is identified by its page_mapping() + * being unset (NULL) because the page will no longer be mapped + * to the original file after having been modified. + */ + if (is_zero_pfn(page_to_pfn(page))) { + /* this is the zero page: ignore */ + page_cache_release(page); + page = NULL; + } else if (vma->vm_file && (page_mapping(page) != NULL)) { + /* file backed clean cow: ignore */ + page_cache_release(page); + page = NULL; + } + + return page; +} + /* * This is the old fallback for page remapping. * diff --git a/mm/mmap.c b/mm/mmap.c index f90ea92..a13d645 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -2009,14 +2010,11 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap_nocheck(struct mm_struct *mm, unsigned long start, size_t len) { unsigned long end; struct vm_area_struct *vma, *prev, *last; - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; - if ((len = PAGE_ALIGN(len)) == 0) return -EINVAL; @@ -2090,8 +2088,39 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + return do_munmap_nocheck(mm, start, len); +} + EXPORT_SYMBOL(do_munmap); +/* + * called with mm->mmap_sem held + * only called from checkpoint/memory.c:restore_mm() + */ +int destroy_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vmnext = mm->mmap; + struct vm_area_struct *vma; + int ret; + + while (vmnext) { + vma = vmnext; + vmnext = vmnext->vm_next; + ret = do_munmap_nocheck(mm, vma->vm_start, + vma->vm_end-vma->vm_start); + if (ret < 0) { + pr_warning("%s: failed munmap (%d)\n", __func__, ret); + 
return ret; + } + } + return 0; +} + SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; @@ -2248,7 +2277,7 @@ void exit_mmap(struct mm_struct *mm) tlb = tlb_gather_mmu(mm, 1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + end = vma ? unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL) : 0; vm_unacct_memory(nr_accounted); free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); diff --git a/mm/shmem.c b/mm/shmem.c index eef4ebe..df30acc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -29,6 +29,7 @@ #include #include #include +#include static struct vfsmount *shm_mnt; @@ -98,14 +99,6 @@ static struct vfsmount *shm_mnt; /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 -/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ -enum sgp_type { - SGP_READ, /* don't exceed i_size, don't allocate page */ - SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ - SGP_WRITE, /* may exceed i_size, may allocate page */ -}; - #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -118,9 +111,6 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type); - static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) { /* @@ -1213,8 +1203,8 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) * vm. 
If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type) +int shmem_getpage(struct inode *inode, unsigned long idx, + struct page **pagep, enum sgp_type sgp, int *type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); diff --git a/net/Kconfig b/net/Kconfig index 041c35e..c1cb774 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -276,4 +276,8 @@ source "net/wimax/Kconfig" source "net/rfkill/Kconfig" source "net/9p/Kconfig" +config NETNS_CHECKPOINT + bool + default y if NET && NET_NS && CHECKPOINT + endif # if NET diff --git a/net/Makefile b/net/Makefile index 1542e72..b7d78f4 100644 --- a/net/Makefile +++ b/net/Makefile @@ -65,3 +65,6 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o endif obj-$(CONFIG_WIMAX) += wimax/ + +obj-$(CONFIG_CHECKPOINT) += checkpoint.o +obj-$(CONFIG_NETNS_CHECKPOINT) += checkpoint_dev.o diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 80ff87c..c00d8ce 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +obj-$(CONFIG_CHECKPOINT) += checkpoint.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f713574..8b7d3dd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -876,6 +876,9 @@ const struct proto_ops inet_stream_ops = { .mmap = sock_no_mmap, .sendpage = tcp_sendpage, .splice_read = tcp_splice_read, + .checkpoint = inet_checkpoint, + .restore = inet_restore, + .collect = inet_collect, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = 
compat_sock_common_getsockopt, @@ -902,6 +905,9 @@ const struct proto_ops inet_dgram_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, + .checkpoint = inet_checkpoint, + .restore = inet_restore, + .collect = inet_collect, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5abae10..4105bfe 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1089,6 +1089,9 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_start_xmit = ipip6_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_change_mtu = ipip6_tunnel_change_mtu, +#ifdef CONFIG_NETNS_CHECKPOINT + .ndo_checkpoint = ipip6_checkpoint, +#endif }; static void ipip6_tunnel_setup(struct net_device *dev) diff --git a/net/socket.c b/net/socket.c index 5e8d0af..c5be3a4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -343,7 +343,7 @@ static const struct dentry_operations sockfs_dentry_operations = { * but we take care of internal coherence yet. 
*/ -static int sock_alloc_file(struct socket *sock, struct file **f, int flags) +int sock_alloc_file(struct socket *sock, struct file **f, int flags) { struct qstr name = { .name = "" }; struct path path; @@ -1422,15 +1422,10 @@ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address); - if (err >= 0) { - err = security_socket_bind(sock, - (struct sockaddr *)&address, - addrlen); - if (!err) - err = sock->ops->bind(sock, - (struct sockaddr *) - &address, addrlen); - } + if (err >= 0) + err = sock_bind(sock, + (struct sockaddr *)&address, + addrlen); fput_light(sock->file, fput_needed); } return err; @@ -1609,11 +1604,7 @@ SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, if (!sock) goto out; - err = security_socket_getsockname(sock); - if (err) - goto out_put; - - err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0); + err = sock_getname(sock, (struct sockaddr *)&address, &len); if (err) goto out_put; err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr, usockaddr_len); @@ -1638,15 +1629,7 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { - err = security_socket_getpeername(sock); - if (err) { - fput_light(sock->file, fput_needed); - return err; - } - - err = - sock->ops->getname(sock, (struct sockaddr *)&address, &len, - 1); + err = sock_getpeer(sock, (struct sockaddr *)&address, &len); if (!err) err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr, usockaddr_len); diff --git a/net/unix/Makefile b/net/unix/Makefile index b852a2b..fbff1e6 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_UNIX) += unix.o unix-y := af_unix.o garbage.o unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o +unix-$(CONFIG_CHECKPOINT) += 
checkpoint.o diff --git a/security/capability.c b/security/capability.c index 4875142..0876984 100644 --- a/security/capability.c +++ b/security/capability.c @@ -852,6 +852,7 @@ static int cap_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { return 0; } + #ifdef CONFIG_KEYS static int cap_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) diff --git a/security/commoncap.c b/security/commoncap.c index 6166973..532b971 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -828,24 +828,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, * capability-based-privilege environment. */ case PR_SET_SECUREBITS: - error = -EPERM; - if ((((new->securebits & SECURE_ALL_LOCKS) >> 1) - & (new->securebits ^ arg2)) /*[1]*/ - || ((new->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ - || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ - || (cap_capable(current, current_cred(), CAP_SETPCAP, - SECURITY_CAP_AUDIT) != 0) /*[4]*/ - /* - * [1] no changing of bits that are locked - * [2] no unlocking of locks - * [3] no setting of unsupported bits - * [4] doing anything requires privilege (go read about - * the "sendmail capabilities bug") - */ - ) - /* cannot change a locked bit */ + error = apply_securebits(arg2, new); + if (error) goto error; - new->securebits = arg2; goto changed; case PR_GET_SECUREBITS: diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 8b32e95..b1cde03 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -24,7 +24,7 @@ struct security_class_mapping secclass_map[] = { "getattr", "setexec", "setfscreate", "noatsecure", "siginh", "setrlimit", "rlimitinh", "dyntransition", "setcurrent", "execmem", "execstack", "execheap", "setkeycreate", - "setsockcreate", NULL } }, + "setsockcreate", "restore", NULL } }, { "system", { "ipc_info", "syslog_read", "syslog_mod", "syslog_console", "module_request", NULL } }, @@ -43,7 
+43,8 @@ struct security_class_mapping secclass_map[] = { "quotaget", NULL } }, { "file", { COMMON_FILE_PERMS, - "execute_no_trans", "entrypoint", "execmod", "open", NULL } }, + "execute_no_trans", "entrypoint", "execmod", "open", + "restore", "fown_restore", NULL } }, { "dir", { COMMON_FILE_PERMS, "add_name", "remove_name", "reparent", "search", "rmdir", "open", NULL } }, @@ -93,13 +94,13 @@ struct security_class_mapping secclass_map[] = { } }, { "sem", { COMMON_IPC_PERMS, NULL } }, - { "msg", { "send", "receive", NULL } }, + { "msg", { "send", "receive", "restore", NULL } }, { "msgq", { COMMON_IPC_PERMS, "enqueue", NULL } }, { "shm", { COMMON_IPC_PERMS, "lock", NULL } }, { "ipc", - { COMMON_IPC_PERMS, NULL } }, + { COMMON_IPC_PERMS, "restore", NULL } }, { "netlink_route_socket", { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", NULL } }, diff --git a/security/smack/smack.h b/security/smack/smack.h index c6e9aca..a8917b0 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -216,6 +216,7 @@ u32 smack_to_secid(const char *); extern int smack_cipso_direct; extern char *smack_net_ambient; extern char *smack_onlycap; +extern char *smack_version; extern const char *smack_cipso_option; extern struct smack_known smack_known_floor; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fdfeaa2..501e66a 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3119,6 +3119,7 @@ struct security_operations smack_ops = { .file_receive = smack_file_receive, .cred_alloc_blank = smack_cred_alloc_blank, + .cred_free = smack_cred_free, .cred_prepare = smack_cred_prepare, .cred_commit = smack_cred_commit, diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index a2b72d7..cc3046b 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -1256,6 +1256,7 @@ static const struct file_operations smk_logging_ops = { .read = smk_read_logging, .write = smk_write_logging, }; + /** * smk_fill_super - fill the 
/smackfs superblock * @sb: the empty superblock -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/