Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757206AbXJCWao (ORCPT ); Wed, 3 Oct 2007 18:30:44 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1752929AbXJCWaf (ORCPT ); Wed, 3 Oct 2007 18:30:35 -0400 Received: from e6.ny.us.ibm.com ([32.97.182.146]:53650 "EHLO e6.ny.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752357AbXJCWad (ORCPT ); Wed, 3 Oct 2007 18:30:33 -0400 Date: Wed, 3 Oct 2007 17:30:28 -0500 From: "Serge E. Hallyn" To: lkml , Chris Wright , Andrew Morgan , linux-security-module@vger.kernel.org Subject: [PATCH 2/2 -mm] capabilities: introduce per-process capability bounding set (v4) Message-ID: <20071003223028.GA2544@sergelap.austin.ibm.com> References: <20071003222746.GA1562@sergelap.austin.ibm.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20071003222746.GA1562@sergelap.austin.ibm.com> User-Agent: Mutt/1.5.16 (2007-06-09) Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 17741 Lines: 599 >From d93ecb90d82f9e2b7f48c74f5e6ed97cac3683c7 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Fri, 28 Sep 2007 10:33:56 -0500 Subject: [PATCH 2/2 -mm] capabilities: introduce per-process capability bounding set (v4) The capability bounding set is a set beyond which capabilities cannot grow. Currently cap_bset is per-system. It can be manipulated through sysctl, but only init can add capabilities. Root can remove capabilities. By default it includes all caps except CAP_SETPCAP. This patch makes the bounding set per-process. It is inherited at fork from parent. No one can add elements. CAP_SYS_ADMIN is required to remove them. 
The reason for that is to stop an unprivileged user from removing key capabilities, then running a setuid root binary (or one with file capabilities) which assumes it got all, not some, of the capabilities it needs and fails unsafely partway through. Perhaps a new capability should be introduced rather than using CAP_SYS_ADMIN. Or, since I don't have a *concrete* example of an exploit, perhaps any user should be able to drop capabilities from his capbset. One example use of this is to start a safer container. For instance, until device namespaces or per-container device whitelists are introduced, it is probably wise to take CAP_MKNOD away from a container. The following hacky test program will get and set the bounding set. For instance ./bset get (lists capabilities in bset) ./bset strset cap_sys_admin (starts shell with new bset) (use capset, setuid binary, or binary with file capabilities to try to increase caps) =========================================================== bset.c: =========================================================== unsigned long newval; int cmd_getbcap; char *captable[] = { "cap_dac_override", "cap_dac_read_search", "cap_fowner", "cap_fsetid", "cap_kill", "cap_setgid", "cap_setuid", "cap_setpcap", "cap_linux_immutable", "cap_net_bind_service", "cap_net_broadcast", "cap_net_admin", "cap_net_raw", "cap_ipc_lock", "cap_ipc_owner", "cap_sys_module", "cap_sys_rawio", "cap_sys_chroot", "cap_sys_ptrace", "cap_sys_pacct", "cap_sys_admin", "cap_sys_boot", "cap_sys_nice", "cap_sys_resource", "cap_sys_time", "cap_sys_tty_config", "cap_mknod", "cap_lease", "cap_audit_write", "cap_audit_control", "cap_setfcap"}; char *inttocap(unsigned long v) { char *str = NULL; int i; str = malloc(1); str[0] = '\0'; for (i=0; i<31; i++) { if (v & (1 << (i+1))) { char *tmp = captable[i]; str = realloc(str, strlen(str)+2+strlen(tmp)); sprintf(str+strlen(str), ",%s", tmp); } } return str; } int getbcap(void) { unsigned long bcap; int ret; unsigned int ver; ret = 
prctl(PR_GET_CAPBSET, &ver, &bcap); if (ret == -1) perror("prctl"); if (ver != _LINUX_CAPABILITY_VERSION) printf("wrong capability version: %lu not %lu\n", ver, _LINUX_CAPABILITY_VERSION); printf("prctl get_bcap returned %lu (ret %d)\n", bcap, ret); printf("that is %s\n", inttocap(bcap)); return ret; } int setbcap(unsigned long val) { int ret; ret = prctl(PR_SET_CAPBSET, _LINUX_CAPABILITY_VERSION, val); return ret; } int usage(char *me) { printf("Usage: %s get\n", me); printf(" %s set \n", me); printf(" %s strset capability_string\n", me); printf(" capability_string is for instance:\n"); printf(" cap_sys_admin,cap_mknod,cap_dac_override\n"); return 1; } unsigned long captoint(char *cap) { if (strcmp(cap, "cap_dac_override") == 0) return 1; else if (strcmp(cap, "cap_dac_read_search") == 0) return 2; else if (strcmp(cap, "cap_fowner") == 0) return 3; else if (strcmp(cap, "cap_fsetid") == 0) return 4; else if (strcmp(cap, "cap_kill") == 0) return 5; else if (strcmp(cap, "cap_setgid") == 0) return 6; else if (strcmp(cap, "cap_setuid") == 0) return 7; else if (strcmp(cap, "cap_setpcap") == 0) return 8; else if (strcmp(cap, "cap_linux_immutable") == 0) return 9; else if (strcmp(cap, "cap_net_bind_service") == 0) return 10; else if (strcmp(cap, "cap_net_broadcast") == 0) return 11; else if (strcmp(cap, "cap_net_admin") == 0) return 12; else if (strcmp(cap, "cap_net_raw") == 0) return 13; else if (strcmp(cap, "cap_ipc_lock") == 0) return 14; else if (strcmp(cap, "cap_ipc_owner") == 0) return 15; else if (strcmp(cap, "cap_sys_module") == 0) return 16; else if (strcmp(cap, "cap_sys_rawio") == 0) return 17; else if (strcmp(cap, "cap_sys_chroot") == 0) return 18; else if (strcmp(cap, "cap_sys_ptrace") == 0) return 19; else if (strcmp(cap, "cap_sys_pacct") == 0) return 20; else if (strcmp(cap, "cap_sys_admin") == 0) return 21; else if (strcmp(cap, "cap_sys_boot") == 0) return 22; else if (strcmp(cap, "cap_sys_nice") == 0) return 23; else if (strcmp(cap, "cap_sys_resource") == 
0) return 24; else if (strcmp(cap, "cap_sys_time") == 0) return 25; else if (strcmp(cap, "cap_sys_tty_config") == 0) return 26; else if (strcmp(cap, "cap_mknod") == 0) return 27; else if (strcmp(cap, "cap_lease") == 0) return 28; else if (strcmp(cap, "cap_audit_write") == 0) return 29; else if (strcmp(cap, "cap_audit_control") == 0) return 30; else if (strcmp(cap, "cap_setfcap") == 0) return 31; } unsigned long parse_cap_string(char *capstring) { unsigned long tmp, newval = 0; char *token = strtok(capstring, ","); while (token) { tmp = captoint(token); if (tmp < 0) return -1; newval |= 1< capabilities are subsets of the new bounding set. Return -EINVAL if no capabilities. As suggested by Andrew Morgan, send the capability version along with the bset for prctl(PR_SET_CAPBSET) and prctl(PR_GET_CAPBSET) Signed-off-by: Serge Hallyn --- include/linux/capability.h | 24 +++++++++++++++++++++++- include/linux/init_task.h | 1 + include/linux/prctl.h | 4 ++++ include/linux/sched.h | 2 +- include/linux/security.h | 5 ----- include/linux/sysctl.h | 3 --- kernel/fork.c | 1 + kernel/sys.c | 18 +++++++++++++++++- kernel/sysctl.c | 35 ----------------------------------- kernel/sysctl_check.c | 7 ------- security/commoncap.c | 37 +++++++++++++++++++++++++++++++++---- 11 files changed, 80 insertions(+), 57 deletions(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index 7a8d7ad..d853b0d 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -197,7 +197,6 @@ typedef __u32 kernel_cap_t; #define CAP_IPC_OWNER 15 /* Insert and remove kernel modules - modify kernel without limit */ -/* Modify cap_bset */ #define CAP_SYS_MODULE 16 /* Allow ioperm/iopl access */ @@ -254,6 +253,7 @@ typedef __u32 kernel_cap_t; arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ +/* Allow taking bits out of capability bounding set */ #define CAP_SYS_ADMIN 21 @@ -332,6 +332,17 @@ typedef __u32 
kernel_cap_t; #define CAP_INIT_EFF_SET to_cap_t(~0 & ~CAP_TO_MASK(CAP_SETPCAP)) #define CAP_INIT_INH_SET to_cap_t(0) +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +/* + * Because of the reduced scope of CAP_SETPCAP when filesystem + * capabilities are in effect, it is safe to allow this capability to + * be available in the default configuration. + */ +# define CAP_INIT_BSET CAP_FULL_SET +#else +# define CAP_INIT_BSET CAP_INIT_EFF_SET +#endif + #define CAP_TO_MASK(x) (1 << (x)) #define cap_raise(c, flag) (cap_t(c) |= CAP_TO_MASK(flag)) #define cap_lower(c, flag) (cap_t(c) &= ~CAP_TO_MASK(flag)) @@ -377,6 +388,17 @@ static inline kernel_cap_t cap_invert(kernel_cap_t c) int capable(int cap); int __capable(struct task_struct *t, int cap); +#ifdef CONFIG_COMMONCAP +extern int cap_prctl_setbset(unsigned long new_bset); +extern int cap_prctl_getbset(unsigned long *bset); +#else +#include +static inline int cap_prctl_setbset(unsigned long new_bset) +{ return -EINVAL; } +static inline int cap_prctl_getbset(unsigned long *bset) +{ return -EINVAL; } +#endif + #endif /* __KERNEL__ */ #endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index cae35b6..5c84d14 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -147,6 +147,7 @@ extern struct group_info init_groups; .cap_effective = CAP_INIT_EFF_SET, \ .cap_inheritable = CAP_INIT_INH_SET, \ .cap_permitted = CAP_FULL_SET, \ + .cap_bset = CAP_INIT_BSET, \ .keep_capabilities = 0, \ .user = INIT_USER, \ .comm = "swapper", \ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index e2eff90..a7de023 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -63,4 +63,8 @@ #define PR_GET_SECCOMP 21 #define PR_SET_SECCOMP 22 +/* Get/set the capability bounding set */ +#define PR_GET_CAPBSET 23 +#define PR_SET_CAPBSET 24 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f21af1..7c21341 100644 --- 
a/include/linux/sched.h +++ b/include/linux/sched.h @@ -973,7 +973,7 @@ struct task_struct { uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; struct group_info *group_info; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; unsigned keep_capabilities:1; struct user_struct *user; #ifdef CONFIG_KEYS diff --git a/include/linux/security.h b/include/linux/security.h index ff3f857..e2d2f06 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -34,11 +34,6 @@ #include #include -/* - * Bounding set - */ -extern kernel_cap_t cap_bset; - extern unsigned securebits; struct ctl_table; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index e99171f..3771782 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -103,7 +103,6 @@ enum KERN_NODENAME=7, KERN_DOMAINNAME=8, - KERN_CAP_BSET=14, /* int: capability bounding set */ KERN_PANIC=15, /* int: panic timeout */ KERN_REALROOTDEV=16, /* real root device to mount after initrd */ @@ -968,8 +967,6 @@ extern int proc_dostring(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int proc_dointvec(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -extern int proc_dointvec_bset(struct ctl_table *, int, struct file *, - void __user *, size_t *, loff_t *); extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *, diff --git a/kernel/fork.c b/kernel/fork.c index f85731a..df13a01 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1082,6 +1082,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_SECURITY p->security = NULL; #endif + p->cap_bset = current->cap_bset; p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); diff --git a/kernel/sys.c b/kernel/sys.c index 796299c..cff511f 100644 --- 
a/kernel/sys.c +++ b/kernel/sys.c @@ -1739,7 +1739,23 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2); break; - + case PR_GET_CAPBSET: { + unsigned long bset; + error = cap_prctl_getbset(&bset); + if (error) + return error; + error = put_user(_LINUX_CAPABILITY_VERSION, + (__u32 __user *)arg2); + if (error) + break; + error = put_user(bset, (unsigned long __user *)arg3); + return error; + } + case PR_SET_CAPBSET: + if (arg2 != _LINUX_CAPABILITY_VERSION) + return -EINVAL; + error = cap_prctl_setbset(arg3); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b11d22b..7bff6a2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -363,15 +363,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec_taint, }, #endif -#ifdef CONFIG_SECURITY_CAPABILITIES - { - .procname = "cap-bound", - .data = &cap_bset, - .maxlen = sizeof(kernel_cap_t), - .mode = 0600, - .proc_handler = &proc_dointvec_bset, - }, -#endif /* def CONFIG_SECURITY_CAPABILITIES */ #ifdef CONFIG_BLK_DEV_INITRD { .ctl_name = KERN_REALROOTDEV, @@ -1882,26 +1873,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, return 0; } -#ifdef CONFIG_SECURITY_CAPABILITIES -/* - * init may raise the set. - */ - -int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int op; - - if (write && !capable(CAP_SYS_MODULE)) { - return -EPERM; - } - - op = is_global_init(current) ? 
OP_SET : OP_AND; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); -} -#endif /* def CONFIG_SECURITY_CAPABILITIES */ - /* * Taint values can only be increased */ @@ -2315,12 +2286,6 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, return -ENOSYS; } -int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 3c9ef5a..41c7f16 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -38,10 +38,6 @@ static struct trans_ctl_table trans_kern_table[] = { { KERN_NODENAME, "hostname" }, { KERN_DOMAINNAME, "domainname" }, -#ifdef CONFIG_SECURITY_CAPABILITIES - { KERN_CAP_BSET, "cap-bound" }, -#endif /* def CONFIG_SECURITY_CAPABILITIES */ - { KERN_PANIC, "panic" }, { KERN_REALROOTDEV, "real-root-dev" }, @@ -1535,9 +1531,6 @@ int sysctl_check_table(struct ctl_table *table) (table->strategy == sysctl_ms_jiffies) || (table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || -#ifdef CONFIG_SECURITY_CAPABILITIES - (table->proc_handler == proc_dointvec_bset) || -#endif /* def CONFIG_SECURITY_CAPABILITIES */ (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || diff --git a/security/commoncap.c b/security/commoncap.c index 43f9027..5be15bb 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -36,9 +36,6 @@ # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ -kernel_cap_t cap_bset = CAP_INIT_BSET; /* systemwide capability bound */ -EXPORT_SYMBOL(cap_bset); - /* Global security state */ unsigned securebits = SECUREBITS_DEFAULT; /* systemwide 
security settings */ @@ -307,7 +304,8 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) /* Derived from fs/exec.c:compute_creds. */ kernel_cap_t new_permitted, working; - new_permitted = cap_intersect (bprm->cap_permitted, cap_bset); + new_permitted = cap_intersect (bprm->cap_permitted, + current->cap_bset); working = cap_intersect (bprm->cap_inheritable, current->cap_inheritable); new_permitted = cap_combine (new_permitted, working); @@ -580,3 +578,34 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages) return __vm_enough_memory(mm, pages, cap_sys_admin); } +/* + * cap_prctl_setbset currently requires CAP_SYS_ADMIN. The reason is + * my fear that an ordinary user could selectively take capabilities + * out, then run a setuid root binary or binary with file capabilities, + * which would perform part of a dangerous action with CAP_SOMECAP1, + * then fail to perform the second part of the action because + * CAP_SOMECAP2 is not in bset, leaving things in an unsafe state, + * i.e a sensitive file owned by the non-root user because CAP_CHOWN + * was not allowed. + */ +int cap_prctl_setbset(unsigned long new_bset) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!cap_issubset(new_bset, current->cap_bset)) + return -EPERM; + current->cap_bset = new_bset; + current->cap_effective = cap_intersect(current->cap_effective, + new_bset); + current->cap_permitted = cap_intersect(current->cap_permitted, + new_bset); + current->cap_inheritable = cap_intersect(current->cap_inheritable, + new_bset); + return 0; +} + +int cap_prctl_getbset(unsigned long *bset) +{ + *bset = current->cap_bset; + return 0; +} -- 1.5.1.1.GIT - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/