Root in a user ns cannot be trusted to write a traditional
security.capability xattr. If it were allowed to do so, then any
unprivileged user on the host could map his own uid to root in a
namespace, write the xattr, and execute the file with privilege on the
host.
This patch introduces v3 of the security.capability xattr. It builds a
vfs_ns_cap_data struct by appending a uid_t rootid to struct
vfs_cap_data. This is the absolute uid_t (i.e. the uid_t in
init_user_ns) of the root id (uid 0 in a namespace) in whose namespaces
the file capabilities may take effect.
When a task in a user ns (which is privileged with CAP_SETFCAP toward
that user_ns) asks to write v2 security.capability, the kernel will
transparently rewrite the xattr as a v3 with the appropriate rootid.
Subsequently, any task executing the file which has the noted kuid as
its root uid, or which is in a descendent user_ns of such a user_ns,
will run the file with capabilities.
If a task writes a v3 security.capability, then it can provide a
uid (valid within its own user namespace, over which it has CAP_SETFCAP)
for the xattr. The kernel will translate that to the absolute uid, and
write that to disk. After this, a task in the writer's namespace will
not be able to use those capabilities, but a task in a namespace where
the given uid is root will.
Only a single security.capability xattr may be written. A task may
overwrite the existing one so long as it was written by a user mapped
into his own user_ns over which he has CAP_SETFCAP.
This allows a simple setxattr to work, allows tar/untar to work, and
allows us to tar in one namespace and untar in another while preserving
the capability, without risking leaking privilege into a parent
namespace.
Changelog:
Nov 02 2016: fix invalid check at refuse_fcap_overwrite()
Nov 07 2016: convert rootid from and to fs user_ns
---
fs/xattr.c | 27 +++-
include/linux/capability.h | 5 +-
include/linux/security.h | 2 +
include/uapi/linux/capability.h | 22 ++-
security/commoncap.c | 335 ++++++++++++++++++++++++++++++++++++++--
5 files changed, 366 insertions(+), 25 deletions(-)
diff --git a/fs/xattr.c b/fs/xattr.c
index 2d13b4e..e9e70f1 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -171,11 +171,27 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
{
struct inode *inode = dentry->d_inode;
int error = -EAGAIN;
+ void *wvalue = NULL;
+ size_t wsize = 0;
int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
- if (issec)
+ if (issec) {
inode->i_flags &= ~S_NOSEC;
+
+ /* if root in a non-init user_ns tries to set
+ * security.capability, write the virtualized
+ * xattr in its place */
+ if (!strcmp(name, "security.capability") &&
+ current_user_ns() != &init_user_ns) {
+ cap_setxattr_make_nscap(dentry, value, size, &wvalue, &wsize);
+ if (!wvalue)
+ return -EPERM;
+ value = wvalue;
+ size = wsize;
+ }
+ }
+
if (inode->i_opflags & IOP_XATTR) {
error = __vfs_setxattr(dentry, inode, name, value, size, flags);
if (!error) {
@@ -184,8 +200,10 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
size, flags);
}
} else {
- if (unlikely(is_bad_inode(inode)))
- return -EIO;
+ if (unlikely(is_bad_inode(inode))) {
+ error = -EIO;
+ goto out;
+ }
}
if (error == -EAGAIN) {
error = -EOPNOTSUPP;
@@ -200,10 +218,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
}
}
+out:
+ kfree(wvalue);
return error;
}
-
int
vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
diff --git a/include/linux/capability.h b/include/linux/capability.h
index dbc21c7..edd5be1 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,7 +13,7 @@
#define _LINUX_CAPABILITY_H
#include <uapi/linux/capability.h>
-
+#include <linux/uidgid.h>
#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
@@ -246,4 +246,7 @@ extern bool file_ns_capable(const struct file *file, struct user_namespace *ns,
/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
+extern void cap_setxattr_make_nscap(struct dentry *dentry, const void *value,
+ size_t size, void **wvalue, size_t *wsize);
+
#endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index c2125e9..3127531 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -86,6 +86,8 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
extern int cap_inode_need_killpriv(struct dentry *dentry);
extern int cap_inode_killpriv(struct dentry *dentry);
+extern int cap_inode_getsecurity(struct inode *inode, const char *name,
+ void **buffer, bool alloc);
extern int cap_mmap_addr(unsigned long addr);
extern int cap_mmap_file(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 49bc062..fd4f87d 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -60,9 +60,13 @@ typedef struct __user_cap_data_struct {
#define VFS_CAP_U32_2 2
#define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
-#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
-#define VFS_CAP_U32 VFS_CAP_U32_2
-#define VFS_CAP_REVISION VFS_CAP_REVISION_2
+#define VFS_CAP_REVISION_3 0x03000000
+#define VFS_CAP_U32_3 2
+#define XATTR_CAPS_SZ_3 (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3))
+
+#define XATTR_CAPS_SZ XATTR_CAPS_SZ_3
+#define VFS_CAP_U32 VFS_CAP_U32_3
+#define VFS_CAP_REVISION VFS_CAP_REVISION_3
struct vfs_cap_data {
__le32 magic_etc; /* Little endian */
@@ -72,6 +76,18 @@ struct vfs_cap_data {
} data[VFS_CAP_U32];
};
+/*
+ * same as vfs_cap_data but with a rootid at the end
+ */
+struct vfs_ns_cap_data {
+ __le32 magic_etc;
+ struct {
+ __le32 permitted; /* Little endian */
+ __le32 inheritable; /* Little endian */
+ } data[VFS_CAP_U32];
+ __le32 rootid;
+};
+
#ifndef __KERNEL__
/*
diff --git a/security/commoncap.c b/security/commoncap.c
index 8df676f..1f189b2 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -332,6 +332,272 @@ int cap_inode_killpriv(struct dentry *dentry)
return error;
}
+static bool rootid_owns_currentns(kuid_t kroot)
+{
+ struct user_namespace *ns;
+
+ if (!uid_valid(kroot))
+ return false;
+
+ for (ns = current_user_ns(); ; ns = ns->parent) {
+ if (from_kuid(ns, kroot) == 0) {
+ return true;
+ }
+ if (ns == &init_user_ns)
+ break;
+ }
+
+ return false;
+}
+
+static char *cap_convert_v2_v3(char *buf, struct inode *inode)
+{
+ char *ret;
+ struct vfs_ns_cap_data *v3;
+ struct vfs_cap_data *v2 = (struct vfs_cap_data *)buf;
+ kuid_t krootid;
+
+ krootid = make_kuid(inode->i_sb->s_user_ns, 0);
+ if (!uid_valid(krootid)) {
+ ret = ERR_PTR(-EPERM);
+ goto out;
+ }
+ ret = kmalloc(sizeof(struct vfs_ns_cap_data), GFP_NOFS);
+ if (!ret) {
+ ret = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ v3 = (struct vfs_ns_cap_data *)ret;
+
+ memcpy(&v3->data, &v2->data, sizeof(v2->data));
+ v3->magic_etc = VFS_CAP_REVISION_3;
+ if (v2->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
+ v3->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE;
+ v3->rootid = from_kuid(&init_user_ns, krootid);
+
+out:
+ kfree(buf);
+ return ret;
+}
+
+/*
+ * getsecurity: We are called for security.* before any attempt to read the
+ * xattr from the inode itself.
+ *
+ * This gives us a chance to read the on-disk value and convert it. If we
+ * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
+ *
+ * Note we are not called by vfs_getxattr_alloc(), but that is only called
+ * by the integrity subsystem, which really wants the unconverted values -
+ * so that's good.
+ */
+int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
+ bool alloc)
+{
+ int size, ret;
+ kuid_t kroot;
+ uid_t root, mappedroot;
+ char *tmpbuf = NULL;
+ struct vfs_ns_cap_data *nscap;
+ struct dentry *dentry;
+ struct user_namespace *fs_ns;
+
+ if (!inode->i_op->getxattr)
+ return -EOPNOTSUPP;
+
+ if (strcmp(name, "capability") != 0)
+ return -EOPNOTSUPP;
+
+ dentry = d_find_alias(inode);
+ if (!dentry)
+ return -EINVAL;
+
+ size = sizeof(struct vfs_ns_cap_data);
+ ret = vfs_getxattr_alloc(dentry, "security.capability",
+ &tmpbuf, size, GFP_NOFS);
+
+ if (ret < 0)
+ return ret;
+
+ fs_ns = inode->i_sb->s_user_ns;
+ if (ret == sizeof(struct vfs_cap_data) && fs_ns == &init_user_ns) {
+ /* If this is sizeof(vfs_cap_data) then we're ok with the
+ * on-disk value, so return that. */
+ if (alloc)
+ *buffer = tmpbuf;
+ else
+ kfree(tmpbuf);
+ return ret;
+ } else if (ret == sizeof(struct vfs_cap_data)) {
+ tmpbuf = cap_convert_v2_v3(tmpbuf, inode);
+ if (!tmpbuf)
+ return -EPERM;
+ } else if (ret != size) {
+ kfree(tmpbuf);
+ return -EINVAL;
+ }
+
+ nscap = (struct vfs_ns_cap_data *) tmpbuf;
+ root = le32_to_cpu(nscap->rootid);
+ kroot = make_kuid(fs_ns, root);
+
+ /* If the root kuid maps to a valid uid in current ns, then return
+ * this as a nscap. */
+ mappedroot = from_kuid(current_user_ns(), kroot);
+ if (mappedroot != (uid_t)-1) {
+ if (alloc) {
+ *buffer = tmpbuf;
+ nscap->rootid = cpu_to_le32(mappedroot);
+ } else
+ kfree(tmpbuf);
+ return size;
+ }
+
+ if (!rootid_owns_currentns(kroot)) {
+ kfree(tmpbuf);
+ return -EOPNOTSUPP;
+ }
+
+ /* This comes from a parent namespace. Return as a v2 capability */
+ size = sizeof(struct vfs_cap_data);
+ if (alloc) {
+ *buffer = kmalloc(size, GFP_ATOMIC);
+ if (*buffer) {
+ struct vfs_cap_data *cap = *buffer;
+ __le32 nsmagic, magic;
+ magic = VFS_CAP_REVISION_2;
+ nsmagic = le32_to_cpu(nscap->magic_etc);
+ if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
+ magic |= VFS_CAP_FLAGS_EFFECTIVE;
+ memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+ cap->magic_etc = cpu_to_le32(magic);
+ }
+ }
+ kfree(tmpbuf);
+ return size;
+}
+
+/*
+ * Root can only overwite an existing security.capability xattr
+ * if it is privileged over the root listed in the xattr
+ * Note we've already checked for ns_capable(CAP_SETFCAP) in the
+ * !capable_wrt_inode_uidgid() call by the caller, so we do not
+ * check for that here.
+ */
+static bool refuse_fcap_overwrite(struct inode *inode)
+{
+ void *tmpbuf;
+ int ret;
+ uid_t root;
+ kuid_t kroot;
+ struct vfs_ns_cap_data *nscap;
+ __u32 magic_etc;
+ bool should_refuse;
+ struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
+
+ ret = cap_inode_getsecurity(inode, "capability", &tmpbuf, true);
+ if (ret < 0)
+ return false;
+ if (ret == sizeof(struct vfs_cap_data) && fs_ns == &init_user_ns) {
+ /*
+ * host-root-installed capability, user-namespace-root may
+ * not overwrite this.
+ * TODO - if inode->i_sb->s_user_ns != &init_user_ns we do
+ * in fact want to allow it.
+ */
+ kfree(tmpbuf);
+ return true;
+ }
+ if (ret < sizeof(struct vfs_ns_cap_data)) {
+ /* Corrupt fscap. Caller is privileged wrt inode, permit fixup */
+ kfree(tmpbuf);
+ return false;
+ }
+
+ nscap = (struct vfs_ns_cap_data *)tmpbuf;
+
+ magic_etc = le32_to_cpu(nscap->magic_etc);
+ if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_3) {
+ /*
+ * This version is newer than we know about - i.e. from a newer
+ * kernel. Don't overwrite.
+ */
+ kfree(tmpbuf);
+ return true;
+ }
+ if (ret != sizeof(struct vfs_ns_cap_data)) {
+ /* Corrupt v4 fscap. Permit fixup */
+ kfree(tmpbuf);
+ return false;
+ }
+ root = le32_to_cpu(nscap->rootid);
+ kroot = make_kuid(&init_user_ns, root);
+ should_refuse = !kuid_has_mapping(current_user_ns(), kroot);
+ kfree(tmpbuf);
+ return should_refuse;
+}
+
+static kuid_t rootid_from_xattr(const void *value, size_t size,
+ struct user_namespace *task_ns)
+{
+ const struct vfs_ns_cap_data *nscap = value;
+ uid_t rootid = 0;
+
+ if (size == XATTR_CAPS_SZ_3)
+ rootid = le32_to_cpu(nscap->rootid);
+
+ return make_kuid(task_ns, rootid);
+}
+
+/*
+ * Use requested a write of security.capability but is in a non-init
+ * userns. So we construct and write a v4.
+ *
+ * If all is ok, wvalue has an allocated new value. Otherwise, wvalue
+ * is NULL.
+ */
+void cap_setxattr_make_nscap(struct dentry *dentry, const void *value, size_t size,
+ void **wvalue, size_t *wsize)
+{
+ struct vfs_ns_cap_data *nscap;
+ const struct vfs_cap_data *cap = value;
+ __u32 magic, nsmagic;
+ struct inode *inode = d_backing_inode(dentry);
+ struct user_namespace *task_ns = current_user_ns(),
+ *fs_ns = inode->i_sb->s_user_ns;
+ kuid_t rootid;
+
+ if (!value)
+ return;
+ if (size != XATTR_CAPS_SZ_2 && size != XATTR_CAPS_SZ_3)
+ return;
+ if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+ return;
+
+ /* refuse if security.capability exists */
+ if (refuse_fcap_overwrite(inode))
+ return;
+
+ rootid = rootid_from_xattr(value, size, task_ns);
+ if (!uid_valid(rootid))
+ return;
+
+ *wsize = sizeof(struct vfs_ns_cap_data);
+ nscap = kmalloc(*wsize, GFP_ATOMIC);
+ if (!nscap)
+ return;
+ nscap->rootid = cpu_to_le32(from_kuid(fs_ns, rootid));
+ nsmagic = VFS_CAP_REVISION_3;
+ magic = le32_to_cpu(cap->magic_etc);
+ if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+ nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
+ nscap->magic_etc = cpu_to_le32(nsmagic);
+ memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+
+ *wvalue = nscap;
+ return;
+}
+
/*
* Calculate the new process capability sets from the capability sets attached
* to a file.
@@ -385,7 +651,10 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
__u32 magic_etc;
unsigned tocopy, i;
int size;
- struct vfs_cap_data caps;
+ struct vfs_ns_cap_data data, *nscaps = &data;
+ struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
+ kuid_t rootkuid;
+ struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
@@ -393,17 +662,18 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
return -ENODATA;
size = __vfs_getxattr((struct dentry *)dentry, inode,
- XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ);
+ XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
if (size == -ENODATA || size == -EOPNOTSUPP)
/* no data, that's ok */
return -ENODATA;
+
if (size < 0)
return size;
if (size < sizeof(magic_etc))
return -EINVAL;
- cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc);
+ cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
switch (magic_etc & VFS_CAP_REVISION_MASK) {
case VFS_CAP_REVISION_1:
@@ -414,8 +684,25 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
case VFS_CAP_REVISION_2:
if (size != XATTR_CAPS_SZ_2)
return -EINVAL;
+ if (fs_ns != &init_user_ns) {
+ /* unpriv user mounted this fs; make sure they
+ * own current user_ns */
+ rootkuid = make_kuid(fs_ns, 0);
+ if (!rootid_owns_currentns(rootkuid))
+ return -ENODATA;
+ }
tocopy = VFS_CAP_U32_2;
break;
+ case VFS_CAP_REVISION_3:
+ if (size != XATTR_CAPS_SZ_3)
+ return -EINVAL;
+ tocopy = VFS_CAP_U32_3;
+
+ rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
+ if (!rootid_owns_currentns(rootkuid))
+ return -ENODATA;
+ break;
+
default:
return -EINVAL;
}
@@ -423,8 +710,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
CAP_FOR_EACH_U32(i) {
if (i >= tocopy)
break;
- cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted);
- cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable);
+ cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
+ cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
}
cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
@@ -462,8 +749,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
if (rc < 0) {
if (rc == -EINVAL)
- printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
- __func__, rc, bprm->filename);
+ printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
+ bprm->filename);
else if (rc == -ENODATA)
rc = 0;
goto out;
@@ -659,15 +946,21 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
int cap_inode_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (!strcmp(name, XATTR_NAME_CAPS)) {
- if (!capable(CAP_SETFCAP))
+ /* Ignore non-security xattrs */
+ if (strncmp(name, XATTR_SECURITY_PREFIX,
+ sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
+ return 0;
+
+ if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+ /* Write from initial user_ns will in * __vfs_setxattr_noperm()
+ * be diverted to a nscap write. But from initial user_ns we
+ * require CAP_SETFCAP targeted at init_user_ns */
+ if (current_user_ns() == &init_user_ns && !capable(CAP_SETFCAP))
return -EPERM;
return 0;
}
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- sizeof(XATTR_SECURITY_PREFIX) - 1) &&
- !capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -685,15 +978,22 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
*/
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
- if (!strcmp(name, XATTR_NAME_CAPS)) {
- if (!capable(CAP_SETFCAP))
+ /* Ignore non-security xattrs */
+ if (strncmp(name, XATTR_SECURITY_PREFIX,
+ sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
+ return 0;
+
+ if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+ /* security.capability gets namespaced */
+ struct inode *inode = d_backing_inode(dentry);
+ if (!inode)
+ return -EINVAL;
+ if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
return -EPERM;
return 0;
}
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- sizeof(XATTR_SECURITY_PREFIX) - 1) &&
- !capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -1081,6 +1381,7 @@ struct security_hook_list capability_hooks[] = {
LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec),
LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
+ LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
LSM_HOOK_INIT(mmap_file, cap_mmap_file),
LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
--
2.7.4
"Serge E. Hallyn" <[email protected]> writes:
> Root in a user ns cannot be trusted to write a traditional
> security.capability xattr. If it were allowed to do so, then any
> unprivileged user on the host could map his own uid to root in a
> namespace, write the xattr, and execute the file with privilege on the
> host.
>
> This patch introduces v3 of the security.capability xattr. It builds a
> vfs_ns_cap_data struct by appending a uid_t rootid to struct
> vfs_cap_data. This is the absolute uid_t (i.e. the uid_t in
> init_user_ns) of the root id (uid 0 in a namespace) in whose namespaces
> the file capabilities may take effect.
>
> When a task in a user ns (which is privileged with CAP_SETFCAP toward
> that user_ns) asks to write v2 security.capability, the kernel will
> transparently rewrite the xattr as a v3 with the appropriate rootid.
> Subsequently, any task executing the file which has the noted kuid as
> its root uid, or which is in a descendent user_ns of such a user_ns,
> will run the file with capabilities.
>
> If a task writes a v3 security.capability, then it can provide a
> uid (valid within its own user namespace, over which it has CAP_SETFCAP)
> for the xattr. The kernel will translate that to the absolute uid, and
> write that to disk. After this, a task in the writer's namespace will
> not be able to use those capabilities, but a task in a namespace where
> the given uid is root will.
>
> Only a single security.capability xattr may be written. A task may
> overwrite the existing one so long as it was written by a user mapped
> into his own user_ns over which he has CAP_SETFCAP.
>
> This allows a simple setxattr to work, allows tar/untar to work, and
> allows us to tar in one namespace and untar in another while preserving
> the capability, without risking leaking privilege into a parent
> namespace.
Skimming through this, this looks good. It is doing enough different
things I want to read through this carefully before applying it, but I
expect I will.
Thank you,
Eric
> Changelog:
> Nov 02 2016: fix invalid check at refuse_fcap_overwrite()
> Nov 07 2016: convert rootid from and to fs user_ns
> ---
> fs/xattr.c | 27 +++-
> include/linux/capability.h | 5 +-
> include/linux/security.h | 2 +
> include/uapi/linux/capability.h | 22 ++-
> security/commoncap.c | 335 ++++++++++++++++++++++++++++++++++++++--
> 5 files changed, 366 insertions(+), 25 deletions(-)
>
> diff --git a/fs/xattr.c b/fs/xattr.c
> index 2d13b4e..e9e70f1 100644
> --- a/fs/xattr.c
> +++ b/fs/xattr.c
> @@ -171,11 +171,27 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
> {
> struct inode *inode = dentry->d_inode;
> int error = -EAGAIN;
> + void *wvalue = NULL;
> + size_t wsize = 0;
> int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
> XATTR_SECURITY_PREFIX_LEN);
>
> - if (issec)
> + if (issec) {
> inode->i_flags &= ~S_NOSEC;
> +
> + /* if root in a non-init user_ns tries to set
> + * security.capability, write the virtualized
> + * xattr in its place */
> + if (!strcmp(name, "security.capability") &&
> + current_user_ns() != &init_user_ns) {
> + cap_setxattr_make_nscap(dentry, value, size, &wvalue, &wsize);
> + if (!wvalue)
> + return -EPERM;
> + value = wvalue;
> + size = wsize;
> + }
> + }
> +
> if (inode->i_opflags & IOP_XATTR) {
> error = __vfs_setxattr(dentry, inode, name, value, size, flags);
> if (!error) {
> @@ -184,8 +200,10 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
> size, flags);
> }
> } else {
> - if (unlikely(is_bad_inode(inode)))
> - return -EIO;
> + if (unlikely(is_bad_inode(inode))) {
> + error = -EIO;
> + goto out;
> + }
> }
> if (error == -EAGAIN) {
> error = -EOPNOTSUPP;
> @@ -200,10 +218,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
> }
> }
>
> +out:
> + kfree(wvalue);
> return error;
> }
>
> -
> int
> vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
> size_t size, int flags)
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index dbc21c7..edd5be1 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -13,7 +13,7 @@
> #define _LINUX_CAPABILITY_H
>
> #include <uapi/linux/capability.h>
> -
> +#include <linux/uidgid.h>
>
> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
> #define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
> @@ -246,4 +246,7 @@ extern bool file_ns_capable(const struct file *file, struct user_namespace *ns,
> /* audit system wants to get cap info from files as well */
> extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
>
> +extern void cap_setxattr_make_nscap(struct dentry *dentry, const void *value,
> + size_t size, void **wvalue, size_t *wsize);
> +
> #endif /* !_LINUX_CAPABILITY_H */
> diff --git a/include/linux/security.h b/include/linux/security.h
> index c2125e9..3127531 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -86,6 +86,8 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
> extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
> extern int cap_inode_need_killpriv(struct dentry *dentry);
> extern int cap_inode_killpriv(struct dentry *dentry);
> +extern int cap_inode_getsecurity(struct inode *inode, const char *name,
> + void **buffer, bool alloc);
> extern int cap_mmap_addr(unsigned long addr);
> extern int cap_mmap_file(struct file *file, unsigned long reqprot,
> unsigned long prot, unsigned long flags);
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index 49bc062..fd4f87d 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -60,9 +60,13 @@ typedef struct __user_cap_data_struct {
> #define VFS_CAP_U32_2 2
> #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
>
> -#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
> -#define VFS_CAP_U32 VFS_CAP_U32_2
> -#define VFS_CAP_REVISION VFS_CAP_REVISION_2
> +#define VFS_CAP_REVISION_3 0x03000000
> +#define VFS_CAP_U32_3 2
> +#define XATTR_CAPS_SZ_3 (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3))
> +
> +#define XATTR_CAPS_SZ XATTR_CAPS_SZ_3
> +#define VFS_CAP_U32 VFS_CAP_U32_3
> +#define VFS_CAP_REVISION VFS_CAP_REVISION_3
>
> struct vfs_cap_data {
> __le32 magic_etc; /* Little endian */
> @@ -72,6 +76,18 @@ struct vfs_cap_data {
> } data[VFS_CAP_U32];
> };
>
> +/*
> + * same as vfs_cap_data but with a rootid at the end
> + */
> +struct vfs_ns_cap_data {
> + __le32 magic_etc;
> + struct {
> + __le32 permitted; /* Little endian */
> + __le32 inheritable; /* Little endian */
> + } data[VFS_CAP_U32];
> + __le32 rootid;
> +};
> +
> #ifndef __KERNEL__
>
> /*
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 8df676f..1f189b2 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -332,6 +332,272 @@ int cap_inode_killpriv(struct dentry *dentry)
> return error;
> }
>
> +static bool rootid_owns_currentns(kuid_t kroot)
> +{
> + struct user_namespace *ns;
> +
> + if (!uid_valid(kroot))
> + return false;
> +
> + for (ns = current_user_ns(); ; ns = ns->parent) {
> + if (from_kuid(ns, kroot) == 0) {
> + return true;
> + }
> + if (ns == &init_user_ns)
> + break;
> + }
> +
> + return false;
> +}
> +
> +static char *cap_convert_v2_v3(char *buf, struct inode *inode)
> +{
> + char *ret;
> + struct vfs_ns_cap_data *v3;
> + struct vfs_cap_data *v2 = (struct vfs_cap_data *)buf;
> + kuid_t krootid;
> +
> + krootid = make_kuid(inode->i_sb->s_user_ns, 0);
> + if (!uid_valid(krootid)) {
> + ret = ERR_PTR(-EPERM);
> + goto out;
> + }
> + ret = kmalloc(sizeof(struct vfs_ns_cap_data), GFP_NOFS);
> + if (!ret) {
> + ret = ERR_PTR(-ENOMEM);
> + goto out;
> + }
> + v3 = (struct vfs_ns_cap_data *)ret;
> +
> + memcpy(&v3->data, &v2->data, sizeof(v2->data));
> + v3->magic_etc = VFS_CAP_REVISION_3;
> + if (v2->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
> + v3->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE;
> + v3->rootid = from_kuid(&init_user_ns, krootid);
> +
> +out:
> + kfree(buf);
> + return ret;
> +}
> +
> +/*
> + * getsecurity: We are called for security.* before any attempt to read the
> + * xattr from the inode itself.
> + *
> + * This gives us a chance to read the on-disk value and convert it. If we
> + * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
> + *
> + * Note we are not called by vfs_getxattr_alloc(), but that is only called
> + * by the integrity subsystem, which really wants the unconverted values -
> + * so that's good.
> + */
> +int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
> + bool alloc)
> +{
> + int size, ret;
> + kuid_t kroot;
> + uid_t root, mappedroot;
> + char *tmpbuf = NULL;
> + struct vfs_ns_cap_data *nscap;
> + struct dentry *dentry;
> + struct user_namespace *fs_ns;
> +
> + if (!inode->i_op->getxattr)
> + return -EOPNOTSUPP;
> +
> + if (strcmp(name, "capability") != 0)
> + return -EOPNOTSUPP;
> +
> + dentry = d_find_alias(inode);
> + if (!dentry)
> + return -EINVAL;
> +
> + size = sizeof(struct vfs_ns_cap_data);
> + ret = vfs_getxattr_alloc(dentry, "security.capability",
> + &tmpbuf, size, GFP_NOFS);
> +
> + if (ret < 0)
> + return ret;
> +
> + fs_ns = inode->i_sb->s_user_ns;
> + if (ret == sizeof(struct vfs_cap_data) && fs_ns == &init_user_ns) {
> + /* If this is sizeof(vfs_cap_data) then we're ok with the
> + * on-disk value, so return that. */
> + if (alloc)
> + *buffer = tmpbuf;
> + else
> + kfree(tmpbuf);
> + return ret;
> + } else if (ret == sizeof(struct vfs_cap_data)) {
> + tmpbuf = cap_convert_v2_v3(tmpbuf, inode);
> + if (!tmpbuf)
> + return -EPERM;
> + } else if (ret != size) {
> + kfree(tmpbuf);
> + return -EINVAL;
> + }
> +
> + nscap = (struct vfs_ns_cap_data *) tmpbuf;
> + root = le32_to_cpu(nscap->rootid);
> + kroot = make_kuid(fs_ns, root);
> +
> + /* If the root kuid maps to a valid uid in current ns, then return
> + * this as a nscap. */
> + mappedroot = from_kuid(current_user_ns(), kroot);
> + if (mappedroot != (uid_t)-1) {
> + if (alloc) {
> + *buffer = tmpbuf;
> + nscap->rootid = cpu_to_le32(mappedroot);
> + } else
> + kfree(tmpbuf);
> + return size;
> + }
> +
> + if (!rootid_owns_currentns(kroot)) {
> + kfree(tmpbuf);
> + return -EOPNOTSUPP;
> + }
> +
> + /* This comes from a parent namespace. Return as a v2 capability */
> + size = sizeof(struct vfs_cap_data);
> + if (alloc) {
> + *buffer = kmalloc(size, GFP_ATOMIC);
> + if (*buffer) {
> + struct vfs_cap_data *cap = *buffer;
> + __le32 nsmagic, magic;
> + magic = VFS_CAP_REVISION_2;
> + nsmagic = le32_to_cpu(nscap->magic_etc);
> + if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
> + magic |= VFS_CAP_FLAGS_EFFECTIVE;
> + memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
> + cap->magic_etc = cpu_to_le32(magic);
> + }
> + }
> + kfree(tmpbuf);
> + return size;
> +}
> +
> +/*
> + * Root can only overwite an existing security.capability xattr
> + * if it is privileged over the root listed in the xattr
> + * Note we've already checked for ns_capable(CAP_SETFCAP) in the
> + * !capable_wrt_inode_uidgid() call by the caller, so we do not
> + * check for that here.
> + */
> +static bool refuse_fcap_overwrite(struct inode *inode)
> +{
> + void *tmpbuf;
> + int ret;
> + uid_t root;
> + kuid_t kroot;
> + struct vfs_ns_cap_data *nscap;
> + __u32 magic_etc;
> + bool should_refuse;
> + struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
> +
> + ret = cap_inode_getsecurity(inode, "capability", &tmpbuf, true);
> + if (ret < 0)
> + return false;
> + if (ret == sizeof(struct vfs_cap_data) && fs_ns == &init_user_ns) {
> + /*
> + * host-root-installed capability, user-namespace-root may
> + * not overwrite this.
> + * TODO - if inode->i_sb->s_user_ns != &init_user_ns we do
> + * in fact want to allow it.
> + */
> + kfree(tmpbuf);
> + return true;
> + }
> + if (ret < sizeof(struct vfs_ns_cap_data)) {
> + /* Corrupt fscap. Caller is privileged wrt inode, permit fixup */
> + kfree(tmpbuf);
> + return false;
> + }
> +
> + nscap = (struct vfs_ns_cap_data *)tmpbuf;
> +
> + magic_etc = le32_to_cpu(nscap->magic_etc);
> + if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_3) {
> + /*
> + * This version is newer than we know about - i.e. from a newer
> + * kernel. Don't overwrite.
> + */
> + kfree(tmpbuf);
> + return true;
> + }
> + if (ret != sizeof(struct vfs_ns_cap_data)) {
> + /* Corrupt v4 fscap. Permit fixup */
> + kfree(tmpbuf);
> + return false;
> + }
> + root = le32_to_cpu(nscap->rootid);
> + kroot = make_kuid(&init_user_ns, root);
> + should_refuse = !kuid_has_mapping(current_user_ns(), kroot);
> + kfree(tmpbuf);
> + return should_refuse;
> +}
> +
> +static kuid_t rootid_from_xattr(const void *value, size_t size,
> + struct user_namespace *task_ns)
> +{
> + const struct vfs_ns_cap_data *nscap = value;
> + uid_t rootid = 0;
> +
> + if (size == XATTR_CAPS_SZ_3)
> + rootid = le32_to_cpu(nscap->rootid);
> +
> + return make_kuid(task_ns, rootid);
> +}
> +
> +/*
> + * Use requested a write of security.capability but is in a non-init
> + * userns. So we construct and write a v4.
> + *
> + * If all is ok, wvalue has an allocated new value. Otherwise, wvalue
> + * is NULL.
> + */
> +void cap_setxattr_make_nscap(struct dentry *dentry, const void *value, size_t size,
> + void **wvalue, size_t *wsize)
> +{
> + struct vfs_ns_cap_data *nscap;
> + const struct vfs_cap_data *cap = value;
> + __u32 magic, nsmagic;
> + struct inode *inode = d_backing_inode(dentry);
> + struct user_namespace *task_ns = current_user_ns(),
> + *fs_ns = inode->i_sb->s_user_ns;
> + kuid_t rootid;
> +
> + if (!value)
> + return;
> + if (size != XATTR_CAPS_SZ_2 && size != XATTR_CAPS_SZ_3)
> + return;
> + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
> + return;
> +
> + /* refuse if security.capability exists */
> + if (refuse_fcap_overwrite(inode))
> + return;
> +
> + rootid = rootid_from_xattr(value, size, task_ns);
> + if (!uid_valid(rootid))
> + return;
> +
> + *wsize = sizeof(struct vfs_ns_cap_data);
> + nscap = kmalloc(*wsize, GFP_ATOMIC);
> + if (!nscap)
> + return;
> + nscap->rootid = cpu_to_le32(from_kuid(fs_ns, rootid));
> + nsmagic = VFS_CAP_REVISION_3;
> + magic = le32_to_cpu(cap->magic_etc);
> + if (magic & VFS_CAP_FLAGS_EFFECTIVE)
> + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
> + nscap->magic_etc = cpu_to_le32(nsmagic);
> + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
> +
> + *wvalue = nscap;
> + return;
> +}
> +
> /*
> * Calculate the new process capability sets from the capability sets attached
> * to a file.
> @@ -385,7 +651,10 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> __u32 magic_etc;
> unsigned tocopy, i;
> int size;
> - struct vfs_cap_data caps;
> + struct vfs_ns_cap_data data, *nscaps = &data;
> + struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
> + kuid_t rootkuid;
> + struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
>
> memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
>
> @@ -393,17 +662,18 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> return -ENODATA;
>
> size = __vfs_getxattr((struct dentry *)dentry, inode,
> - XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ);
> + XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> if (size == -ENODATA || size == -EOPNOTSUPP)
> /* no data, that's ok */
> return -ENODATA;
> +
> if (size < 0)
> return size;
>
> if (size < sizeof(magic_etc))
> return -EINVAL;
>
> - cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc);
> + cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
>
> switch (magic_etc & VFS_CAP_REVISION_MASK) {
> case VFS_CAP_REVISION_1:
> @@ -414,8 +684,25 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> case VFS_CAP_REVISION_2:
> if (size != XATTR_CAPS_SZ_2)
> return -EINVAL;
> + if (fs_ns != &init_user_ns) {
> + /* unpriv user mounted this fs; make sure they
> + * own current user_ns */
> + rootkuid = make_kuid(fs_ns, 0);
> + if (!rootid_owns_currentns(rootkuid))
> + return -ENODATA;
> + }
> tocopy = VFS_CAP_U32_2;
> break;
> + case VFS_CAP_REVISION_3:
> + if (size != XATTR_CAPS_SZ_3)
> + return -EINVAL;
> + tocopy = VFS_CAP_U32_3;
> +
> + rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
> + if (!rootid_owns_currentns(rootkuid))
> + return -ENODATA;
> + break;
> +
> default:
> return -EINVAL;
> }
> @@ -423,8 +710,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> CAP_FOR_EACH_U32(i) {
> if (i >= tocopy)
> break;
> - cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted);
> - cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable);
> + cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
> + cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
> }
>
> cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
> @@ -462,8 +749,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
> rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
> if (rc < 0) {
> if (rc == -EINVAL)
> - printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
> - __func__, rc, bprm->filename);
> + printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
> + bprm->filename);
> else if (rc == -ENODATA)
> rc = 0;
> goto out;
> @@ -659,15 +946,21 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
> int cap_inode_setxattr(struct dentry *dentry, const char *name,
> const void *value, size_t size, int flags)
> {
> - if (!strcmp(name, XATTR_NAME_CAPS)) {
> - if (!capable(CAP_SETFCAP))
> + /* Ignore non-security xattrs */
> + if (strncmp(name, XATTR_SECURITY_PREFIX,
> + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
> + return 0;
> +
> + if (strcmp(name, XATTR_NAME_CAPS) == 0) {
> + /* Write from initial user_ns will in * __vfs_setxattr_noperm()
> + * be diverted to a nscap write. But from initial user_ns we
> + * require CAP_SETFCAP targeted at init_user_ns */
> + if (current_user_ns() == &init_user_ns && !capable(CAP_SETFCAP))
> return -EPERM;
> return 0;
> }
>
> - if (!strncmp(name, XATTR_SECURITY_PREFIX,
> - sizeof(XATTR_SECURITY_PREFIX) - 1) &&
> - !capable(CAP_SYS_ADMIN))
> + if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> return 0;
> }
> @@ -685,15 +978,22 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
> */
> int cap_inode_removexattr(struct dentry *dentry, const char *name)
> {
> - if (!strcmp(name, XATTR_NAME_CAPS)) {
> - if (!capable(CAP_SETFCAP))
> + /* Ignore non-security xattrs */
> + if (strncmp(name, XATTR_SECURITY_PREFIX,
> + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
> + return 0;
> +
> + if (strcmp(name, XATTR_NAME_CAPS) == 0) {
> + /* security.capability gets namespaced */
> + struct inode *inode = d_backing_inode(dentry);
> + if (!inode)
> + return -EINVAL;
> + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
> return -EPERM;
> return 0;
> }
>
> - if (!strncmp(name, XATTR_SECURITY_PREFIX,
> - sizeof(XATTR_SECURITY_PREFIX) - 1) &&
> - !capable(CAP_SYS_ADMIN))
> + if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> return 0;
> }
> @@ -1081,6 +1381,7 @@ struct security_hook_list capability_hooks[] = {
> LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec),
> LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
> LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
> + LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
> LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
> LSM_HOOK_INIT(mmap_file, cap_mmap_file),
> LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
Hi Serge,
On 11/19/2016 04:17 PM, Serge E. Hallyn wrote:
> Root in a user ns cannot be trusted to write a traditional
> security.capability xattr. If it were allowed to do so, then any
> unprivileged user on the host could map his own uid to root in a
> namespace, write the xattr, and execute the file with privilege on the
> host.
>
> This patch introduces v3 of the security.capability xattr. It builds a
> vfs_ns_cap_data struct by appending a uid_t rootid to struct
> vfs_cap_data. This is the absolute uid_t (i.e. the uid_t in
> init_user_ns) of the root id (uid 0 in a namespace) in whose namespaces
> the file capabilities may take effect.
>
> When a task in a user ns (which is privileged with CAP_SETFCAP toward
> that user_ns) asks to write v2 security.capability, the kernel will
> transparently rewrite the xattr as a v3 with the appropriate rootid.
> Subsequently, any task executing the file which has the noted kuid as
> its root uid, or which is in a descendent user_ns of such a user_ns,
> will run the file with capabilities.
>
> If a task writes a v3 security.capability, then it can provide a
> uid (valid within its own user namespace, over which it has CAP_SETFCAP)
> for the xattr. The kernel will translate that to the absolute uid, and
> write that to disk. After this, a task in the writer's namespace will
> not be able to use those capabilities, but a task in a namespace where
> the given uid is root will.
>
> Only a single security.capability xattr may be written. A task may
> overwrite the existing one so long as it was written by a user mapped
> into his own user_ns over which he has CAP_SETFCAP.
>
> This allows a simple setxattr to work, allows tar/untar to work, and
> allows us to tar in one namespace and untar in another while preserving
> the capability, without risking leaking privilege into a parent
> namespace.
Could we have a man-pages patch for this feature? Presumably for
user_namespaces(7) or capabilities(7).
Cheers,
Michael
>
> Changelog:
> Nov 02 2016: fix invalid check at refuse_fcap_overwrite()
> Nov 07 2016: convert rootid from and to fs user_ns
> ---
> fs/xattr.c | 27 +++-
> include/linux/capability.h | 5 +-
> include/linux/security.h | 2 +
> include/uapi/linux/capability.h | 22 ++-
> security/commoncap.c | 335 ++++++++++++++++++++++++++++++++++++++--
> 5 files changed, 366 insertions(+), 25 deletions(-)
>
> diff --git a/fs/xattr.c b/fs/xattr.c
> index 2d13b4e..e9e70f1 100644
> --- a/fs/xattr.c
> +++ b/fs/xattr.c
> @@ -171,11 +171,27 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
> {
> struct inode *inode = dentry->d_inode;
> int error = -EAGAIN;
> + void *wvalue = NULL;
> + size_t wsize = 0;
> int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
> XATTR_SECURITY_PREFIX_LEN);
>
> - if (issec)
> + if (issec) {
> inode->i_flags &= ~S_NOSEC;
> +
> + /* if root in a non-init user_ns tries to set
> + * security.capability, write the virtualized
> + * xattr in its place */
> + if (!strcmp(name, "security.capability") &&
> + current_user_ns() != &init_user_ns) {
> + cap_setxattr_make_nscap(dentry, value, size, &wvalue, &wsize);
> + if (!wvalue)
> + return -EPERM;
> + value = wvalue;
> + size = wsize;
> + }
> + }
> +
> if (inode->i_opflags & IOP_XATTR) {
> error = __vfs_setxattr(dentry, inode, name, value, size, flags);
> if (!error) {
> @@ -184,8 +200,10 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
> size, flags);
> }
> } else {
> - if (unlikely(is_bad_inode(inode)))
> - return -EIO;
> + if (unlikely(is_bad_inode(inode))) {
> + error = -EIO;
> + goto out;
> + }
> }
> if (error == -EAGAIN) {
> error = -EOPNOTSUPP;
> @@ -200,10 +218,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
> }
> }
>
> +out:
> + kfree(wvalue);
> return error;
> }
>
> -
> int
> vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
> size_t size, int flags)
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index dbc21c7..edd5be1 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -13,7 +13,7 @@
> #define _LINUX_CAPABILITY_H
>
> #include <uapi/linux/capability.h>
> -
> +#include <linux/uidgid.h>
>
> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
> #define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
> @@ -246,4 +246,7 @@ extern bool file_ns_capable(const struct file *file, struct user_namespace *ns,
> /* audit system wants to get cap info from files as well */
> extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
>
> +extern void cap_setxattr_make_nscap(struct dentry *dentry, const void *value,
> + size_t size, void **wvalue, size_t *wsize);
> +
> #endif /* !_LINUX_CAPABILITY_H */
> diff --git a/include/linux/security.h b/include/linux/security.h
> index c2125e9..3127531 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -86,6 +86,8 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
> extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
> extern int cap_inode_need_killpriv(struct dentry *dentry);
> extern int cap_inode_killpriv(struct dentry *dentry);
> +extern int cap_inode_getsecurity(struct inode *inode, const char *name,
> + void **buffer, bool alloc);
> extern int cap_mmap_addr(unsigned long addr);
> extern int cap_mmap_file(struct file *file, unsigned long reqprot,
> unsigned long prot, unsigned long flags);
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index 49bc062..fd4f87d 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -60,9 +60,13 @@ typedef struct __user_cap_data_struct {
> #define VFS_CAP_U32_2 2
> #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
>
> -#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
> -#define VFS_CAP_U32 VFS_CAP_U32_2
> -#define VFS_CAP_REVISION VFS_CAP_REVISION_2
> +#define VFS_CAP_REVISION_3 0x03000000
> +#define VFS_CAP_U32_3 2
> +#define XATTR_CAPS_SZ_3 (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3))
> +
> +#define XATTR_CAPS_SZ XATTR_CAPS_SZ_3
> +#define VFS_CAP_U32 VFS_CAP_U32_3
> +#define VFS_CAP_REVISION VFS_CAP_REVISION_3
>
> struct vfs_cap_data {
> __le32 magic_etc; /* Little endian */
> @@ -72,6 +76,18 @@ struct vfs_cap_data {
> } data[VFS_CAP_U32];
> };
>
> +/*
> + * same as vfs_cap_data but with a rootid at the end
> + */
> +struct vfs_ns_cap_data {
> + __le32 magic_etc;
> + struct {
> + __le32 permitted; /* Little endian */
> + __le32 inheritable; /* Little endian */
> + } data[VFS_CAP_U32];
> + __le32 rootid;
> +};
> +
> #ifndef __KERNEL__
>
> /*
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 8df676f..1f189b2 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -332,6 +332,272 @@ int cap_inode_killpriv(struct dentry *dentry)
> return error;
> }
>
> +static bool rootid_owns_currentns(kuid_t kroot)
> +{
> + struct user_namespace *ns;
> +
> + if (!uid_valid(kroot))
> + return false;
> +
> + for (ns = current_user_ns(); ; ns = ns->parent) {
> + if (from_kuid(ns, kroot) == 0) {
> + return true;
> + }
> + if (ns == &init_user_ns)
> + break;
> + }
> +
> + return false;
> +}
> +
> +static char *cap_convert_v2_v3(char *buf, struct inode *inode)
> +{
> + char *ret;
> + struct vfs_ns_cap_data *v3;
> + struct vfs_cap_data *v2 = (struct vfs_cap_data *)buf;
> + kuid_t krootid;
> +
> + krootid = make_kuid(inode->i_sb->s_user_ns, 0);
> + if (!uid_valid(krootid)) {
> + ret = ERR_PTR(-EPERM);
> + goto out;
> + }
> + ret = kmalloc(sizeof(struct vfs_ns_cap_data), GFP_NOFS);
> + if (!ret) {
> + ret = ERR_PTR(-ENOMEM);
> + goto out;
> + }
> + v3 = (struct vfs_ns_cap_data *)ret;
> +
> + memcpy(&v3->data, &v2->data, sizeof(v2->data));
> + v3->magic_etc = VFS_CAP_REVISION_3;
> + if (v2->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
> + v3->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE;
> + v3->rootid = from_kuid(&init_user_ns, krootid);
> +
> +out:
> + kfree(buf);
> + return ret;
> +}
> +
> +/*
> + * getsecurity: We are called for security.* before any attempt to read the
> + * xattr from the inode itself.
> + *
> + * This gives us a chance to read the on-disk value and convert it. If we
> + * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
> + *
> + * Note we are not called by vfs_getxattr_alloc(), but that is only called
> + * by the integrity subsystem, which really wants the unconverted values -
> + * so that's good.
> + */
> +int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
> + bool alloc)
> +{
> + int size, ret;
> + kuid_t kroot;
> + uid_t root, mappedroot;
> + char *tmpbuf = NULL;
> + struct vfs_ns_cap_data *nscap;
> + struct dentry *dentry;
> + struct user_namespace *fs_ns;
> +
> + if (!inode->i_op->getxattr)
> + return -EOPNOTSUPP;
> +
> + if (strcmp(name, "capability") != 0)
> + return -EOPNOTSUPP;
> +
> + dentry = d_find_alias(inode);
> + if (!dentry)
> + return -EINVAL;
> +
> + size = sizeof(struct vfs_ns_cap_data);
> + ret = vfs_getxattr_alloc(dentry, "security.capability",
> + &tmpbuf, size, GFP_NOFS);
> +
> + if (ret < 0)
> + return ret;
> +
> + fs_ns = inode->i_sb->s_user_ns;
> + if (ret == sizeof(struct vfs_cap_data) && fs_ns == &init_user_ns) {
> + /* If this is sizeof(vfs_cap_data) then we're ok with the
> + * on-disk value, so return that. */
> + if (alloc)
> + *buffer = tmpbuf;
> + else
> + kfree(tmpbuf);
> + return ret;
> + } else if (ret == sizeof(struct vfs_cap_data)) {
> + tmpbuf = cap_convert_v2_v3(tmpbuf, inode);
> + if (!tmpbuf)
> + return -EPERM;
> + } else if (ret != size) {
> + kfree(tmpbuf);
> + return -EINVAL;
> + }
> +
> + nscap = (struct vfs_ns_cap_data *) tmpbuf;
> + root = le32_to_cpu(nscap->rootid);
> + kroot = make_kuid(fs_ns, root);
> +
> + /* If the root kuid maps to a valid uid in current ns, then return
> + * this as a nscap. */
> + mappedroot = from_kuid(current_user_ns(), kroot);
> + if (mappedroot != (uid_t)-1) {
> + if (alloc) {
> + *buffer = tmpbuf;
> + nscap->rootid = cpu_to_le32(mappedroot);
> + } else
> + kfree(tmpbuf);
> + return size;
> + }
> +
> + if (!rootid_owns_currentns(kroot)) {
> + kfree(tmpbuf);
> + return -EOPNOTSUPP;
> + }
> +
> + /* This comes from a parent namespace. Return as a v2 capability */
> + size = sizeof(struct vfs_cap_data);
> + if (alloc) {
> + *buffer = kmalloc(size, GFP_ATOMIC);
> + if (*buffer) {
> + struct vfs_cap_data *cap = *buffer;
> + __le32 nsmagic, magic;
> + magic = VFS_CAP_REVISION_2;
> + nsmagic = le32_to_cpu(nscap->magic_etc);
> + if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
> + magic |= VFS_CAP_FLAGS_EFFECTIVE;
> + memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
> + cap->magic_etc = cpu_to_le32(magic);
> + }
> + }
> + kfree(tmpbuf);
> + return size;
> +}
> +
> +/*
> + * Root can only overwite an existing security.capability xattr
> + * if it is privileged over the root listed in the xattr
> + * Note we've already checked for ns_capable(CAP_SETFCAP) in the
> + * !capable_wrt_inode_uidgid() call by the caller, so we do not
> + * check for that here.
> + */
> +static bool refuse_fcap_overwrite(struct inode *inode)
> +{
> + void *tmpbuf;
> + int ret;
> + uid_t root;
> + kuid_t kroot;
> + struct vfs_ns_cap_data *nscap;
> + __u32 magic_etc;
> + bool should_refuse;
> + struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
> +
> + ret = cap_inode_getsecurity(inode, "capability", &tmpbuf, true);
> + if (ret < 0)
> + return false;
> + if (ret == sizeof(struct vfs_cap_data) && fs_ns == &init_user_ns) {
> + /*
> + * host-root-installed capability, user-namespace-root may
> + * not overwrite this.
> + * TODO - if inode->i_sb->s_user_ns != &init_user_ns we do
> + * in fact want to allow it.
> + */
> + kfree(tmpbuf);
> + return true;
> + }
> + if (ret < sizeof(struct vfs_ns_cap_data)) {
> + /* Corrupt fscap. Caller is privileged wrt inode, permit fixup */
> + kfree(tmpbuf);
> + return false;
> + }
> +
> + nscap = (struct vfs_ns_cap_data *)tmpbuf;
> +
> + magic_etc = le32_to_cpu(nscap->magic_etc);
> + if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_3) {
> + /*
> + * This version is newer than we know about - i.e. from a newer
> + * kernel. Don't overwrite.
> + */
> + kfree(tmpbuf);
> + return true;
> + }
> + if (ret != sizeof(struct vfs_ns_cap_data)) {
> + /* Corrupt v4 fscap. Permit fixup */
> + kfree(tmpbuf);
> + return false;
> + }
> + root = le32_to_cpu(nscap->rootid);
> + kroot = make_kuid(&init_user_ns, root);
> + should_refuse = !kuid_has_mapping(current_user_ns(), kroot);
> + kfree(tmpbuf);
> + return should_refuse;
> +}
> +
> +static kuid_t rootid_from_xattr(const void *value, size_t size,
> + struct user_namespace *task_ns)
> +{
> + const struct vfs_ns_cap_data *nscap = value;
> + uid_t rootid = 0;
> +
> + if (size == XATTR_CAPS_SZ_3)
> + rootid = le32_to_cpu(nscap->rootid);
> +
> + return make_kuid(task_ns, rootid);
> +}
> +
> +/*
> + * Use requested a write of security.capability but is in a non-init
> + * userns. So we construct and write a v4.
> + *
> + * If all is ok, wvalue has an allocated new value. Otherwise, wvalue
> + * is NULL.
> + */
> +void cap_setxattr_make_nscap(struct dentry *dentry, const void *value, size_t size,
> + void **wvalue, size_t *wsize)
> +{
> + struct vfs_ns_cap_data *nscap;
> + const struct vfs_cap_data *cap = value;
> + __u32 magic, nsmagic;
> + struct inode *inode = d_backing_inode(dentry);
> + struct user_namespace *task_ns = current_user_ns(),
> + *fs_ns = inode->i_sb->s_user_ns;
> + kuid_t rootid;
> +
> + if (!value)
> + return;
> + if (size != XATTR_CAPS_SZ_2 && size != XATTR_CAPS_SZ_3)
> + return;
> + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
> + return;
> +
> + /* refuse if security.capability exists */
> + if (refuse_fcap_overwrite(inode))
> + return;
> +
> + rootid = rootid_from_xattr(value, size, task_ns);
> + if (!uid_valid(rootid))
> + return;
> +
> + *wsize = sizeof(struct vfs_ns_cap_data);
> + nscap = kmalloc(*wsize, GFP_ATOMIC);
> + if (!nscap)
> + return;
> + nscap->rootid = cpu_to_le32(from_kuid(fs_ns, rootid));
> + nsmagic = VFS_CAP_REVISION_3;
> + magic = le32_to_cpu(cap->magic_etc);
> + if (magic & VFS_CAP_FLAGS_EFFECTIVE)
> + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
> + nscap->magic_etc = cpu_to_le32(nsmagic);
> + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
> +
> + *wvalue = nscap;
> + return;
> +}
> +
> /*
> * Calculate the new process capability sets from the capability sets attached
> * to a file.
> @@ -385,7 +651,10 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> __u32 magic_etc;
> unsigned tocopy, i;
> int size;
> - struct vfs_cap_data caps;
> + struct vfs_ns_cap_data data, *nscaps = &data;
> + struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
> + kuid_t rootkuid;
> + struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
>
> memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
>
> @@ -393,17 +662,18 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> return -ENODATA;
>
> size = __vfs_getxattr((struct dentry *)dentry, inode,
> - XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ);
> + XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> if (size == -ENODATA || size == -EOPNOTSUPP)
> /* no data, that's ok */
> return -ENODATA;
> +
> if (size < 0)
> return size;
>
> if (size < sizeof(magic_etc))
> return -EINVAL;
>
> - cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc);
> + cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
>
> switch (magic_etc & VFS_CAP_REVISION_MASK) {
> case VFS_CAP_REVISION_1:
> @@ -414,8 +684,25 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> case VFS_CAP_REVISION_2:
> if (size != XATTR_CAPS_SZ_2)
> return -EINVAL;
> + if (fs_ns != &init_user_ns) {
> + /* unpriv user mounted this fs; make sure they
> + * own current user_ns */
> + rootkuid = make_kuid(fs_ns, 0);
> + if (!rootid_owns_currentns(rootkuid))
> + return -ENODATA;
> + }
> tocopy = VFS_CAP_U32_2;
> break;
> + case VFS_CAP_REVISION_3:
> + if (size != XATTR_CAPS_SZ_3)
> + return -EINVAL;
> + tocopy = VFS_CAP_U32_3;
> +
> + rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
> + if (!rootid_owns_currentns(rootkuid))
> + return -ENODATA;
> + break;
> +
> default:
> return -EINVAL;
> }
> @@ -423,8 +710,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> CAP_FOR_EACH_U32(i) {
> if (i >= tocopy)
> break;
> - cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted);
> - cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable);
> + cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
> + cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
> }
>
> cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
> @@ -462,8 +749,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
> rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
> if (rc < 0) {
> if (rc == -EINVAL)
> - printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
> - __func__, rc, bprm->filename);
> + printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
> + bprm->filename);
> else if (rc == -ENODATA)
> rc = 0;
> goto out;
> @@ -659,15 +946,21 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
> int cap_inode_setxattr(struct dentry *dentry, const char *name,
> const void *value, size_t size, int flags)
> {
> - if (!strcmp(name, XATTR_NAME_CAPS)) {
> - if (!capable(CAP_SETFCAP))
> + /* Ignore non-security xattrs */
> + if (strncmp(name, XATTR_SECURITY_PREFIX,
> + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
> + return 0;
> +
> + if (strcmp(name, XATTR_NAME_CAPS) == 0) {
> + /* Write from initial user_ns will in * __vfs_setxattr_noperm()
> + * be diverted to a nscap write. But from initial user_ns we
> + * require CAP_SETFCAP targeted at init_user_ns */
> + if (current_user_ns() == &init_user_ns && !capable(CAP_SETFCAP))
> return -EPERM;
> return 0;
> }
>
> - if (!strncmp(name, XATTR_SECURITY_PREFIX,
> - sizeof(XATTR_SECURITY_PREFIX) - 1) &&
> - !capable(CAP_SYS_ADMIN))
> + if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> return 0;
> }
> @@ -685,15 +978,22 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
> */
> int cap_inode_removexattr(struct dentry *dentry, const char *name)
> {
> - if (!strcmp(name, XATTR_NAME_CAPS)) {
> - if (!capable(CAP_SETFCAP))
> + /* Ignore non-security xattrs */
> + if (strncmp(name, XATTR_SECURITY_PREFIX,
> + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
> + return 0;
> +
> + if (strcmp(name, XATTR_NAME_CAPS) == 0) {
> + /* security.capability gets namespaced */
> + struct inode *inode = d_backing_inode(dentry);
> + if (!inode)
> + return -EINVAL;
> + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
> return -EPERM;
> return 0;
> }
>
> - if (!strncmp(name, XATTR_SECURITY_PREFIX,
> - sizeof(XATTR_SECURITY_PREFIX) - 1) &&
> - !capable(CAP_SYS_ADMIN))
> + if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> return 0;
> }
> @@ -1081,6 +1381,7 @@ struct security_hook_list capability_hooks[] = {
> LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec),
> LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
> LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
> + LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
> LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
> LSM_HOOK_INIT(mmap_file, cap_mmap_file),
> LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
>
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
Quoting Michael Kerrisk (man-pages) ([email protected]):
> Hi Serge,
>
> On 11/19/2016 04:17 PM, Serge E. Hallyn wrote:
> > Root in a user ns cannot be trusted to write a traditional
> > security.capability xattr. If it were allowed to do so, then any
> > unprivileged user on the host could map his own uid to root in a
> > namespace, write the xattr, and execute the file with privilege on the
> > host.
> >
> > This patch introduces v3 of the security.capability xattr. It builds a
> > vfs_ns_cap_data struct by appending a uid_t rootid to struct
> > vfs_cap_data. This is the absolute uid_t (i.e. the uid_t in
> > init_user_ns) of the root id (uid 0 in a namespace) in whose namespaces
> > the file capabilities may take effect.
> >
> > When a task in a user ns (which is privileged with CAP_SETFCAP toward
> > that user_ns) asks to write v2 security.capability, the kernel will
> > transparently rewrite the xattr as a v3 with the appropriate rootid.
> > Subsequently, any task executing the file which has the noted kuid as
> > its root uid, or which is in a descendent user_ns of such a user_ns,
> > will run the file with capabilities.
> >
> > If a task writes a v3 security.capability, then it can provide a
> > uid (valid within its own user namespace, over which it has CAP_SETFCAP)
> > for the xattr. The kernel will translate that to the absolute uid, and
> > write that to disk. After this, a task in the writer's namespace will
> > not be able to use those capabilities, but a task in a namespace where
> > the given uid is root will.
> >
> > Only a single security.capability xattr may be written. A task may
> > overwrite the existing one so long as it was written by a user mapped
> > into his own user_ns over which he has CAP_SETFCAP.
> >
> > This allows a simple setxattr to work, allows tar/untar to work, and
> > allows us to tar in one namespace and untar in another while preserving
> > the capability, without risking leaking privilege into a parent
> > namespace.
>
> Could we have a man-pages patch for this feature? Presumably for
> user_namespaces(7) or capabilities(7).
capabilities.7 doesn't actually mention anything about user namespaces
right now. I'll come up with a patch for both I think. Do you have a
deadline for a new release coming up?
Hi Serge,
On 11/24/2016 11:52 PM, Serge E. Hallyn wrote:
> Quoting Michael Kerrisk (man-pages) ([email protected]):
[...]
>> Could we have a man-pages patch for this feature? Presumably for
>> user_namespaces(7) or capabilities(7).
>
> capabilities.7 doesn't actually mention anything about user namespaces
> right now.
True. There's really just this:
Interaction with user namespaces
For a discussion of the interaction of capabilities and user
namespaces, see user_namespaces(7).
> I'll come up with a patch for both I think. Do you have a
> deadline for a new release coming up?
No deadlines as such. The last couple of years, as a sort of
experiment, I've fallen into the same release cycle as the kernel
(typically making a release in the week or so after the kernel release),
and I am even using a similar numbering scheme. Ideally, the man-pages
patch would go into the release that corresponds to the kernel release
that makes the change.
Cheers,
Michael
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
On Fri, Nov 25, 2016 at 09:33:50AM +0100, Michael Kerrisk (man-pages) wrote:
> Hi Serge,
>
> On 11/24/2016 11:52 PM, Serge E. Hallyn wrote:
> > Quoting Michael Kerrisk (man-pages) ([email protected]):
>
> [...]
>
> >> Could we have a man-pages patch for this feature? Presumably for
> >> user_namespaces(7) or capabilities(7).
> >
> > capabilities.7 doesn't actually mention anything about user namespaces
> > right now.
>
> True. There's really just this:
>
> Interaction with user namespaces
> For a discussion of the interaction of capabilities and user
> namespaces, see user_namespaces(7).
>
> > I'll come up with a patch for both I think. Do you have a
> > deadline for a new release coming up?
>
> No deadlines as such. The last couple of years, as a sort of
> experiment, I've fallen into the same release cycle as the kernel
> (typically making a release in the week or so after the kernel release),
> and I am even using a similar numbering scheme. Ideally, the man-pages
> patch would go into the release that corresponds to the kernel release
> that makes the change.
Cool - I'll write something up in the next few weeks.
On 11/25/2016 06:50 PM, Serge E. Hallyn wrote:
> On Fri, Nov 25, 2016 at 09:33:50AM +0100, Michael Kerrisk (man-pages) wrote:
>> Hi Serge,
>>
>> On 11/24/2016 11:52 PM, Serge E. Hallyn wrote:
>>> Quoting Michael Kerrisk (man-pages) ([email protected]):
>>
>> [...]
>>
>>>> Could we have a man-pages patch for this feature? Presumably for
>>>> user_namespaces(7) or capabilities(7).
>>>
>>> capabilities.7 doesn't actually mention anything about user namespaces
>>> right now.
>>
>> True. There's really just this:
>>
>> Interaction with user namespaces
>> For a discussion of the interaction of capabilities and user
>> namespaces, see user_namespaces(7).
>>
>>> I'll come up with a patch for both I think. Do you have a
>>> deadline for a new release coming up?
>>
>> No deadlines as such. The last couple of years, as a sort of
>> experiment, I've fallen into the same release cycle as the kernel
>> (typically making a release in the week or so after the kernel release),
>> and I am even using a similar numbering scheme. Ideally, the man-pages
>> patch would go into the release that corresponds to the kernel release
>> that makes the change.
>
> Cool - I'll write something up in the next few weeks.
Obviously, the sooner you write it, the sooner others may read--and
perhaps test--it.
Cheers,
Michael
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
Quoting Michael Kerrisk (man-pages) ([email protected]):
> On 11/25/2016 06:50 PM, Serge E. Hallyn wrote:
> > On Fri, Nov 25, 2016 at 09:33:50AM +0100, Michael Kerrisk (man-pages) wrote:
> >> Hi Serge,
> >>
> >> On 11/24/2016 11:52 PM, Serge E. Hallyn wrote:
> >>> Quoting Michael Kerrisk (man-pages) ([email protected]):
> >>
> >> [...]
> >>
> >>>> Could we have a man-pages patch for this feature? Presumably for
> >>>> user_namespaces(7) or capabilities(7).
> >>>
> >>> capabilities.7 doesn't actually mention anything about user namespaces
> >>> right now.
> >>
> >> True. There's really just this:
> >>
> >> Interaction with user namespaces
> >> For a discussion of the interaction of capabilities and user
> >> namespaces, see user_namespaces(7).
> >>
> >>> I'll come up with a patch for both I think. Do you have a
> >>> deadline for a new release coming up?
> >>
> >> No deadlines as such. The last couple of years, as a sort of
> >> experiment, I've fallen into the same release cycle as the kernel
> >> (typically making a release in the week or so after the kernel release),
> >> and I am even using a similar numbering scheme. Ideally, the man-pages
> >> patch would go into the release that corresponds to the kernel release
> >> that makes the change.
> >
> > Cool - I'll write something up in the next few weeks.
>
> Obviously, the sooner you write it, the sooner others may read--and
> perhaps test--it.
Hi,
first draft
https://git.kernel.org/cgit/linux/kernel/git/sergeh/man-pages.git/commit/?h=2016-11-29/nscaps
>From 62578b7cb2e0cbb100d1b29000de5657e9d998c4 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <[email protected]>
Date: Tue, 29 Nov 2016 15:25:37 -0600
Subject: [PATCH 1/1] Describe the new namespaced file capabilities.
---
man7/user_namespaces.7 | 35 +++++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
diff --git a/man7/user_namespaces.7 b/man7/user_namespaces.7
index 0c99df0..b1dd027 100644
--- a/man7/user_namespaces.7
+++ b/man7/user_namespaces.7
@@ -208,6 +208,41 @@ further removed descendant user namespaces as well.
.\"
.\" ============================================================
.\"
+.SS File capabilities in user namespaces
+Until v4.9, writing file capabilities required the writer to possess
+.BR CAP_SETFCAP
+targeted at the initial user namespace. In v4.10 a new version (v3) of the
+file capability extended attribute was introduced, which targets the
+capabilities at a namespace root userid. This means that a task executing the
+file will receive elevated privilege only if it is running in a namespace whose
+root is mapped to the specified target uid. If a task does not have
+.BR CAP_SETFCAP
+toward the user namespace which owns the filesystem hosting the file, then it
+can only write file capabilities targeted at uids mapped in the task's own
+namespace.
+
+As a detailed example, assume a user namespace where uid 0 is mapped to host
+uid 100000. Root in the container writes a file capability. If the file
+capability xattr is v2, then a v3 capability xattr targeted to 100000 will be
+written.
+
+If instead a v3 capability xattr is written, then the kernel will verify that
+the writer is privileged with
+.BR CAP_SETFCAP
+over its own namespace and that the file owner's uid and gid are mapped into
+the current task's namespace.
+
+The capability target uid which is written to disk is mapped into the
+filesystem's user namespace. Therefore, in the above example, if uid 0 in the
+namespace (100000 on the host) mounted the filesystem, the target uid value
+actually written will be converted back to 0 (the mapped value for host uid
+100000). In this case the mount will be treated as foreign for any tasks in
+the initial user namespace, so that the file capability (as well as setuid and
+setgid bits) will be ignored, preventing a leak of privilege.
+
+.\"
+.\" ============================================================
+.\"
.SS Effect of capabilities within a user namespace
Having a capability inside a user namespace
permits a process to perform operations (that require privilege)
--
2.7.4
"Serge E. Hallyn" <[email protected]> writes:
> Root in a user ns cannot be trusted to write a traditional
> security.capability xattr. If it were allowed to do so, then any
> unprivileged user on the host could map his own uid to root in a
> namespace, write the xattr, and execute the file with privilege on the
> host.
>
> This patch introduces v3 of the security.capability xattr. It builds a
> vfs_ns_cap_data struct by appending a uid_t rootid to struct
> vfs_cap_data. This is the absolute uid_t (i.e. the uid_t in
> init_user_ns) of the root id (uid 0 in a namespace) in whose namespaces
> the file capabilities may take effect.
>
> When a task in a user ns (which is privileged with CAP_SETFCAP toward
> that user_ns) asks to write v2 security.capability, the kernel will
> transparently rewrite the xattr as a v3 with the appropriate rootid.
> Subsequently, any task executing the file which has the noted kuid as
> its root uid, or which is in a descendent user_ns of such a user_ns,
> will run the file with capabilities.
>
> If a task writes a v3 security.capability, then it can provide a
> uid (valid within its own user namespace, over which it has CAP_SETFCAP)
> for the xattr. The kernel will translate that to the absolute uid, and
> write that to disk. After this, a task in the writer's namespace will
> not be able to use those capabilities, but a task in a namespace where
> the given uid is root will.
>
> Only a single security.capability xattr may be written. A task may
> overwrite the existing one so long as it was written by a user mapped
> into his own user_ns over which he has CAP_SETFCAP.
>
> This allows a simple setxattr to work, allows tar/untar to work, and
> allows us to tar in one namespace and untar in another while preserving
> the capability, without risking leaking privilege into a parent
> namespace.
Any chance of a singed-off-by?
Eric
On Thu, Dec 08, 2016 at 05:43:09PM +1300, Eric W. Biederman wrote:
> "Serge E. Hallyn" <[email protected]> writes:
>
> > Root in a user ns cannot be trusted to write a traditional
> > security.capability xattr. If it were allowed to do so, then any
> > unprivileged user on the host could map his own uid to root in a
> > namespace, write the xattr, and execute the file with privilege on the
> > host.
> >
> > This patch introduces v3 of the security.capability xattr. It builds a
> > vfs_ns_cap_data struct by appending a uid_t rootid to struct
> > vfs_cap_data. This is the absolute uid_t (i.e. the uid_t in
> > init_user_ns) of the root id (uid 0 in a namespace) in whose namespaces
> > the file capabilities may take effect.
> >
> > When a task in a user ns (which is privileged with CAP_SETFCAP toward
> > that user_ns) asks to write v2 security.capability, the kernel will
> > transparently rewrite the xattr as a v3 with the appropriate rootid.
> > Subsequently, any task executing the file which has the noted kuid as
> > its root uid, or which is in a descendent user_ns of such a user_ns,
> > will run the file with capabilities.
> >
> > If a task writes a v3 security.capability, then it can provide a
> > uid (valid within its own user namespace, over which it has CAP_SETFCAP)
> > for the xattr. The kernel will translate that to the absolute uid, and
> > write that to disk. After this, a task in the writer's namespace will
> > not be able to use those capabilities, but a task in a namespace where
> > the given uid is root will.
> >
> > Only a single security.capability xattr may be written. A task may
> > overwrite the existing one so long as it was written by a user mapped
> > into his own user_ns over which he has CAP_SETFCAP.
> >
> > This allows a simple setxattr to work, allows tar/untar to work, and
> > allows us to tar in one namespace and untar in another while preserving
> > the capability, without risking leaking privilege into a parent
> > namespace.
>
> Any chance of a singed-off-by?
Yes, sorry, St?phane had pointed out that I'd apparently forgotten to do
-s. Do you want me to resend the whole shebang, or does
Signed-off-by: Serge Hallyn <[email protected]>
suffice? (My previous iterations did have it fwiw so I don't think I could
legally disavow it now :)
"Serge E. Hallyn" <[email protected]> writes:
> On Thu, Dec 08, 2016 at 05:43:09PM +1300, Eric W. Biederman wrote:
>> "Serge E. Hallyn" <[email protected]> writes:
>>
>> > Root in a user ns cannot be trusted to write a traditional
>> > security.capability xattr. If it were allowed to do so, then any
>> > unprivileged user on the host could map his own uid to root in a
>> > namespace, write the xattr, and execute the file with privilege on the
>> > host.
>> >
>> > This patch introduces v3 of the security.capability xattr. It builds a
>> > vfs_ns_cap_data struct by appending a uid_t rootid to struct
>> > vfs_cap_data. This is the absolute uid_t (i.e. the uid_t in
>> > init_user_ns) of the root id (uid 0 in a namespace) in whose namespaces
>> > the file capabilities may take effect.
>> >
>> > When a task in a user ns (which is privileged with CAP_SETFCAP toward
>> > that user_ns) asks to write v2 security.capability, the kernel will
>> > transparently rewrite the xattr as a v3 with the appropriate rootid.
>> > Subsequently, any task executing the file which has the noted kuid as
>> > its root uid, or which is in a descendent user_ns of such a user_ns,
>> > will run the file with capabilities.
>> >
>> > If a task writes a v3 security.capability, then it can provide a
>> > uid (valid within its own user namespace, over which it has CAP_SETFCAP)
>> > for the xattr. The kernel will translate that to the absolute uid, and
>> > write that to disk. After this, a task in the writer's namespace will
>> > not be able to use those capabilities, but a task in a namespace where
>> > the given uid is root will.
>> >
>> > Only a single security.capability xattr may be written. A task may
>> > overwrite the existing one so long as it was written by a user mapped
>> > into his own user_ns over which he has CAP_SETFCAP.
>> >
>> > This allows a simple setxattr to work, allows tar/untar to work, and
>> > allows us to tar in one namespace and untar in another while preserving
>> > the capability, without risking leaking privilege into a parent
>> > namespace.
>>
>> Any chance of a singed-off-by?
>
> Yes, sorry, Stéphane had pointed out that I'd apparently forgotten to do
> -s. Do you want me to resend the whole shebang, or does
>
> Signed-off-by: Serge Hallyn <[email protected]>
>
> suffice? (My previous iterations did have it fwiw so I don't think I could
> legally disavow it now :)
That should be good enough. I just wanted to make certain it existed
somewhere.
The whole inode->i_op->getxattr reference was also a bit of a problem
as that method was removed in 4.9-rc1 but otherwise things are looking
reasonable.
Thank you,
Eric
"Serge E. Hallyn" <[email protected]> writes:
> On Thu, Dec 08, 2016 at 05:43:09PM +1300, Eric W. Biederman wrote:
>> "Serge E. Hallyn" <[email protected]> writes:
>> Any chance of a singed-off-by?
>
> Yes, sorry, Stéphane had pointed out that I'd apparently forgotten to do
> -s. Do you want me to resend the whole shebang, or does
>
> Signed-off-by: Serge Hallyn <[email protected]>
>
> suffice? (My previous iterations did have it fwiw so I don't think I could
> legally disavow it now :)
I was really hoping to get this in this for 4.10, but I am seeing a couple
of little things in my review. Comments referring to a non-existent v4
and a few other niggling little things so I am going to target this for
the next kernel release so there is time review. With a little luck I
can place this patch in my for-next tree just after the merge window
closes and 4.10-rc1 ships.
Eric
Quoting Eric W. Biederman ([email protected]):
> "Serge E. Hallyn" <[email protected]> writes:
>
> > On Thu, Dec 08, 2016 at 05:43:09PM +1300, Eric W. Biederman wrote:
> >> "Serge E. Hallyn" <[email protected]> writes:
>
> >> Any chance of a singed-off-by?
> >
> > Yes, sorry, St?phane had pointed out that I'd apparently forgotten to do
> > -s. Do you want me to resend the whole shebang, or does
> >
> > Signed-off-by: Serge Hallyn <[email protected]>
> >
> > suffice? (My previous iterations did have it fwiw so I don't think I could
> > legally disavow it now :)
>
> I was really hoping to get this in this for 4.10, but I am seeing a couple
> of little things in my review. Comments referring to a non-existent v4
> and a few other niggling little things so I am going to target this for
> the next kernel release so there is time review. With a little luck I
> can place this patch in my for-next tree just after the merge window
> closes and 4.10-rc1 ships.
Ok, thanks. This is not something I'd want to rush :)
"Serge E. Hallyn" <[email protected]> writes:
> Quoting Eric W. Biederman ([email protected]):
>> "Serge E. Hallyn" <[email protected]> writes:
>>
>> > On Thu, Dec 08, 2016 at 05:43:09PM +1300, Eric W. Biederman wrote:
>> >> "Serge E. Hallyn" <[email protected]> writes:
>>
>> >> Any chance of a singed-off-by?
>> >
>> > Yes, sorry, Stéphane had pointed out that I'd apparently forgotten to do
>> > -s. Do you want me to resend the whole shebang, or does
>> >
>> > Signed-off-by: Serge Hallyn <[email protected]>
>> >
>> > suffice? (My previous iterations did have it fwiw so I don't think I could
>> > legally disavow it now :)
>>
>> I was really hoping to get this in this for 4.10, but I am seeing a couple
>> of little things in my review. Comments referring to a non-existent v4
>> and a few other niggling little things so I am going to target this for
>> the next kernel release so there is time review. With a little luck I
>> can place this patch in my for-next tree just after the merge window
>> closes and 4.10-rc1 ships.
>
> Ok, thanks. This is not something I'd want to rush :)
Sure. This is just something we get merged.
Eric
[email protected] (Eric W. Biederman) writes:
> "Serge E. Hallyn" <[email protected]> writes:
>
>> Quoting Eric W. Biederman ([email protected]):
>>> "Serge E. Hallyn" <[email protected]> writes:
>>>
>>> > On Thu, Dec 08, 2016 at 05:43:09PM +1300, Eric W. Biederman wrote:
>>> >> "Serge E. Hallyn" <[email protected]> writes:
>>>
>>> >> Any chance of a singed-off-by?
>>> >
>>> > Yes, sorry, Stéphane had pointed out that I'd apparently forgotten to do
>>> > -s. Do you want me to resend the whole shebang, or does
>>> >
>>> > Signed-off-by: Serge Hallyn <[email protected]>
>>> >
>>> > suffice? (My previous iterations did have it fwiw so I don't think I could
>>> > legally disavow it now :)
>>>
>>> I was really hoping to get this in this for 4.10, but I am seeing a couple
>>> of little things in my review. Comments referring to a non-existent v4
>>> and a few other niggling little things so I am going to target this for
>>> the next kernel release so there is time review. With a little luck I
>>> can place this patch in my for-next tree just after the merge window
>>> closes and 4.10-rc1 ships.
>>
>> Ok, thanks. This is not something I'd want to rush :)
>
> Sure. This is just something we get merged.
By which I meant to say this is something we need to get merged, and
hopefully before all of the developers forget what is going on. Not
having this is clearly a pain point for people working with file
capabilities.
Eric