2015-11-30 22:44:00

by Serge Hallyn

[permalink] [raw]
Subject: [PATCH RFC] Introduce new security.nscapability xattr

A common way for daemons to run with minimal privilege is to start as root,
perhaps setuid-root, choose a desired capability set, set PR_SET_KEEPCAPS,
then change uid to non-root. A simpler way to achieve this is to set file
capabilities on a not-setuid-root binary. However, when installing a package
inside a (user-namespaced) container, packages cannot be installed with file
capabilities. For this reason, containers must install ping setuid-root.

To achieve this, we would need for containers to be able to request file
capabilities be added to a file without causing these to be honored in the
initial user namespace.

To this end, the patch below introduces a new capability xattr format. The
main enhancement over the existing security.capability xattr is that we
tag capability sets with a uid - the uid of the root user in the namespace
where the capabilities are set. The capabilities will be ignored in any
other namespace. The special case of uid == -1 (which must only ever be
able to be set by kuid 0) means use the capabilities in all namespaces.

An alternative format would use a pair of uids to indicate a range of rootids.
This would allow root in a user namespace with uids 100000-165536 mapped to
set the xattr once on a file, then launch nested containers wherein the file
could be used with privilege. That's not what this patch does, but would be
a trivial change if people think it would be worthwhile.

This patch does not actually address the real problem, which is setting the
xattrs from inside containers. For that, I think the best solution is to
add a pair of new system calls, setfcap and getfcap. Userspace would for
instance call fsetfcap(fd, cap_user_header_t, cap_user_data_t), to which
the kernel would, if not in init_user_ns, react by writing an appropriate
security.nscapability xattr.

The libcap2 library's cap_set_file/cap_get_file could be switched over
transparently to use this to hide its use from all callers.

Comments appreciated.

Note - In this patch, file capabilities only work for containers which have
a root uid defined. We may want to allow -1 uids to work in all
namespaces. There certainly would be uses for this, but I'm a bit unsettled
about the implications of allowing a program privilege in a container where
there is no uid with privilege. This needs more thought.

Signed-off-by: Serge Hallyn <[email protected]>
---
include/linux/capability.h | 15 +++++
include/uapi/linux/capability.h | 47 ++++++++++++++
include/uapi/linux/xattr.h | 3 +
security/commoncap.c | 135 ++++++++++++++++++++++++++++++++++++++--
4 files changed, 194 insertions(+), 6 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index af9f0b9..24ac18e 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,6 +13,7 @@
#define _LINUX_CAPABILITY_H

#include <uapi/linux/capability.h>
+#include <linux/uidgid.h>


#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -31,6 +32,20 @@ struct cpu_vfs_cap_data {
kernel_cap_t inheritable;
};

+struct cpu_vfs_ns_cap_data {
+ __u32 flags;
+ kuid_t rootid;
+ kernel_cap_t permitted;
+ kernel_cap_t inheritable;
+};
+
+struct cpu_vfs_ns_cap_header {
+ __u32 hdr_info;
+ struct cpu_vfs_ns_cap_data caps[0];
+};
+#define NS_CAPS_VERSION(x) (x & 0xFF)
+#define NS_CAPS_NCAPS(x) ( (x >> 8) & 0xFF )
+
#define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct))
#define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t))

diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 12c37a1..2211a33 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -62,10 +62,14 @@ typedef struct __user_cap_data_struct {
#define VFS_CAP_U32_2 2
#define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))

+/* version number for security.nscapability xattrs hdr->hdr_info */
+#define VFS_NS_CAP_REVISION 1
+
#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
#define VFS_CAP_U32 VFS_CAP_U32_2
#define VFS_CAP_REVISION VFS_CAP_REVISION_2

+
struct vfs_cap_data {
__le32 magic_etc; /* Little endian */
struct {
@@ -74,6 +78,49 @@ struct vfs_cap_data {
} data[VFS_CAP_U32];
};

+/*
+ * Q: do we want version in the header, or in the data?
+ * If it is in the header, then a container will need to
+ * make sure it is writing the same data.
+ *
+ * Actually, perhaps we simply do not support writing the
+ * xattr, we just use a new system call to get/set the fscap.
+ * The kernel can be in charge of watching the version numbers.
+ * After all, we can't allow the container to override the
+ * fscaps of the init ns.
+ *
+ * @flags currently only contains the effective bit. The
+ * other bits are reserved, and must be 0 at the moment.
+ * @rootid contains the kuid value of the root in the namespace
+ * for which this capability should be used. If -1, then this
+ * works for all namespaces. Only root in the initial ns can
+ * use this.
+ *
+ * Q: do we want to use a range instead? Then root in a container
+ * could allow one binary with one capability to be used by any
+ * nested containers.
+ */
+#define VFS_NS_CAP_EFFECTIVE 0x1
+struct vfs_ns_cap_data {
+ __le32 flags;
+ __le32 rootid;
+ struct {
+ __le32 permitted; /* Little endian */
+ __le32 inheritable; /* Little endian */
+ } data[VFS_CAP_U32];
+};
+
+/*
+ * 32-bit hdr_info contains
+ * 16 leftmost: reserved
+ * next 8: ncaps
+ * last 8: version
+ */
+struct vfs_ns_cap_header {
+ __le32 hdr_info;
+ /* ncaps * vfs_ns_cap_data */
+};
+
#ifndef __KERNEL__

/*
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 1590c49..67c80ab 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -68,6 +68,9 @@
#define XATTR_CAPS_SUFFIX "capability"
#define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX

+#define XATTR_NS_CAPS_SUFFIX "nscapability"
+#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX
+
#define XATTR_POSIX_ACL_ACCESS "posix_acl_access"
#define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
#define XATTR_POSIX_ACL_DEFAULT "posix_acl_default"
diff --git a/security/commoncap.c b/security/commoncap.c
index 1832cf7..c44edf3 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -308,6 +308,10 @@ int cap_inode_need_killpriv(struct dentry *dentry)
if (!inode->i_op->getxattr)
return 0;

+ error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0);
+ if (error > 0)
+ return 1;
+
error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
if (error <= 0)
return 0;
@@ -325,11 +329,17 @@ int cap_inode_need_killpriv(struct dentry *dentry)
int cap_inode_killpriv(struct dentry *dentry)
{
struct inode *inode = d_backing_inode(dentry);
+ int ret1, ret2;;

if (!inode->i_op->removexattr)
return 0;

- return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+ ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+ ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS);
+
+ if (ret1 != 0)
+ return ret1;
+ return ret2;
}

/*
@@ -433,6 +443,117 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
return 0;
}

+int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+{
+ struct inode *inode = d_backing_inode(dentry);
+ unsigned tocopy, i;
+ int ret = 0, size, expected;
+ unsigned len = 0;
+ struct vfs_ns_cap_header *hdr;
+ struct vfs_ns_cap_data *cap, *nscap = NULL;
+ __u16 ncaps, version;
+ __u32 hdr_info;
+ kuid_t current_root, caprootuid;
+
+ memset(cpu_caps, 0, sizeof(*cpu_caps));
+
+ if (!inode || !inode->i_op->getxattr)
+ return -ENODATA;
+
+ /* get the size */
+ size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS,
+ NULL, 0);
+ if (size == -ENODATA || size == -EOPNOTSUPP)
+ /* no data, that's ok */
+ return -ENODATA;
+ if (size < 0)
+ return size;
+ if (size < sizeof(struct cpu_vfs_ns_cap_header))
+ return -EINVAL;
+ if (size > sizeof(struct cpu_vfs_ns_cap_header) + 255 * sizeof(struct vfs_ns_cap_data))
+ return -EINVAL;
+ len = size;
+
+ hdr = kmalloc(len + 1, GFP_NOFS);
+ if (!hdr)
+ return -ENOMEM;
+
+ size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS, hdr,
+ len);
+ if (size < 0) {
+ ret = size;
+ goto out;
+ }
+
+ if (size != len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ hdr_info = le32_to_cpu(hdr->hdr_info);
+ version = NS_CAPS_VERSION(hdr_info);
+ ncaps = NS_CAPS_NCAPS(hdr_info);
+
+ if (version != VFS_NS_CAP_REVISION) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ expected = sizeof(*hdr) + ncaps * sizeof(*cap);
+ if (size != expected) {
+ ret = -EINVAL;
+ goto out;
+ }
+ tocopy = VFS_CAP_U32;
+
+ /* find an applicable entry */
+ /* a global entry (uid == -1) takes precedence */
+ current_root = make_kuid(current_user_ns(), 0);
+ if (!uid_valid(current_root)) {
+ /* no root user in this namespace; no capabilities */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0, cap = (void *) hdr + sizeof(*hdr); i < ncaps; cap += sizeof(*cap), i++) {
+ uid_t uid = le32_to_cpu(cap->rootid);
+ if (uid == -1) {
+ nscap = cap;
+ break;
+ }
+
+ caprootuid = make_kuid(&init_user_ns, uid);
+ if (uid_eq(caprootuid, current_root))
+ nscap = cap;
+ }
+
+ if (!nscap) {
+ /* nothing found for this namespace */
+ ret = -ENODATA;
+ goto out;
+ }
+
+ /* copy the entry */
+ CAP_FOR_EACH_U32(i) {
+ if (i >= tocopy)
+ break;
+ cpu_caps->permitted.cap[i] = le32_to_cpu(nscap->data[i].permitted);
+ cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap->data[i].inheritable);
+ }
+
+ cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
+ cpu_caps->magic_etc = VFS_CAP_REVISION_2;
+ if (nscap->flags & VFS_NS_CAP_EFFECTIVE)
+ cpu_caps->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE;
+
+out:
+ kfree(hdr);
+
+ return ret;
+}
+
/*
* Attempt to get the on-exec apply capability sets for an executable file from
* its xattrs and, if present, apply them to the proposed credentials being
@@ -451,11 +572,13 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
return 0;

- rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ if (rc == -ENODATA)
+ rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
if (rc < 0) {
if (rc == -EINVAL)
- printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
- __func__, rc, bprm->filename);
+ printk(KERN_NOTICE "Got EINVAL reading file caps for %s\n",
+ bprm->filename);
else if (rc == -ENODATA)
rc = 0;
goto out;
@@ -651,7 +774,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
int cap_inode_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (!strcmp(name, XATTR_NAME_CAPS)) {
+ if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
if (!capable(CAP_SETFCAP))
return -EPERM;
return 0;
@@ -677,7 +800,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
*/
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
- if (!strcmp(name, XATTR_NAME_CAPS)) {
+ if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
if (!capable(CAP_SETFCAP))
return -EPERM;
return 0;
--
2.5.0


2015-11-30 23:17:00

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH RFC] Introduce new security.nscapability xattr

"Serge E. Hallyn" <[email protected]> writes:

> A common way for daemons to run with minimal privilege is to start as root,
> perhaps setuid-root, choose a desired capability set, set PR_SET_KEEPCAPS,
> then change uid to non-root. A simpler way to achieve this is to set file
> capabilities on a not-setuid-root binary. However, when installing a package
> inside a (user-namespaced) container, packages cannot be installed with file
> capabilities. For this reason, containers must install ping setuid-root.

Don't ping sockets avoid that specific problem?

I expect the general case still holds.

> To achieve this, we would need for containers to be able to request file
> capabilities be added to a file without causing these to be honored in the
> initial user namespace.
>
> To this end, the patch below introduces a new capability xattr format. The
> main enhancement over the existing security.capability xattr is that we
> tag capability sets with a uid - the uid of the root user in the namespace
> where the capabilities are set. The capabilities will be ignored in any
> other namespace. The special case of uid == -1 (which must only ever be
> able to be set by kuid 0) means use the capabilities in all
> namespaces.

A quick comment on this.

We currently allow capabilities that have been gained to be valid in all
descendent user namespaces.

Applying this principle to the on-disk capabilities would make it so
that uid 0 would mean capabilities in all namespaces.

It might be worth it to introduce a fixed sized array with a length
parameter of perhaps 32 entries which is a path of root uids as seen by
the initial user namespace. That way the entire construction of the
user namespace could be verified. AKA verify the current user namespace
and the parent and the parent's parent, up to the user namespace the
current filesystem is mounted in. We would need to look at how much
space an xattr can use without causing filesystems a challenge, in order
to properly size such an array.

Given that uids are fundamentally flat, that might not be particularly
useful. If we add an alternative way of identifying user namespaces,
say a privileged operation that sets a uuid, then the complete path would
be more interesting.

> An alternative format would use a pair of uids to indicate a range of rootids.
> This would allow root in a user namespace with uids 100000-165536 mapped to
> set the xattr once on a file, then launch nested containers wherein the file
> could be used with privilege. That's not what this patch does, but would be
> a trivial change if people think it would be worthwhile.
>
> This patch does not actually address the real problem, which is setting the
> xattrs from inside containers. For that, I think the best solution is to
> add a pair of new system calls, setfcap and getfcap. Userspace would for
> instance call fsetfcap(fd, cap_user_header_t, cap_user_data_t), to which
> the kernel would, if not in init_user_ns, react by writing an appropriate
> security.nscapability xattr.

That feels hard to maintain, but you may be correct that we have a small
enough userspace that it would not be a problem.

Eric


> The libcap2 library's cap_set_file/cap_get_file could be switched over
> transparently to use this to hide its use from all callers.
>
> Comments appreciated.
>
> Note - In this patch, file capabilities only work for containers which have
> a root uid defined. We may want to allow -1 uids to work in all
> namespaces. There certainly would be uses for this, but I'm a bit unsettled
> about the implications of allowing a program privilege in a container where
> there is no uid with privilege. This needs more thought.
>
> Signed-off-by: Serge Hallyn <[email protected]>
> ---
> include/linux/capability.h | 15 +++++
> include/uapi/linux/capability.h | 47 ++++++++++++++
> include/uapi/linux/xattr.h | 3 +
> security/commoncap.c | 135 ++++++++++++++++++++++++++++++++++++++--
> 4 files changed, 194 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index af9f0b9..24ac18e 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -13,6 +13,7 @@
> #define _LINUX_CAPABILITY_H
>
> #include <uapi/linux/capability.h>
> +#include <linux/uidgid.h>
>
>
> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
> @@ -31,6 +32,20 @@ struct cpu_vfs_cap_data {
> kernel_cap_t inheritable;
> };
>
> +struct cpu_vfs_ns_cap_data {
> + __u32 flags;
> + kuid_t rootid;
> + kernel_cap_t permitted;
> + kernel_cap_t inheritable;
> +};
> +
> +struct cpu_vfs_ns_cap_header {
> + __u32 hdr_info;
> + struct cpu_vfs_ns_cap_data caps[0];
> +};
> +#define NS_CAPS_VERSION(x) (x & 0xFF)
> +#define NS_CAPS_NCAPS(x) ( (x >> 8) & 0xFF )
> +
> #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct))
> #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t))
>
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index 12c37a1..2211a33 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -62,10 +62,14 @@ typedef struct __user_cap_data_struct {
> #define VFS_CAP_U32_2 2
> #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
>
> +/* version number for security.nscapability xattrs hdr->hdr_info */
> +#define VFS_NS_CAP_REVISION 1
> +
> #define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
> #define VFS_CAP_U32 VFS_CAP_U32_2
> #define VFS_CAP_REVISION VFS_CAP_REVISION_2
>
> +
> struct vfs_cap_data {
> __le32 magic_etc; /* Little endian */
> struct {
> @@ -74,6 +78,49 @@ struct vfs_cap_data {
> } data[VFS_CAP_U32];
> };
>
> +/*
> + * Q: do we want version in the header, or in the data?
> + * If it is in the header, then a container will need to
> + * make sure it is writing the same data.
> + *
> + * Actually, perhaps we simply do not support writing the
> + * xattr, we just use a new system call to get/set the fscap.
> + * The kernel can be in charge of watching the version numbers.
> + * After all, we can't allow the container to override the
> + * fscaps of the init ns.
> + *
> + * @flags currently only contains the effective bit. The
> + * other bits are reserved, and must be 0 at the moment.
> + * @rootid contains the kuid value of the root in the namespace
> + * for which this capability should be used. If -1, then this
> + * works for all namespaces. Only root in the initial ns can
> + * use this.
> + *
> + * Q: do we want to use a range instead? Then root in a container
> + * could allow one binary with one capability to be used by any
> + * nested containers.
> + */
> +#define VFS_NS_CAP_EFFECTIVE 0x1
> +struct vfs_ns_cap_data {
> + __le32 flags;
> + __le32 rootid;
> + struct {
> + __le32 permitted; /* Little endian */
> + __le32 inheritable; /* Little endian */
> + } data[VFS_CAP_U32];
> +};
> +
> +/*
> + * 32-bit hdr_info contains
> + * 16 leftmost: reserved
> + * next 8: ncaps
> + * last 8: version
> + */
> +struct vfs_ns_cap_header {
> + __le32 hdr_info;
> + /* ncaps * vfs_ns_cap_data */
> +};
> +
> #ifndef __KERNEL__
>
> /*
> diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
> index 1590c49..67c80ab 100644
> --- a/include/uapi/linux/xattr.h
> +++ b/include/uapi/linux/xattr.h
> @@ -68,6 +68,9 @@
> #define XATTR_CAPS_SUFFIX "capability"
> #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
>
> +#define XATTR_NS_CAPS_SUFFIX "nscapability"
> +#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX
> +
> #define XATTR_POSIX_ACL_ACCESS "posix_acl_access"
> #define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
> #define XATTR_POSIX_ACL_DEFAULT "posix_acl_default"
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 1832cf7..c44edf3 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -308,6 +308,10 @@ int cap_inode_need_killpriv(struct dentry *dentry)
> if (!inode->i_op->getxattr)
> return 0;
>
> + error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0);
> + if (error > 0)
> + return 1;
> +
> error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
> if (error <= 0)
> return 0;
> @@ -325,11 +329,17 @@ int cap_inode_need_killpriv(struct dentry *dentry)
> int cap_inode_killpriv(struct dentry *dentry)
> {
> struct inode *inode = d_backing_inode(dentry);
> + int ret1, ret2;;
>
> if (!inode->i_op->removexattr)
> return 0;
>
> - return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
> + ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
> + ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS);
> +
> + if (ret1 != 0)
> + return ret1;
> + return ret2;
> }
>
> /*
> @@ -433,6 +443,117 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
> return 0;
> }
>
> +int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
> +{
> + struct inode *inode = d_backing_inode(dentry);
> + unsigned tocopy, i;
> + int ret = 0, size, expected;
> + unsigned len = 0;
> + struct vfs_ns_cap_header *hdr;
> + struct vfs_ns_cap_data *cap, *nscap = NULL;
> + __u16 ncaps, version;
> + __u32 hdr_info;
> + kuid_t current_root, caprootuid;
> +
> + memset(cpu_caps, 0, sizeof(*cpu_caps));
> +
> + if (!inode || !inode->i_op->getxattr)
> + return -ENODATA;
> +
> + /* get the size */
> + size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS,
> + NULL, 0);
> + if (size == -ENODATA || size == -EOPNOTSUPP)
> + /* no data, that's ok */
> + return -ENODATA;
> + if (size < 0)
> + return size;
> + if (size < sizeof(struct cpu_vfs_ns_cap_header))
> + return -EINVAL;
> + if (size > sizeof(struct cpu_vfs_ns_cap_header) + 255 * sizeof(struct vfs_ns_cap_data))
> + return -EINVAL;
> + len = size;
> +
> + hdr = kmalloc(len + 1, GFP_NOFS);
> + if (!hdr)
> + return -ENOMEM;
> +
> + size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS, hdr,
> + len);
> + if (size < 0) {
> + ret = size;
> + goto out;
> + }
> +
> + if (size != len) {
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + hdr_info = le32_to_cpu(hdr->hdr_info);
> + version = NS_CAPS_VERSION(hdr_info);
> + ncaps = NS_CAPS_NCAPS(hdr_info);
> +
> + if (version != VFS_NS_CAP_REVISION) {
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + expected = sizeof(*hdr) + ncaps * sizeof(*cap);
> + if (size != expected) {
> + ret = -EINVAL;
> + goto out;
> + }
> + tocopy = VFS_CAP_U32;
> +
> + /* find an applicable entry */
> + /* a global entry (uid == -1) takes precedence */
> + current_root = make_kuid(current_user_ns(), 0);
> + if (!uid_valid(current_root)) {
> + /* no root user in this namespace; no capabilities */
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + for (i = 0, cap = (void *) hdr + sizeof(*hdr); i < ncaps; cap += sizeof(*cap), i++) {
> + uid_t uid = le32_to_cpu(cap->rootid);
> + if (uid == -1) {
> + nscap = cap;
> + break;
> + }
> +
> + caprootuid = make_kuid(&init_user_ns, uid);
> + if (uid_eq(caprootuid, current_root))
> + nscap = cap;
> + }
> +
> + if (!nscap) {
> + /* nothing found for this namespace */
> + ret = -ENODATA;
> + goto out;
> + }
> +
> + /* copy the entry */
> + CAP_FOR_EACH_U32(i) {
> + if (i >= tocopy)
> + break;
> + cpu_caps->permitted.cap[i] = le32_to_cpu(nscap->data[i].permitted);
> + cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap->data[i].inheritable);
> + }
> +
> + cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
> + cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
> +
> + cpu_caps->magic_etc = VFS_CAP_REVISION_2;
> + if (nscap->flags & VFS_NS_CAP_EFFECTIVE)
> + cpu_caps->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE;
> +
> +out:
> + kfree(hdr);
> +
> + return ret;
> +}
> +
> /*
> * Attempt to get the on-exec apply capability sets for an executable file from
> * its xattrs and, if present, apply them to the proposed credentials being
> @@ -451,11 +572,13 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
> if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
> return 0;
>
> - rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
> + rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
> + if (rc == -ENODATA)
> + rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
> if (rc < 0) {
> if (rc == -EINVAL)
> - printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
> - __func__, rc, bprm->filename);
> + printk(KERN_NOTICE "Got EINVAL reading file caps for %s\n",
> + bprm->filename);
> else if (rc == -ENODATA)
> rc = 0;
> goto out;
> @@ -651,7 +774,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
> int cap_inode_setxattr(struct dentry *dentry, const char *name,
> const void *value, size_t size, int flags)
> {
> - if (!strcmp(name, XATTR_NAME_CAPS)) {
> + if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
> if (!capable(CAP_SETFCAP))
> return -EPERM;
> return 0;
> @@ -677,7 +800,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
> */
> int cap_inode_removexattr(struct dentry *dentry, const char *name)
> {
> - if (!strcmp(name, XATTR_NAME_CAPS)) {
> + if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
> if (!capable(CAP_SETFCAP))
> return -EPERM;
> return 0;

2015-12-01 03:51:54

by Serge Hallyn

[permalink] [raw]
Subject: Re: [PATCH RFC] Introduce new security.nscapability xattr

On Mon, Nov 30, 2015 at 05:08:34PM -0600, Eric W. Biederman wrote:
> "Serge E. Hallyn" <[email protected]> writes:
>
> > A common way for daemons to run with minimal privilege is to start as root,
> > perhaps setuid-root, choose a desired capability set, set PR_SET_KEEPCAPS,
> > then change uid to non-root. A simpler way to achieve this is to set file
> > capabilities on a not-setuid-root binary. However, when installing a package
> > inside a (user-namespaced) container, packages cannot be installed with file
> > capabilities. For this reason, containers must install ping setuid-root.
>
> Don't ping sockets avoid that specific problem?
>
> I expect the general case still holds.

Hah - yes, I guess I do have to update my 10 year old default example :)

> > To achieve this, we would need for containers to be able to request file
> > capabilities be added to a file without causing these to be honored in the
> > initial user namespace.
> >
> > To this end, the patch below introduces a new capability xattr format. The
> > main enhancement over the existing security.capability xattr is that we
> > tag capability sets with a uid - the uid of the root user in the namespace
> > where the capabilities are set. The capabilities will be ignored in any
> > other namespace. The special case of uid == -1 (which must only ever be
> > able to be set by kuid 0) means use the capabilities in all
> > namespaces.
>
> A quick comment on this.
>
> We currently allow capabilities that have been gained to be valid in all
> descendent user namespaces.
>
> Applying this principle to the on-disk capabilities would make it so
> that uid 0 would mean capabilities in all namespaces.
>
> It might be worth it to introduce a fixed sized array with a length
> parameter of perhaps 32 entries which is a path of root uids as seen by
> the initial user namespace. That way the entire construction of the
> user namespace could be verified. AKA verify the current user namespace
> and the parent and the parents parent. Up to the user namespace the

Hm, so if container b runs in container a, a has rootid 100000 and a
range of 200000, and b has root kuid 200000, range 65536, iiuc you're
suggesting that for a binary in container b we store [100000,200000] ?
I'm not sure that's helpful, though - uid 200000 in a user namespace
with 200000 mapped to root, is all powerful anyway. I was actually
thinking (with the uid ranges) of making the connection looser, not
tighter.

> current filesystem is mounted in. We would look at how much space
> allows an xattr to be stored without causing filesystems a challenge
> to properly size such an array.
>
> Given that uids are fundamentally flat that might not be particularly
> useful.

Right, I think that's the conclusion I've drawn above (if I'm not
misunderstanding you)

> If we add an alternative way of identifying user namespaces
> say a privileged operation that set a uuid, then the complete path would
> be more interesting.
>
> > An alternative format would use a pair of uids to indicate a range of rootids.
> > This would allow root in a user namespace with uids 100000-165536 mapped to
> > set the xattr once on a file, then launch nested containers wherein the file
> > could be used with privilege. That's not what this patch does, but would be
> > a trivial change if people think it would be worthwhile.
> >
> > This patch does not actually address the real problem, which is setting the
> > xattrs from inside containers. For that, I think the best solution is to
> > add a pair of new system calls, setfcap and getfcap. Userspace would for
> > instance call fsetfcap(fd, cap_user_header_t, cap_user_data_t), to which
> > the kernel would, if not in init_user_ns, react by writing an appropriate
> > security.nscapability xattr.
>
> That feels hard to maintain, but you may be correct that we have a small

Hard to maintain in which sense? Complicated for userspace software, or
becoming too complicated in the kernel's bprm-capabilities code?

> enough userspace that it would not be a problem.

Yeah I'm thinking we can hide it all behind libcap2. Unless we go with
uid ranges, in which case we'd need a way to expose that, but that would
be an optional extension, the sane default would be transparent, so no big
deal.

-serge

2015-12-04 20:21:24

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH RFC] Introduce new security.nscapability xattr

Quoting Eric W. Biederman ([email protected]):
> "Serge E. Hallyn" <[email protected]> writes:
>
> > A common way for daemons to run with minimal privilege is to start as root,
> > perhaps setuid-root, choose a desired capability set, set PR_SET_KEEPCAPS,
> > then change uid to non-root. A simpler way to achieve this is to set file
> > capabilities on a not-setuid-root binary. However, when installing a package
> > inside a (user-namespaced) container, packages cannot be installed with file
> > capabilities. For this reason, containers must install ping setuid-root.
>
> Don't ping sockets avoid that specific problem?
>
> I expect the general case still holds.
>
> > To achieve this, we would need for containers to be able to request file
> > capabilities be added to a file without causing these to be honored in the
> > initial user namespace.
> >
> > To this end, the patch below introduces a new capability xattr format. The
> > main enhancement over the existing security.capability xattr is that we
> > tag capability sets with a uid - the uid of the root user in the namespace
> > where the capabilities are set. The capabilities will be ignored in any
> > other namespace. The special case of uid == -1 (which must only ever be
> > able to be set by kuid 0) means use the capabilities in all
> > namespaces.

Really, since security.capability xattrs are currently honored in all
namespaces, this isn't really necessary - until and unless Seth's set
changes that.

>
> A quick comment on this.
>
> We currently allow capabilities that have been gained to be valid in all
> descendent user namespaces.
>
> Applying this principle to the on-disk capabilities would make it so
> that uid 0 would mean capabilities in all namespaces.
>
> It might be worth it to introduce a fixed sized array with a length
> parameter of perhaps 32 entries which is a path of root uids as seen by
> the initial user namespace. That way the entire construction of the
> user namespace could be verified. AKA verify the current user namespace
> and the parent and the parents parent. Up to the user namespace the
> current filesystem is mounted in. We would look at how much space
> allows an xattr to be stored without causing filesystems a challenge
> to properly size such an array.
>
> Given that uids are fundamentally flat that might not be particularly
> useful. If we add an alternative way of identifying user namespaces
> say a privileged operation that set a uuid, then the complete path would
> be more interesting.
>
> > An alternative format would use a pair of uids to indicate a range of rootids.
> > This would allow root in a user namespace with uids 100000-165536 mapped to
> > set the xattr once on a file, then launch nested containers wherein the file
> > could be used with privilege. That's not what this patch does, but would be
> > a trivial change if people think it would be worthwhile.
> >
> > This patch does not actually address the real problem, which is setting the
> > xattrs from inside containers. For that, I think the best solution is to
> > add a pair of new system calls, setfcap and getfcap. Userspace would for
> > instance call fsetfcap(fd, cap_user_header_t, cap_user_data_t), to which
> > the kernel would, if not in init_user_ns, react by writing an appropriate
> > security.nscapability xattr.
>
> That feels hard to maintain, but you may be correct that we have a small
> enough userspace that it would not be a problem.
>
> Eric
>
>
> > The libcap2 library's cap_set_file/cap_get_file could be switched over
> > transparently to use this to hide its use from all callers.
> >
> > Comments appreciated.
> >
> > Note - In this patch, file capabilities only work for containers which have
> > a root uid defined. We may want to allow -1 uids to work in all
> > namespaces. There certainly would be uses for this, but I'm a bit unsettled
> > about the implications of allowing a program privilege in a container where
> > there is no uid with privilege. This needs more thought.

So for actually enabling (user-namespaced) containers to use these, there
are a few possibilities that come to mind.

1. A new setfcap (/getfcap) syscall. Uses mapped uid 0 from
current_user_ns() to write a value in the security.nscapability xattr.
Userspace doesn't need to worry at all about namespace issues.

2. Just expect userspace to write a xattr; kernel checks that no values
are changed for any other namespaces. This could be a lot of parsing and
verifying in the kernel.

3. Switch the xattr scheme - instead of one security.nscapability xattr
with multiple entries, use security.nscapability.$(rootid). Now the
kernel only needs to verify that the $rootid is valid for the writing
task, and we don't need a new syscall. OTOH userspace needs to know
what it's doing. Of course we can still hide that behind libcap2's helpers.

Any opinions on which way seems best? 1 does seem cleanest (and supports
use of seccomp if we want to forbid its use by some containers), but
involves a new pair of syscalls. 2 seems to me to be right out, but
others might disagree...

-serge

2016-03-02 00:00:20

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH RFC] Introduce new security.nscapability xattr

On Mon, Feb 29, 2016 at 03:38:20PM -0600, Serge E. Hallyn wrote:
> On Fri, Jan 29, 2016 at 01:31:51AM -0600, Serge E. Hallyn wrote:
> > On Wed, Jan 27, 2016 at 04:36:02PM -0800, Andy Lutomirski wrote:
> > > On Wed, Jan 27, 2016 at 9:22 AM, Jann Horn <[email protected]> wrote:
> > > > I think it sounds good from a security perspective.
> > >
> > > I'm a bit late to the game, but I have a question: why should this be
> > > keyed to the *root* uid of the namespace in particular? Certainly if
> > > user foo trusts the cap bits on some file, then user foo might trust
> > > those caps to be exerted over any namespace that user foo owns, since
> > > user foo owns the namespace.
> >
> > ... Tying it to a kuid which represents the userns->owner of any
> > namespace in which the capability will be honored might be fine
> > with me. Is that what you mean? So if uid 1000 creates a userns
> > mapping uids 100000-200000, and 100000 in that container puts X=pe
> > on /bin/foo, uid 101000 in that container runs /bin/foo with privilege
> > X. Uid 101000 in someone else's container does not.
> >
> > Although, if I create two containers and provide them different
> > > uidmaps, it may well be because I want them segregated and want
> > > to minimize the chances of one container breaking out into the
> > other. This risks breaking that.
>
> Thinking differently now... I really want it to "just work" to tar
> and untar these. So I'm thinking of simply using the file owner
> as the uid. So to write a security.nscapability xattr, you must
> be uid 0 in the inode's namespace, the file must be owned by uid 0,
> and the capabilities in the xattr will be honored for any namespace
> where in that uid_t 0 is root.
>
> Does that sound overly restrictive? I expect file capabilities to
> be used on files owned by root but not setuid-root, so I think it
> is ok.
>
> -serge

Here is a working first draft:

>From 019ff81124b7dd3161414720f5666f6793a8ccd9 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <[email protected]>
Date: Tue, 1 Mar 2016 00:09:35 +0000
Subject: [PATCH 1/1] simplified security.nscapability xattr

This can only be set by root in his own namespace, and will
only be respected by namespaces with that same root kuid
mapped as root. The file must be owned by the root user in
the container. This allows us to avoid having to store a
'root user' value in the capability.

This allows a simple setxattr to work, allows tar/untar to
work, and allows us to tar in one namespace and untar in
another while preserving the capability, without risking
leaking privilege into a parent namespace.

Signed-off-by: Serge Hallyn <[email protected]>
---
include/linux/capability.h | 5 ++-
include/uapi/linux/capability.h | 18 +++++++++
include/uapi/linux/xattr.h | 3 ++
security/commoncap.c | 81 +++++++++++++++++++++++++++++++++++++++--
4 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index af9f0b9..19a37a9 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,7 +13,7 @@
#define _LINUX_CAPABILITY_H

#include <uapi/linux/capability.h>
-
+#include <linux/uidgid.h>

#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
@@ -31,6 +31,9 @@ struct cpu_vfs_cap_data {
kernel_cap_t inheritable;
};

+#define NS_CAPS_VERSION(x) (x & 0xFF)
+#define NS_CAPS_FLAGS(x) ((x >> 8) & 0xFF)
+
#define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct))
#define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t))

diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 12c37a1..f0b4a66 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -62,6 +62,9 @@ typedef struct __user_cap_data_struct {
#define VFS_CAP_U32_2 2
#define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))

+/* version number for security.nscapability xattrs, stored in magic_etc */
+#define VFS_NS_CAP_REVISION 1
+
#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
#define VFS_CAP_U32 VFS_CAP_U32_2
#define VFS_CAP_REVISION VFS_CAP_REVISION_2
@@ -74,6 +77,21 @@ struct vfs_cap_data {
} data[VFS_CAP_U32];
};

+#define VFS_NS_CAP_EFFECTIVE 0x1
+/*
+ * 32-bit magic_etc contains
+ * 16 leftmost: reserved
+ * next 8: flags (only VFS_NS_CAP_EFFECTIVE so far)
+ * last 8: version
+ */
+struct vfs_ns_cap_data {
+ __le32 magic_etc;
+ struct {
+ __le32 permitted; /* Little endian */
+ __le32 inheritable; /* Little endian */
+ } data[VFS_CAP_U32];
+};
+
#ifndef __KERNEL__

/*
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 1590c49..67c80ab 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -68,6 +68,9 @@
#define XATTR_CAPS_SUFFIX "capability"
#define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX

+#define XATTR_NS_CAPS_SUFFIX "nscapability"
+#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX
+
#define XATTR_POSIX_ACL_ACCESS "posix_acl_access"
#define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
#define XATTR_POSIX_ACL_DEFAULT "posix_acl_default"
diff --git a/security/commoncap.c b/security/commoncap.c
index 6f093f3..735d4c7 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -308,6 +308,10 @@ int cap_inode_need_killpriv(struct dentry *dentry)
if (!inode->i_op->getxattr)
return 0;

+ error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0);
+ if (error > 0)
+ return 1;
+
error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
if (error <= 0)
return 0;
@@ -325,11 +329,17 @@ int cap_inode_need_killpriv(struct dentry *dentry)
int cap_inode_killpriv(struct dentry *dentry)
{
struct inode *inode = d_backing_inode(dentry);
+ int ret1, ret2;

if (!inode->i_op->removexattr)
return 0;

- return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+ ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+ ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS);
+
+ if (ret1 != 0)
+ return ret1;
+ return ret2;
}

/*
@@ -433,6 +443,55 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
return 0;
}

+int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+{
+ struct inode *inode = d_backing_inode(dentry);
+ unsigned tocopy, i;
+ u32 magic_etc;
+ ssize_t size;
+ struct vfs_ns_cap_data nscap;
+
+ memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
+
+ if (!inode || !inode->i_op->getxattr)
+ return -ENODATA;
+
+ /* verify that userns root owns this file */
+ if (from_kuid(current_user_ns(), dentry->d_inode->i_uid) != 0)
+ return -ENODATA;
+
+ size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS,
+ &nscap, sizeof(nscap));
+ if (size == -ENODATA || size == -EOPNOTSUPP)
+ /* no data, that's ok */
+ return -ENODATA;
+ if (size < 0)
+ return size;
+ if (size != sizeof(nscap))
+ return -EINVAL;
+
+ magic_etc = le32_to_cpu(nscap.magic_etc);
+
+ if (NS_CAPS_VERSION(magic_etc) != VFS_NS_CAP_REVISION)
+ return -EINVAL;
+
+ cpu_caps->magic_etc = VFS_CAP_REVISION_2;
+ if (NS_CAPS_FLAGS(magic_etc) & VFS_NS_CAP_EFFECTIVE)
+ cpu_caps->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE;
+ /* copy the entry */
+ CAP_FOR_EACH_U32(i) {
+ if (i >= VFS_CAP_U32_2)
+ break;
+ cpu_caps->permitted.cap[i] = le32_to_cpu(nscap.data[i].permitted);
+ cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap.data[i].inheritable);
+ }
+
+ cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
+ return 0;
+}
+
/*
* Attempt to get the on-exec apply capability sets for an executable file from
* its xattrs and, if present, apply them to the proposed credentials being
@@ -453,11 +512,13 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns))
return 0;

- rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ if (rc == -ENODATA)
+ rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
if (rc < 0) {
if (rc == -EINVAL)
- printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
- __func__, rc, bprm->filename);
+ printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
+ bprm->filename);
else if (rc == -ENODATA)
rc = 0;
goto out;
@@ -661,6 +722,12 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
return 0;
}

+ if (!strcmp(name, XATTR_NAME_NS_CAPS)) {
+ if (!capable_wrt_inode_uidgid(dentry->d_inode, CAP_SETFCAP))
+ return -EPERM;
+ return 0;
+ }
+
if (!strncmp(name, XATTR_SECURITY_PREFIX,
sizeof(XATTR_SECURITY_PREFIX) - 1) &&
!ns_capable(user_ns, CAP_SYS_ADMIN))
@@ -689,6 +756,12 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
return 0;
}

+ if (!strcmp(name, XATTR_NAME_NS_CAPS)) {
+ if (!capable_wrt_inode_uidgid(dentry->d_inode, CAP_SETFCAP))
+ return -EPERM;
+ return 0;
+ }
+
if (!strncmp(name, XATTR_SECURITY_PREFIX,
sizeof(XATTR_SECURITY_PREFIX) - 1) &&
!ns_capable(user_ns, CAP_SYS_ADMIN))
--
2.7.0