To: linux-kernel@vger.kernel.org
Cc: torvalds@transmeta.com, viro@math.psu.edu
Subject: [PATCH][RFC] 2.5.44: Filesystem capabilities
From: Olaf Dietsche <olaf.dietsche#list.linux-kernel@t-online.de>
Date: Thu, 31 Oct 2002 01:02:06 +0100
Message-ID: <87znsvearl.fsf@goat.bogus.local>
User-Agent: Gnus/5.090005 (Oort Gnus v0.05) XEmacs/21.4 (Honest Recruiter,
 i386-debian-linux)
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 16513
Lines: 581

This patch resolves the last issue, access to the capabilities
file. Write access is granted for processes, which have the
CAP_SETFCAP capability. This is done by wrapping the inode_operations
for the root dir and the capability file with appropriate routines.

Although the fs capability patch is complete now, I'm not entirely
sure, wether I got everything right. Maybe the security and file
system experts may take a look at the code.

As always, I'm running this code myself right now.

Thanks for your time.

Regards, Olaf.

diff -urN a/fs/Config.help b/fs/Config.help
--- a/fs/Config.help	Wed Oct 16 09:23:41 2002
+++ b/fs/Config.help	Wed Oct 30 15:15:43 2002
@@ -1152,3 +1152,50 @@
 
   If unsure, say N.
 
+CONFIG_FS_CAPABILITIES
+  If you say Y here, you will be able to grant selective privileges to
+  executables on a needed basis. This means for some executables,
+  there is no need anymore to run as root or as a suid binary.
+
+  For example, you may drop the SUID bit from ping and grant the
+  CAP_NET_RAW capability:
+  # chmod u-s /bin/ping
+  # chcap cap_net_raw=ep /bin/ping
+
+  Another use would be to run system daemons with their own uid:
+  # chcap cap_net_bind_service=ei /usr/sbin/named
+  This sets the effective and inheritable capabilities of named.
+
+  In your startup script:
+  inhcaps cap_net_bind_service=i bind:bind /usr/sbin/named
+
+  This sets the inheritable set to CAP_NET_BIND_SERVICE, which is
+  needed in order to bind to port 53, and runs named as user bind
+  with group bind.
+
+  This allows running named with needed restricted privileges, if the
+  parent process (root) owns them already. When started by regular
+  users, named runs without any privileges.
+
+  For user space tools see:
+  <http://home.t-online.de/home/olaf.dietsche/linux/capability/>
+
+  For libcap and an alternative implementation, based on extended
+  attributes, see:
+  <http://www.kernel.org/pub/linux/libs/security/linux-privs/>
+
+  If you're unsure, say N.
+
+CONFIG_LIBC_ENABLE_SECURE_HACK
+  LD_PRELOAD is a glibc feature, which allows to override system
+  library functions. But this means also a security hole, through
+  which an attacker might gain unauthorized privileges. This is
+  already prevented for SUID and SGID binaries.
+
+  GNU libc doesn't know about filesystem capabilities yet and doesn't
+  disable LD_PRELOAD for privileged executables, which are not SUID or
+  SGID. This hack sets the group id to an invalid value and tricks GNU
+  libc into thinking, this is a SGID binary (unless this is already
+  SUID and/or SGID). However, this may break some programs.
+
+  If you're unsure, say Y.
diff -urN a/fs/Config.in b/fs/Config.in
--- a/fs/Config.in	Wed Oct 16 09:23:41 2002
+++ b/fs/Config.in	Wed Oct 30 00:32:32 2002
@@ -108,6 +108,9 @@
    define_bool CONFIG_QUOTACTL y
 fi
 
+dep_bool 'Filesystem capabilities (Experimental)' CONFIG_FS_CAPABILITIES $CONFIG_EXPERIMENTAL
+dep_bool '  Disable LD_PRELOAD on privileged executables' CONFIG_LIBC_ENABLE_SECURE_HACK $CONFIG_FS_CAPABILITIES
+
 if [ "$CONFIG_NET" = "y" ]; then
 
    mainmenu_option next_comment
diff -urN a/fs/Makefile b/fs/Makefile
--- a/fs/Makefile	Sat Oct 19 13:52:46 2002
+++ b/fs/Makefile	Thu Oct 24 00:11:51 2002
@@ -6,7 +6,7 @@
 # 
 
 export-objs :=	open.o dcache.o buffer.o bio.o inode.o dquot.o mpage.o aio.o \
-                fcntl.o read_write.o dcookies.o
+                fcntl.o read_write.o dcookies.o fscaps.o
 
 obj-y :=	open.o read_write.o devices.o file_table.o buffer.o \
 		bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \
@@ -41,7 +41,8 @@
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
- 
+obj-$(CONFIG_FS_CAPABILITIES)	+= fscaps.o
+
 # Do not add any filesystems before this line
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_JBD)		+= jbd/
diff -urN a/fs/attr.c b/fs/attr.c
--- a/fs/attr.c	Sat Oct  5 18:44:20 2002
+++ b/fs/attr.c	Mon Oct 28 14:52:02 2002
@@ -13,6 +13,7 @@
 #include <linux/fcntl.h>
 #include <linux/quotaops.h>
 #include <linux/security.h>
+#include <linux/fscaps.h>
 
 /* Taken over from the old code... */
 
@@ -170,6 +171,9 @@
 	}
 	if (!error) {
 		unsigned long dn_mask = setattr_mask(ia_valid);
+		if (ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+			fscap_drop(inode);
+
 		if (dn_mask)
 			dnotify_parent(dentry, dn_mask);
 	}
diff -urN a/fs/fscaps.c b/fs/fscaps.c
--- a/fs/fscaps.c	Thu Jan  1 01:00:00 1970
+++ b/fs/fscaps.c	Thu Oct 31 00:11:09 2002
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2002 Olaf Dietsche
+ *
+ * Filesystem capabilities for linux.
+ */
+
+#include <linux/fscaps.h>
+#include <linux/module.h>
+#include <linux/binfmts.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+struct fscap_info {
+	struct vfsmount *mnt;
+	struct dentry *dentry;
+	struct inode_operations rootdir_envelop;
+	struct inode_operations *rootdir_iops;
+	struct inode_operations cap_envelop;
+	struct inode_operations *cap_iops;
+};
+
+static char __capname[] = ".capabilities";
+
+static int __is_capname(const char *name)
+{
+	if (*name != __capname[0])
+		return 0;
+
+	return !strcmp(name, __capname);
+}
+
+static int __is_capentry(struct dentry *dentry)
+{
+	return dentry == dentry->d_sb->s_fscaps->dentry;
+}
+
+static int __cap_permission(struct inode *inode, int mask)
+{
+	struct inode_operations *iops;
+	if ((mask & MAY_WRITE) && !capable(CAP_SETFCAP))
+		return -EPERM;
+
+	iops = inode->i_sb->s_fscaps->cap_iops;
+	if (iops && iops->permission)
+		return iops->permission(inode, mask);
+
+	return vfs_permission(inode, mask);
+}
+
+static void __info_cap_init(struct fscap_info *info, struct dentry *dentry)
+{
+	struct inode *inode;
+	struct inode_operations *iops;
+	if (info->dentry) {
+		inode = info->dentry->d_inode;
+		if (inode)
+			inode->i_op = inode->i_sb->s_fscaps->cap_iops;
+
+		dput(info->dentry);
+	}
+
+	info->dentry = dget(dentry);
+	if (!dentry)
+		return;
+
+	inode = dentry->d_inode;
+	if (!inode) {
+		printk(KERN_WARNING "%s: negative dentry. Disabling capabilities on %s.\n", __FUNCTION__, info->mnt->mnt_mountpoint->d_name.name);
+		dput(info->dentry);
+		info->dentry = NULL;
+		return;
+	}
+
+	info->cap_iops = iops = inode->i_op;
+	memset(&info->cap_envelop, 0, sizeof(info->cap_envelop));
+	if (iops)
+		info->cap_envelop = *iops;
+
+	info->cap_envelop.permission = __cap_permission;
+	inode->i_op = &info->cap_envelop;
+}
+
+static int __rootdir_create(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode_operations *iops;
+	int err, iscapdb = __is_capname(dentry->d_name.name);
+	if (iscapdb && !capable(CAP_SETFCAP))
+		return -EPERM;
+
+	iops = dir->i_sb->s_fscaps->rootdir_iops;
+	err = iops->create(dir, dentry, mode);
+	if (!err && iscapdb)
+		__info_cap_init(dir->i_sb->s_fscaps, dentry);
+
+	return err;
+}
+
+static int __rootdir_link(struct dentry *old_dentry, struct inode *dir,
+			  struct dentry *new_dentry)
+{
+	struct inode_operations *iops;
+	int err, iscapdb = __is_capname(new_dentry->d_name.name);
+	if (iscapdb && !capable(CAP_SETFCAP))
+		return -EPERM;
+
+	iops = dir->i_sb->s_fscaps->rootdir_iops;
+	err = iops->link(old_dentry, dir, new_dentry);
+	if (!err && iscapdb)
+		__info_cap_init(dir->i_sb->s_fscaps, new_dentry);
+
+	return err;
+}
+
+static int __rootdir_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode_operations *iops;
+	int err, iscapdb = __is_capentry(dentry);
+	if (iscapdb && !capable(CAP_SETFCAP))
+		return -EPERM;
+
+	iops = dir->i_sb->s_fscaps->rootdir_iops;
+	err = iops->unlink(dir, dentry);
+	if (!err && iscapdb)
+		__info_cap_init(dir->i_sb->s_fscaps, NULL);
+
+	return err;
+}
+
+static int __rootdir_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+{
+	struct inode_operations *iops;
+	if (__is_capname(dentry->d_name.name))
+		return -EPERM;
+
+	iops = dir->i_sb->s_fscaps->rootdir_iops;
+	return iops->symlink(dir, dentry, oldname);
+}
+
+static int __rootdir_rename(struct inode *old_dir, struct dentry *old_dentry,
+			    struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode_operations *iops;
+	if (__is_capentry(old_dentry) || __is_capname(new_dentry->d_name.name))
+		return -EPERM;
+
+	iops = old_dir->i_sb->s_fscaps->rootdir_iops;
+	return iops->rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+static void __info_rootdir_init(struct fscap_info *info, struct inode *dir)
+{
+	struct inode_operations *iops = dir->i_op;
+	info->rootdir_iops = iops;
+	if (iops) {
+		info->rootdir_envelop = *iops;
+		info->rootdir_envelop.create = iops->create ? __rootdir_create : 0;
+		info->rootdir_envelop.link = iops->link ? __rootdir_link : 0;
+		info->rootdir_envelop.unlink = iops->unlink ? __rootdir_unlink : 0;
+		info->rootdir_envelop.symlink = iops->symlink ? __rootdir_symlink : 0;
+		info->rootdir_envelop.rename = iops->rename ? __rootdir_rename : 0;
+		dir->i_op = &info->rootdir_envelop;
+	}
+}
+
+static void __info_init(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct fscap_info *info = kmalloc(sizeof(struct fscap_info), GFP_KERNEL);
+	if (info) {
+		info->mnt = mnt;
+		info->dentry = NULL;
+		__info_rootdir_init(info, mnt->mnt_sb->s_root->d_inode);
+		__info_cap_init(info, dentry);
+	}
+
+	mnt->mnt_sb->s_fscaps = info;
+}
+
+static void __info_free(struct fscap_info *info)
+{
+	if (info) {
+		dput(info->dentry);
+		kfree(info);
+	}
+}
+
+static inline struct fscap_info *__info_lookup(struct super_block *sb) 
+{
+	return sb->s_fscaps;
+}
+
+static int __fscap_lookup(struct vfsmount *mnt, struct nameidata *nd)
+{
+	nd->mnt = mntget(mnt);
+	nd->dentry = dget(mnt->mnt_sb->s_root);
+	nd->flags = 0;
+	return path_walk(__capname, nd);
+}
+
+static struct file *__fscap_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
+{
+	if (mnt->mnt_flags & MNT_NOSUID)
+		return ERR_PTR(-EPERM);
+
+	dentry = dget(dentry);
+	mnt = mntget(mnt);
+	return dentry_open(dentry, mnt, flags);
+}
+
+static void __fscap_read(struct file *filp, struct linux_binprm *bprm)
+{
+	__u32 fscaps[3];
+	unsigned long ino = bprm->file->f_dentry->d_inode->i_ino;
+	int n = kernel_read(filp, ino * sizeof(fscaps), (char *) fscaps, sizeof(fscaps));
+	if (n == sizeof(fscaps)) {
+		bprm->cap_effective = fscaps[0];
+		bprm->cap_inheritable = fscaps[1];
+		bprm->cap_permitted = fscaps[2];
+	}
+}
+
+static int kernel_write(struct file *file, unsigned long offset,
+		 char *addr, unsigned long count)
+{
+	mm_segment_t old_fs;
+	loff_t pos = offset;
+	int result;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	result = vfs_write(file, addr, count, &pos);
+	set_fs(old_fs);
+	return result;
+}
+
+static void __fscap_drop(struct file *filp, struct inode *inode)
+{
+	__u32 fscaps[3];
+	unsigned long ino = inode->i_ino;
+	int n = kernel_read(filp, ino * sizeof(fscaps), (char *) fscaps, sizeof(fscaps));
+	if (n == sizeof(fscaps) && (fscaps[0] || fscaps[1] || fscaps[2])) {
+		fscaps[0] = fscaps[1] = fscaps[2] = 0;
+		kernel_write(filp, ino * sizeof(fscaps), (char *) fscaps, sizeof(fscaps));
+	}
+}
+
+void fscap_mount(struct vfsmount *mnt)
+{
+	struct nameidata nd;
+	if (__info_lookup(mnt->mnt_sb))
+		return;
+
+	if (__fscap_lookup(mnt, &nd))
+		return;
+
+	__info_init(mnt, nd.dentry);
+}
+
+void fscap_umount(struct super_block *sb)
+{
+	struct fscap_info *info = __info_lookup(sb);
+	__info_free(info);
+}
+
+void fscap_read(struct linux_binprm *bprm)
+{
+	struct file *filp;
+	struct fscap_info *info = __info_lookup(bprm->file->f_vfsmnt->mnt_sb);
+	if (!info || !info->dentry)
+		return;
+
+	filp = __fscap_open(info->dentry, info->mnt, O_RDONLY);
+	if (filp && !IS_ERR(filp)) {
+		__fscap_read(filp, bprm);
+		filp_close(filp, 0);
+	}
+}
+
+void fscap_drop(struct inode *inode)
+{
+	struct file *filp;
+	struct fscap_info *info = __info_lookup(inode->i_sb);
+	if (!info || !info->dentry)
+		return;
+
+	filp = __fscap_open(info->dentry, info->mnt, O_RDWR);
+	if (filp && !IS_ERR(filp)) {
+		__fscap_drop(filp, inode);
+		filp_close(filp, 0);
+	}
+}
+
+EXPORT_SYMBOL(fscap_mount);
+EXPORT_SYMBOL(fscap_umount);
+EXPORT_SYMBOL(fscap_read);
+EXPORT_SYMBOL(fscap_drop);
diff -urN a/fs/namespace.c b/fs/namespace.c
--- a/fs/namespace.c	Sat Oct  5 18:45:36 2002
+++ b/fs/namespace.c	Sun Oct 27 19:29:58 2002
@@ -19,6 +19,7 @@
 #include <linux/seq_file.h>
 #include <linux/namespace.h>
 #include <linux/namei.h>
+#include <linux/fscaps.h>
 
 #include <asm/uaccess.h>
 
@@ -340,6 +341,7 @@
 		lock_kernel();
 		DQUOT_OFF(sb);
 		acct_auto_close(sb);
+		fscap_umount(sb);
 		unlock_kernel();
 		security_ops->sb_umount_close(mnt);
 		spin_lock(&dcache_lock);
@@ -655,6 +657,8 @@
 
 	mnt->mnt_flags = mnt_flags;
 	err = graft_tree(mnt, nd);
+	if (!err)
+		fscap_mount(mnt);
 unlock:
 	up_write(&current->namespace->sem);
 	mntput(mnt);
diff -urN a/fs/open.c b/fs/open.c
--- a/fs/open.c	Wed Oct 16 09:23:41 2002
+++ b/fs/open.c	Mon Oct 28 14:52:55 2002
@@ -17,6 +17,7 @@
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
 #include <linux/security.h>
+#include <linux/fscaps.h>
 
 #include <asm/uaccess.h>
 
@@ -665,6 +666,9 @@
 				goto cleanup_all;
 		}
 	}
+
+	if (flags & O_CREAT)
+		fscap_drop(inode);
 
 	return f;
 
diff -urN a/fs/super.c b/fs/super.c
--- a/fs/super.c	Sat Oct  5 18:45:22 2002
+++ b/fs/super.c	Mon Oct 28 11:25:05 2002
@@ -72,6 +72,7 @@
 		s->s_maxbytes = MAX_NON_LFS;
 		s->dq_op = sb_dquot_ops;
 		s->s_qcop = sb_quotactl_ops;
+		s->s_fscaps = NULL;
 	}
 out:
 	return s;
diff -urN a/include/linux/capability.h b/include/linux/capability.h
--- a/include/linux/capability.h	Sat Oct  5 18:43:38 2002
+++ b/include/linux/capability.h	Wed Oct 30 13:15:07 2002
@@ -283,6 +283,10 @@
 
 #define CAP_LEASE            28
 
+/* Allow setting capabilities on files */
+
+#define CAP_SETFCAP          29
+
 #ifdef __KERNEL__
 /* 
  * Bounding set
diff -urN a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h	Sat Oct 19 13:52:47 2002
+++ b/include/linux/fs.h	Wed Oct 30 13:55:20 2002
@@ -665,6 +665,7 @@
 	struct block_device	*s_bdev;
 	struct list_head	s_instances;
 	struct quota_info	s_dquot;	/* Diskquota specific options */
+	struct fscap_info	*s_fscaps;	/* Filesystem capability stuff */
 
 	char s_id[32];				/* Informational name */
 
diff -urN a/include/linux/fscaps.h b/include/linux/fscaps.h
--- a/include/linux/fscaps.h	Thu Jan  1 01:00:00 1970
+++ b/include/linux/fscaps.h	Mon Oct 28 14:46:22 2002
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2002 Olaf Dietsche
+ *
+ * Filesystem capabilities for linux.
+ */
+
+#ifndef _LINUX_FS_CAPS_H
+#define _LINUX_FS_CAPS_H
+
+#include <linux/config.h>
+
+struct vfsmount;
+struct super_block;
+struct linux_binprm;
+struct inode;
+
+#if defined(CONFIG_FS_CAPABILITIES) || defined(CONFIG_FS_CAPABILITIES_MODULE)
+extern void fscap_mount(struct vfsmount *mnt);
+extern void fscap_umount(struct super_block *sb);
+extern void fscap_read(struct linux_binprm *bprm);
+extern void fscap_drop(struct inode *inode);
+#else	
+/* !CONFIG_FS_CAPABILITIES */
+static inline void fscap_mount(struct vfsmount *mnt) {}
+static inline void fscap_umount(struct super_block *sb) {}
+static inline void fscap_read(struct linux_binprm *bprm) {}
+static inline void fscap_drop(struct inode *inode) {}
+#endif
+
+#endif
diff -urN a/security/capability.c b/security/capability.c
--- a/security/capability.c	Sat Oct 19 13:52:48 2002
+++ b/security/capability.c	Wed Oct 30 13:48:13 2002
@@ -18,6 +18,7 @@
 #include <linux/smp_lock.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
+#include <linux/fscaps.h>
 
 /* flag to keep track of how we were registered */
 static int secondary;
@@ -119,11 +120,12 @@
 {
 	/* Copied from fs/exec.c:prepare_binprm. */
 
-	/* We don't have VFS support for capabilities yet */
 	cap_clear (bprm->cap_inheritable);
 	cap_clear (bprm->cap_permitted);
 	cap_clear (bprm->cap_effective);
 
+	fscap_read(bprm);
+
 	/*  To support inheritance of root-permissions and suid-root
 	 *  executables under compatibility mode, we raise all three
 	 *  capability sets for the file.
@@ -184,6 +186,10 @@
 							       cap_permitted);
 			}
 		}
+#ifdef CONFIG_LIBC_ENABLE_SECURE_HACK
+		if (current->euid == current->uid && current->egid == current->gid)
+			current->gid = -1;
+#endif
 		do_unlock = 1;
 	}
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/