Some filesystems forgo the VFS and may_open() and create their
own 'struct file's.
This patch creates a couple of helper functions which can be
used by these filesystems, and will provide a unified place
which the r/o bind mount code may patch.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/file_table.c | 36 ++++++++++++++++++++++++++++++++++++
lxc-dave/fs/hugetlbfs/inode.c | 22 +++++++++-------------
lxc-dave/include/linux/file.h | 8 ++++++++
lxc-dave/mm/shmem.c | 7 ++-----
lxc-dave/mm/tiny-shmem.c | 24 +++++++++---------------
lxc-dave/net/socket.c | 18 +++++++++---------
6 files changed, 73 insertions(+), 42 deletions(-)
diff -puN fs/file_table.c~01-24-filesystem-helpers-for-custom-struct-file-s fs/file_table.c
--- lxc/fs/file_table.c~01-24-filesystem-helpers-for-custom-struct-file-s 2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/fs/file_table.c 2007-02-09 14:26:46.000000000 -0800
@@ -140,6 +140,42 @@ fail:
EXPORT_SYMBOL(get_empty_filp);
+struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
+ mode_t mode, const struct file_operations *fop)
+{
+ struct file *file;
+
+ file = get_empty_filp();
+ if (!file)
+ return NULL;
+
+ init_file(file, mnt, dentry, mode, fop);
+ return file;
+}
+
+EXPORT_SYMBOL(alloc_file);
+
+/*
+ * Note: This is a crappy interface. It is here to make
+ * merging with the existing users of get_empty_filp()
+ * who have complex failure logic easier. All users
+ * of this should be moving to alloc_file().
+ */
+int init_file(struct file *file, struct vfsmount *mnt,
+ struct dentry *dentry, mode_t mode,
+ const struct file_operations *fop)
+{
+ int error = 0;
+ file->f_vfsmnt = mntget(mnt);
+ file->f_dentry = dentry;
+ file->f_mapping = dentry->d_inode->i_mapping;
+ file->f_mode = mode;
+ file->f_op = fop;
+ return error;
+}
+
+EXPORT_SYMBOL(init_file);
+
void fastcall fput(struct file *file)
{
if (atomic_dec_and_test(&file->f_count))
diff -puN fs/hugetlbfs/inode.c~01-24-filesystem-helpers-for-custom-struct-file-s fs/hugetlbfs/inode.c
--- lxc/fs/hugetlbfs/inode.c~01-24-filesystem-helpers-for-custom-struct-file-s 2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/fs/hugetlbfs/inode.c 2007-02-09 14:26:46.000000000 -0800
@@ -756,16 +756,11 @@ struct file *hugetlb_zero_setup(size_t s
if (!dentry)
goto out_shm_unlock;
- error = -ENFILE;
- file = get_empty_filp();
- if (!file)
- goto out_dentry;
-
error = -ENOSPC;
inode = hugetlbfs_get_inode(root->d_sb, current->fsuid,
current->fsgid, S_IFREG | S_IRWXUGO, 0);
if (!inode)
- goto out_file;
+ goto out_dentry;
error = -ENOMEM;
if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
@@ -774,17 +769,18 @@ struct file *hugetlb_zero_setup(size_t s
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0;
- file->f_path.mnt = mntget(hugetlbfs_vfsmount);
- file->f_path.dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &hugetlbfs_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
+
+ error = -ENFILE;
+ file = alloc_file(hugetlbfs_vfsmount, dentry,
+ FMODE_WRITE | FMODE_READ,
+ &hugetlbfs_file_operations);
+ if (!file)
+ goto out_inode;
+
return file;
out_inode:
iput(inode);
-out_file:
- put_filp(file);
out_dentry:
dput(dentry);
out_shm_unlock:
diff -puN include/linux/file.h~01-24-filesystem-helpers-for-custom-struct-file-s include/linux/file.h
--- lxc/include/linux/file.h~01-24-filesystem-helpers-for-custom-struct-file-s 2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/include/linux/file.h 2007-02-09 14:26:46.000000000 -0800
@@ -62,6 +62,14 @@ extern struct kmem_cache *filp_cachep;
extern void FASTCALL(__fput(struct file *));
extern void FASTCALL(fput(struct file *));
+struct file_operations;
+struct vfsmount;
+struct dentry;
+extern int init_file(struct file *, struct vfsmount *, struct dentry *dentry,
+ mode_t mode, const struct file_operations *fop);
+extern struct file *alloc_file(struct vfsmount *, struct dentry *dentry,
+ mode_t mode, const struct file_operations *fop);
+
static inline void fput_light(struct file *file, int fput_needed)
{
if (unlikely(fput_needed))
diff -puN mm/shmem.c~01-24-filesystem-helpers-for-custom-struct-file-s mm/shmem.c
--- lxc/mm/shmem.c~01-24-filesystem-helpers-for-custom-struct-file-s 2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/mm/shmem.c 2007-02-09 14:26:46.000000000 -0800
@@ -2497,11 +2497,8 @@ struct file *shmem_file_setup(char *name
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
- file->f_path.mnt = mntget(shm_mnt);
- file->f_path.dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &shmem_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
+ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ &shmem_file_operations);
return file;
close_file:
diff -puN mm/tiny-shmem.c~01-24-filesystem-helpers-for-custom-struct-file-s mm/tiny-shmem.c
--- lxc/mm/tiny-shmem.c~01-24-filesystem-helpers-for-custom-struct-file-s 2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/mm/tiny-shmem.c 2007-02-09 14:26:46.000000000 -0800
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name
if (!dentry)
goto put_memory;
- error = -ENFILE;
- file = get_empty_filp();
- if (!file)
- goto put_dentry;
-
error = -ENOSPC;
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
if (!inode)
- goto close_file;
+ goto put_dentry;
d_instantiate(dentry, inode);
- inode->i_nlink = 0; /* It is unlinked */
+ error = -ENFILE;
+ file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ &ramfs_file_operations);
+ if (!file)
+ goto put_inode;
- file->f_path.mnt = mntget(shm_mnt);
- file->f_path.dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &ramfs_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
+ inode->i_nlink = 0; /* It is unlinked */
/* notify everyone as to the change of file size */
error = do_truncate(dentry, size, 0, file);
@@ -91,9 +86,8 @@ struct file *shmem_file_setup(char *name
goto close_file;
return file;
-
-close_file:
- put_filp(file);
+put_inode:
+ iput(inode);
put_dentry:
dput(dentry);
put_memory:
diff -puN net/socket.c~01-24-filesystem-helpers-for-custom-struct-file-s net/socket.c
--- lxc/net/socket.c~01-24-filesystem-helpers-for-custom-struct-file-s 2007-02-09 14:26:46.000000000 -0800
+++ lxc-dave/net/socket.c 2007-02-09 14:26:46.000000000 -0800
@@ -355,6 +355,7 @@ static int sock_alloc_fd(struct file **f
static int sock_attach_fd(struct socket *sock, struct file *file)
{
+ struct dentry *dentry;
struct qstr this;
char name[32];
@@ -362,24 +363,23 @@ static int sock_attach_fd(struct socket
this.name = name;
this.hash = 0;
- file->f_path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
- if (unlikely(!file->f_path.dentry))
+ dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
+ if (unlikely(!dentry))
return -ENOMEM;
- file->f_path.dentry->d_op = &sockfs_dentry_operations;
+ dentry->d_op = &sockfs_dentry_operations;
/*
* We dont want to push this dentry into global dentry hash table.
* We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
* This permits a working /proc/$pid/fd/XXX on sockets
*/
- file->f_path.dentry->d_flags &= ~DCACHE_UNHASHED;
- d_instantiate(file->f_path.dentry, SOCK_INODE(sock));
- file->f_path.mnt = mntget(sock_mnt);
- file->f_mapping = file->f_path.dentry->d_inode->i_mapping;
-
+ dentry->d_flags &= ~DCACHE_UNHASHED;
+ d_instantiate(dentry, SOCK_INODE(sock));
+ init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
+ &socket_file_ops);
+ SOCK_INODE(sock)->i_fop = &socket_file_ops;
sock->file = file;
file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
- file->f_mode = FMODE_READ | FMODE_WRITE;
file->f_flags = O_RDWR;
file->f_pos = 0;
file->private_data = sock;
_
There are a number of filesystems that do iput()s without first
having messed with i_nlink. In order to keep from accidentally
decrementing the superblock writer count for these, we record
when the count is bumped up, so that we can properly balance
it.
I first tried to do this by assuming that, for each drop_nlink() to
zero, there was exactly one call to iput_final(). But, there are
a number of cases where this isn't true, especially in error handling
code. Even if all of the filesystems were fixed up, it would be simple
to reintroduce new bugs imbalancing the mnt writer count. This patch
trades that possibility for the chance that we will miss an i_nlink--,
and not bump the sb writer count.
I like the idea of screwing up the writeout of a single inode better than
screwing up a global superblock count in a way that will affect
all inodes on the superblock.
Also, since this is the first non-trivial use of the inc/drop_nlink()
functions, add some kernel docs for them.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/inode.c | 7 +++++
lxc-dave/fs/libfs.c | 1
lxc-dave/include/linux/fs.h | 58 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 66 insertions(+)
diff -puN fs/inode.c~04-24-record-when-sb-writer-count-elevated-for-inode fs/inode.c
--- lxc/fs/inode.c~04-24-record-when-sb-writer-count-elevated-for-inode 2007-02-09 14:26:48.000000000 -0800
+++ lxc-dave/fs/inode.c 2007-02-09 14:26:48.000000000 -0800
@@ -1097,10 +1097,17 @@ static inline void iput_final(struct ino
{
const struct super_operations *op = inode->i_sb->s_op;
void (*drop)(struct inode *) = generic_drop_inode;
+ int must_drop_sb_write = (inode->i_state & I_AWAITING_FINAL_IPUT);
+ struct super_block *sb = inode->i_sb;
if (op && op->drop_inode)
drop = op->drop_inode;
drop(inode);
+ if (must_drop_sb_write) {
+ spin_lock(&sb->s_mnt_writers_lock);
+ sb->s_writers--;
+ spin_unlock(&sb->s_mnt_writers_lock);
+ }
}
/**
diff -puN fs/libfs.c~04-24-record-when-sb-writer-count-elevated-for-inode fs/libfs.c
--- lxc/fs/libfs.c~04-24-record-when-sb-writer-count-elevated-for-inode 2007-02-09 14:26:48.000000000 -0800
+++ lxc-dave/fs/libfs.c 2007-02-09 14:26:48.000000000 -0800
@@ -388,6 +388,7 @@ int simple_fill_super(struct super_block
* because the root inode is 1, the files array must not contain an
* entry at index 1
*/
+ inode->i_state |= I_AWAITING_FINAL_IPUT;
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
inode->i_uid = inode->i_gid = 0;
diff -puN include/linux/fs.h~04-24-record-when-sb-writer-count-elevated-for-inode include/linux/fs.h
--- lxc/include/linux/fs.h~04-24-record-when-sb-writer-count-elevated-for-inode 2007-02-09 14:26:48.000000000 -0800
+++ lxc-dave/include/linux/fs.h 2007-02-09 14:26:48.000000000 -0800
@@ -1230,6 +1230,7 @@ struct super_operations {
#define I_CLEAR 32
#define I_NEW 64
#define I_WILL_FREE 128
+#define I_AWAITING_FINAL_IPUT 256
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
@@ -1244,6 +1245,14 @@ static inline void mark_inode_dirty_sync
__mark_inode_dirty(inode, I_DIRTY_SYNC);
}
+/**
+ * inc_nlink - directly increment an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink. Currently,
+ * it is only here for parity with drop_nlink().
+ */
static inline void inc_nlink(struct inode *inode)
{
inode->i_nlink++;
@@ -1255,14 +1264,63 @@ static inline void inode_inc_link_count(
mark_inode_dirty(inode);
}
+/**
+ * check_nlink - check an inode's status after direct
+ * i_nlink modification.
+ * @inode: inode
+ *
+ * Some filesystems can not make simple incremental changes
+ * to i_nlink, most notably clustered ones. They must do
+ * direct manipulation of i_nlink. This function must be
+ * called after such modifications are complete to make
+ * sure that the VFS knows that the inode is going to go
+ * away.
+ */
+static inline void check_nlink(struct inode *inode)
+{
+ if (inode->i_nlink)
+ return;
+
+ inode->i_state |= I_AWAITING_FINAL_IPUT;
+ spin_lock(&inode->i_sb->s_mnt_writers_lock);
+ inode->i_sb->s_writers++;
+ spin_unlock(&inode->i_sb->s_mnt_writers_lock);
+}
+
+/**
+ * drop_nlink - directly drop an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink. In cases
+ * where we are attempting to track writes to the
+ * filesystem, a decrement to zero means an imminent
+ * write when the file is truncated and actually unlinked
+ * on the filesystem.
+ */
static inline void drop_nlink(struct inode *inode)
{
inode->i_nlink--;
+ check_nlink(inode);
}
+/**
+ * clear_nlink - directly zero an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink. See
+ * drop_nlink() for why we care about i_nlink hitting zero.
+ *
+ * Note that we could do the i_state flag directly in here,
+ * but we call check_nlink() to keep the number of places
+ * where the flag is set to exactly one. The compiler
+ * should get rid of the superfluous i_nlink check.
+ */
static inline void clear_nlink(struct inode *inode)
{
inode->i_nlink = 0;
+ check_nlink(inode);
}
static inline void inode_dec_link_count(struct inode *inode)
_
Originally from: Herbert Poetzl <[email protected]>
This is the core of the read-only bind mount patch set.
Note that this does _not_ add a "ro" option directly to
the bind mount operation. If you require such a mount,
you must first do the bind, then follow it up with a
'mount -o remount,ro' operation.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namespace.c | 24 ++++++++++++++++++++++--
lxc-dave/fs/open.c | 2 +-
2 files changed, 23 insertions(+), 3 deletions(-)
diff -puN fs/namespace.c~23-24-honor-r-w-changes-at-do-remount-time fs/namespace.c
--- lxc/fs/namespace.c~23-24-honor-r-w-changes-at-do-remount-time 2007-02-09 14:27:00.000000000 -0800
+++ lxc-dave/fs/namespace.c 2007-02-09 14:27:00.000000000 -0800
@@ -443,7 +443,7 @@ static int show_vfsmnt(struct seq_file *
seq_path(m, mnt, mnt->mnt_root, " \t\n\\");
seq_putc(m, ' ');
mangle(m, mnt->mnt_sb->s_type->name);
- seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+ seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
if (mnt->mnt_sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
@@ -1017,6 +1017,23 @@ out:
return err;
}
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+ int error = 0;
+ int readonly_request = 0;
+
+ if (ms_flags & MS_RDONLY)
+ readonly_request = 1;
+ if (readonly_request == __mnt_is_readonly(mnt))
+ return 0;
+
+ if (readonly_request)
+ error = mnt_make_readonly(mnt);
+ else
+ __mnt_unmake_readonly(mnt);
+ return error;
+}
+
/*
* change filesystem flags. dir should be a physical root of filesystem.
* If you've mounted a non-root directory somewhere and want to do remount
@@ -1038,7 +1055,10 @@ static int do_remount(struct nameidata *
return -EINVAL;
down_write(&sb->s_umount);
- err = do_remount_sb(sb, flags, data, 0);
+ if (flags & MS_BIND)
+ err = change_mount_flags(nd->mnt, flags);
+ else
+ err = do_remount_sb(sb, flags, data, 0);
if (!err)
nd->mnt->mnt_flags = mnt_flags;
up_write(&sb->s_umount);
diff -puN fs/open.c~23-24-honor-r-w-changes-at-do-remount-time fs/open.c
--- lxc/fs/open.c~23-24-honor-r-w-changes-at-do-remount-time 2007-02-09 14:27:00.000000000 -0800
+++ lxc-dave/fs/open.c 2007-02-09 14:27:00.000000000 -0800
@@ -401,7 +401,7 @@ asmlinkage long sys_faccessat(int dfd, c
special_file(nd.dentry->d_inode->i_mode))
goto out_path_release;
- if(IS_RDONLY(nd.dentry->d_inode))
+ if(__mnt_is_readonly(nd.mnt) || IS_RDONLY(nd.dentry->d_inode))
res = -EROFS;
out_path_release:
_
elevate mnt writers for callers of vfs_mkdir()
Pretty self-explanatory. Fits in with the rest of the series.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namei.c | 5 +++++
lxc-dave/fs/nfsd/nfs4recover.c | 4 ++++
2 files changed, 9 insertions(+)
diff -puN fs/namei.c~07-24-elevate-mnt-writers-for-callers-of-vfs-mkdir fs/namei.c
--- lxc/fs/namei.c~07-24-elevate-mnt-writers-for-callers-of-vfs-mkdir 2007-02-09 14:26:49.000000000 -0800
+++ lxc-dave/fs/namei.c 2007-02-09 14:26:49.000000000 -0800
@@ -1963,7 +1963,12 @@ asmlinkage long sys_mkdirat(int dfd, con
if (!IS_POSIXACL(nd.dentry->d_inode))
mode &= ~current->fs->umask;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_dput;
error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
+ mnt_drop_write(nd.mnt);
+out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
diff -puN fs/nfsd/nfs4recover.c~07-24-elevate-mnt-writers-for-callers-of-vfs-mkdir fs/nfsd/nfs4recover.c
--- lxc/fs/nfsd/nfs4recover.c~07-24-elevate-mnt-writers-for-callers-of-vfs-mkdir 2007-02-09 14:26:49.000000000 -0800
+++ lxc-dave/fs/nfsd/nfs4recover.c 2007-02-09 14:26:49.000000000 -0800
@@ -156,7 +156,11 @@ nfsd4_create_clid_dir(struct nfs4_client
dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
goto out_put;
}
+ status = mnt_want_write(rec_dir.mnt);
+ if (status)
+ goto out_put;
status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
+ mnt_drop_write(rec_dir.mnt);
out_put:
dput(dentry);
out_unlock:
_
This area of code is currently #ifdef'd out, so add a comment
for the time when it is actually used.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namespace.c | 4 ++++
1 file changed, 4 insertions(+)
diff -puN fs/namespace.c~11-24-mount-is-safe-add-comment fs/namespace.c
--- lxc/fs/namespace.c~11-24-mount-is-safe-add-comment 2007-02-09 14:26:52.000000000 -0800
+++ lxc-dave/fs/namespace.c 2007-02-09 14:26:52.000000000 -0800
@@ -744,6 +744,10 @@ static int mount_is_safe(struct nameidat
if (current->uid != nd->dentry->d_inode->i_uid)
return -EPERM;
}
+ /*
+ * We will eventually check for the mnt->writer_count here,
+ * but since the code is not used now, skip it - Dave Hansen
+ */
if (vfs_permission(nd, MAY_WRITE))
return -EPERM;
return 0;
_
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/net/unix/af_unix.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff -puN net/unix/af_unix.c~12-24-unix-find-other-elevate-write-count-for-touch-atime net/unix/af_unix.c
--- lxc/net/unix/af_unix.c~12-24-unix-find-other-elevate-write-count-for-touch-atime 2007-02-09 14:26:52.000000000 -0800
+++ lxc-dave/net/unix/af_unix.c 2007-02-09 14:26:52.000000000 -0800
@@ -703,21 +703,27 @@ static struct sock *unix_find_other(stru
err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
if (err)
goto fail;
+
+ err = mnt_want_write(nd.mnt);
+ if (err)
+ goto put_path_fail;
+
err = vfs_permission(&nd, MAY_WRITE);
if (err)
- goto put_fail;
+ goto mnt_drop_write_fail;
err = -ECONNREFUSED;
if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
- goto put_fail;
+ goto mnt_drop_write_fail;
u=unix_find_socket_byinode(nd.dentry->d_inode);
if (!u)
- goto put_fail;
+ goto mnt_drop_write_fail;
if (u->sk_type == type)
touch_atime(nd.mnt, nd.dentry);
path_release(&nd);
+ mnt_drop_write(nd.mnt);
err=-EPROTOTYPE;
if (u->sk_type != type) {
@@ -737,7 +743,9 @@ static struct sock *unix_find_other(stru
}
return u;
-put_fail:
+mnt_drop_write_fail:
+ mnt_drop_write(nd.mnt);
+put_path_fail:
path_release(&nd);
fail:
*error=err;
_
This does create a little helper in the NFS code to
make an if() a little bit less ugly.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namei.c | 4 ++++
lxc-dave/fs/nfsd/vfs.c | 23 +++++++++++++++++++----
2 files changed, 23 insertions(+), 4 deletions(-)
diff -puN fs/namei.c~13-24-elevate-write-count-over-calls-to-vfs-rename fs/namei.c
--- lxc/fs/namei.c~13-24-elevate-write-count-over-calls-to-vfs-rename 2007-02-09 14:26:53.000000000 -0800
+++ lxc-dave/fs/namei.c 2007-02-09 14:26:53.000000000 -0800
@@ -2567,8 +2567,12 @@ static int do_rename(int olddfd, const c
if (new_dentry == trap)
goto exit5;
+ error = mnt_want_write(oldnd.mnt);
+ if (error)
+ goto exit5;
error = vfs_rename(old_dir->d_inode, old_dentry,
new_dir->d_inode, new_dentry);
+ mnt_drop_write(oldnd.mnt);
exit5:
dput(new_dentry);
exit4:
diff -puN fs/nfsd/vfs.c~13-24-elevate-write-count-over-calls-to-vfs-rename fs/nfsd/vfs.c
--- lxc/fs/nfsd/vfs.c~13-24-elevate-write-count-over-calls-to-vfs-rename 2007-02-09 14:26:53.000000000 -0800
+++ lxc-dave/fs/nfsd/vfs.c 2007-02-09 14:26:53.000000000 -0800
@@ -1555,6 +1555,14 @@ out_nfserr:
goto out_unlock;
}
+static inline int svc_msnfs(struct svc_fh *ffhp)
+{
+#ifdef MSNFS
+ return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
+#else
+ return 0;
+#endif
+}
/*
* Rename a file
* N.B. After this call _both_ ffhp and tfhp need an fh_put
@@ -1616,13 +1624,20 @@ nfsd_rename(struct svc_rqst *rqstp, stru
if (ndentry == trap)
goto out_dput_new;
-#ifdef MSNFS
- if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+ if (svc_msnfs(ffhp) &&
((atomic_read(&odentry->d_count) > 1)
|| (atomic_read(&ndentry->d_count) > 1))) {
host_err = -EPERM;
- } else
-#endif
+ goto out_dput_new;
+ }
+
+ host_err = -EXDEV;
+ if (ffhp->fh_export->ex_mnt != tfhp->fh_export->ex_mnt)
+ goto out_dput_new;
+ host_err = mnt_want_write(ffhp->fh_export->ex_mnt);
+ if (host_err)
+ goto out_dput_new;
+
host_err = vfs_rename(fdir, odentry, tdir, ndentry);
if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
host_err = nfsd_sync_dir(tdentry);
_
Elevate the write count during the vfs_rmdir() call.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namei.c | 5 +++++
1 file changed, 5 insertions(+)
diff -puN fs/namei.c~20-24-do-rmdir-elevate-write-count fs/namei.c
--- lxc/fs/namei.c~20-24-do-rmdir-elevate-write-count 2007-02-09 14:26:58.000000000 -0800
+++ lxc-dave/fs/namei.c 2007-02-09 14:26:58.000000000 -0800
@@ -2101,7 +2101,12 @@ static long do_rmdir(int dfd, const char
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit2;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto exit3;
error = vfs_rmdir(nd.dentry->d_inode, dentry);
+ mnt_drop_write(nd.mnt);
+exit3:
dput(dentry);
exit2:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
_
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/open.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff -puN fs/open.c~15-24-elevate-writer-count-for-do-sys-truncate fs/open.c
--- lxc/fs/open.c~15-24-elevate-writer-count-for-do-sys-truncate 2007-02-09 14:26:55.000000000 -0800
+++ lxc-dave/fs/open.c 2007-02-09 14:26:55.000000000 -0800
@@ -241,28 +241,32 @@ static long do_sys_truncate(const char _
if (!S_ISREG(inode->i_mode))
goto dput_and_out;
- error = vfs_permission(&nd, MAY_WRITE);
+ error = mnt_want_write(nd.mnt);
if (error)
goto dput_and_out;
+ error = vfs_permission(&nd, MAY_WRITE);
+ if (error)
+ goto mnt_drop_write_and_out;
+
error = -EROFS;
if (IS_RDONLY(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
/*
* Make sure that there are no leases.
*/
error = break_lease(inode, FMODE_WRITE);
if (error)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
error = get_write_access(inode);
if (error)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
error = locks_verify_truncate(inode, NULL, length);
if (!error) {
@@ -271,6 +275,8 @@ static long do_sys_truncate(const char _
}
put_write_access(inode);
+mnt_drop_write_and_out:
+ mnt_drop_write(nd.mnt);
dput_and_out:
path_release(&nd);
out:
_
---
lxc-dave/fs/gfs2/inode.c | 1 +
1 file changed, 1 insertion(+)
diff -puN fs/gfs2/inode.c~gfs-check-nlink-count fs/gfs2/inode.c
--- lxc/fs/gfs2/inode.c~gfs-check-nlink-count 2007-02-09 14:26:59.000000000 -0800
+++ lxc-dave/fs/gfs2/inode.c 2007-02-09 14:26:59.000000000 -0800
@@ -169,6 +169,7 @@ static int gfs2_dinode_in(struct gfs2_in
* to do that.
*/
ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
+ check_nlink(&ip->i_inode);
di->di_size = be64_to_cpu(str->di_size);
i_size_write(&ip->i_inode, di->di_size);
di->di_blocks = be64_to_cpu(str->di_blocks);
_
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namei.c | 4 ++++
lxc-dave/ipc/mqueue.c | 5 ++++-
2 files changed, 8 insertions(+), 1 deletion(-)
diff -puN fs/namei.c~19-24-elevate-mnt-writers-for-vfs-unlink-callers fs/namei.c
--- lxc/fs/namei.c~19-24-elevate-mnt-writers-for-vfs-unlink-callers 2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/fs/namei.c 2007-02-09 14:26:57.000000000 -0800
@@ -2181,7 +2181,11 @@ static long do_unlinkat(int dfd, const c
inode = dentry->d_inode;
if (inode)
atomic_inc(&inode->i_count);
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto exit2;
error = vfs_unlink(nd.dentry->d_inode, dentry);
+ mnt_drop_write(nd.mnt);
exit2:
dput(dentry);
}
diff -puN ipc/mqueue.c~19-24-elevate-mnt-writers-for-vfs-unlink-callers ipc/mqueue.c
--- lxc/ipc/mqueue.c~19-24-elevate-mnt-writers-for-vfs-unlink-callers 2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/ipc/mqueue.c 2007-02-09 14:26:57.000000000 -0800
@@ -749,8 +749,11 @@ asmlinkage long sys_mq_unlink(const char
inode = dentry->d_inode;
if (inode)
atomic_inc(&inode->i_count);
-
+ err = mnt_want_write(mqueue_mnt);
+ if (err)
+ goto out_err;
err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+ mnt_drop_write(mqueue_mnt);
out_err:
dput(dentry);
_
This is the first really tricky patch in the series. It
elevates the writer count on a mount each time a
non-special file is opened for write.
This is not completely apparent in the patch because the
two if() conditions in may_open() above the
mnt_want_write() call are, combined, equivalent to
special_file().
There is also an elevated count around the vfs_create()
call in open_namei(). The count needs to be kept elevated
all the way into the may_open() call. Otherwise, when the
write is dropped, a rw->ro transition could occur. This
would lead to having rw access on the newly created file,
while the vfsmount is ro. That is bad.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/file_table.c | 5 ++++-
lxc-dave/fs/namei.c | 22 ++++++++++++++++++----
lxc-dave/ipc/mqueue.c | 3 +++
3 files changed, 25 insertions(+), 5 deletions(-)
diff -puN fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/file_table.c
--- lxc/fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed 2007-02-09 14:26:54.000000000 -0800
+++ lxc-dave/fs/file_table.c 2007-02-09 14:26:54.000000000 -0800
@@ -209,8 +209,11 @@ void fastcall __fput(struct file *file)
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
cdev_put(inode->i_cdev);
fops_put(file->f_op);
- if (file->f_mode & FMODE_WRITE)
+ if (file->f_mode & FMODE_WRITE) {
put_write_access(inode);
+ if(!special_file(inode->i_mode))
+ mnt_drop_write(mnt);
+ }
put_pid(file->f_owner.pid);
put_user_ns(file->f_owner.user_ns);
file_kill(file);
diff -puN fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/namei.c
--- lxc/fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed 2007-02-09 14:26:54.000000000 -0800
+++ lxc-dave/fs/namei.c 2007-02-09 14:26:54.000000000 -0800
@@ -1548,8 +1548,17 @@ int may_open(struct nameidata *nd, int a
return -EACCES;
flag &= ~O_TRUNC;
- } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
- return -EROFS;
+ } else if (flag & FMODE_WRITE) {
+ /*
+ * effectively: !special_file()
+ * balanced by __fput()
+ */
+ error = mnt_want_write(nd->mnt);
+ if (error)
+ return error;
+ if (IS_RDONLY(inode))
+ return -EROFS;
+ }
/*
* An append-only file must be opened in append mode for writing.
*/
@@ -1688,14 +1697,17 @@ do_last:
}
if (IS_ERR(nd->intent.open.file)) {
- mutex_unlock(&dir->d_inode->i_mutex);
error = PTR_ERR(nd->intent.open.file);
- goto exit_dput;
+ goto exit_mutex_unlock;
}
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
+ error = mnt_want_write(nd->mnt);
+ if (error)
+ goto exit_mutex_unlock;
error = open_namei_create(nd, &path, flag, mode);
+ mnt_drop_write(nd->mnt);
if (error)
goto exit;
return 0;
@@ -1733,6 +1745,8 @@ ok:
goto exit;
return 0;
+exit_mutex_unlock:
+ mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
dput_path(&path, nd);
exit:
diff -puN ipc/mqueue.c~14-24-tricky-elevate-write-count-files-are-open-ed ipc/mqueue.c
--- lxc/ipc/mqueue.c~14-24-tricky-elevate-write-count-files-are-open-ed 2007-02-09 14:26:54.000000000 -0800
+++ lxc-dave/ipc/mqueue.c 2007-02-09 14:26:54.000000000 -0800
@@ -687,6 +687,9 @@ asmlinkage long sys_mq_open(const char _
goto out;
filp = do_open(dentry, oflag);
} else {
+ error = mnt_want_write(mqueue_mnt);
+ if (error)
+ goto out;
filp = do_create(mqueue_mnt->mnt_root, dentry,
oflag, mode, u_attr);
}
_
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/inode.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff -puN fs/inode.c~17-24-elevate-write-count-for-do-sys-utime-and-touch-atime fs/inode.c
--- lxc/fs/inode.c~17-24-elevate-write-count-for-do-sys-utime-and-touch-atime 2007-02-09 14:26:56.000000000 -0800
+++ lxc-dave/fs/inode.c 2007-02-09 14:26:56.000000000 -0800
@@ -1170,22 +1170,23 @@ void touch_atime(struct vfsmount *mnt, s
struct inode *inode = dentry->d_inode;
struct timespec now;
- if (inode->i_flags & S_NOATIME)
+ if (mnt && mnt_want_write(mnt))
return;
+ if (inode->i_flags & S_NOATIME)
+ goto out;
if (IS_NOATIME(inode))
- return;
+ goto out;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
+ goto out;
/*
* We may have a NULL vfsmount when coming from NFSD
*/
if (mnt) {
if (mnt->mnt_flags & MNT_NOATIME)
- return;
+ goto out;
if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
-
+ goto out;
if (mnt->mnt_flags & MNT_RELATIME) {
/*
* With relative atime, only update atime if the
@@ -1196,16 +1197,19 @@ void touch_atime(struct vfsmount *mnt, s
&inode->i_atime) < 0 &&
timespec_compare(&inode->i_ctime,
&inode->i_atime) < 0)
- return;
+ goto out;
}
}
now = current_fs_time(inode->i_sb);
if (timespec_equal(&inode->i_atime, &now))
- return;
+ goto out;
inode->i_atime = now;
mark_inode_dirty_sync(inode);
+out:
+ if (mnt)
+ mnt_drop_write(mnt);
}
EXPORT_SYMBOL(touch_atime);
_
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/utimes.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff -puN fs/utimes.c~16-24-elevate-write-count-for-do-utimes fs/utimes.c
--- lxc/fs/utimes.c~16-24-elevate-write-count-for-do-utimes 2007-02-09 14:26:55.000000000 -0800
+++ lxc-dave/fs/utimes.c 2007-02-09 14:26:55.000000000 -0800
@@ -58,16 +58,19 @@ static long do_utimes_nsec(int dfd, char
goto out;
inode = nd.dentry->d_inode;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto dput_and_out;
error = -EROFS;
if (IS_RDONLY(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
/* Don't worry, the checks are done in inode_change_ok() */
newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
if (times) {
error = -EPERM;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
newattrs.ia_atime = times[0];
newattrs.ia_mtime = times[1];
@@ -75,15 +78,17 @@ static long do_utimes_nsec(int dfd, char
} else {
error = -EACCES;
if (IS_IMMUTABLE(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
if (current->fsuid != inode->i_uid &&
(error = vfs_permission(&nd, MAY_WRITE)) != 0)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
}
mutex_lock(&inode->i_mutex);
error = notify_change(nd.dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+mnt_drop_write_and_out:
+ mnt_drop_write(nd.mnt);
dput_and_out:
path_release(&nd);
out:
_
This takes care of all of the direct callers of vfs_mknod().
Since a few of these cases also handle normal file creation,
this covers some calls to vfs_create() too.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namei.c | 12 ++++++++++++
lxc-dave/fs/nfsd/vfs.c | 4 ++++
lxc-dave/net/unix/af_unix.c | 4 ++++
3 files changed, 20 insertions(+)
diff -puN fs/namei.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create fs/namei.c
--- lxc/fs/namei.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create 2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/fs/namei.c 2007-02-09 14:26:57.000000000 -0800
@@ -1903,14 +1903,26 @@ asmlinkage long sys_mknodat(int dfd, con
if (!IS_ERR(dentry)) {
switch (mode & S_IFMT) {
case 0: case S_IFREG:
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ break;
error = vfs_create(nd.dentry->d_inode,dentry,mode,&nd);
+ mnt_drop_write(nd.mnt);
break;
case S_IFCHR: case S_IFBLK:
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ break;
error = vfs_mknod(nd.dentry->d_inode,dentry,mode,
new_decode_dev(dev));
+ mnt_drop_write(nd.mnt);
break;
case S_IFIFO: case S_IFSOCK:
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ break;
error = vfs_mknod(nd.dentry->d_inode,dentry,mode,0);
+ mnt_drop_write(nd.mnt);
break;
case S_IFDIR:
error = -EPERM;
diff -puN fs/nfsd/vfs.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create fs/nfsd/vfs.c
--- lxc/fs/nfsd/vfs.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create 2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/fs/nfsd/vfs.c 2007-02-09 14:26:57.000000000 -0800
@@ -664,6 +664,9 @@ nfsd_open(struct svc_rqst *rqstp, struct
/* Disallow write access to files with the append-only bit set
* or any access when mandatory locking enabled
*/
+ err = mnt_want_write(fhp->fh_export->ex_mnt);
+ if (err)
+ goto out_nfserr;
err = nfserr_perm;
if (IS_APPEND(inode) && (access & MAY_WRITE))
goto out;
@@ -1199,6 +1202,7 @@ nfsd_create(struct svc_rqst *rqstp, stru
printk("nfsd: bad file type %o in nfsd_create\n", type);
host_err = -EINVAL;
}
+ mnt_drop_write(fhp->fh_export->ex_mnt);
if (host_err < 0)
goto out_nfserr;
diff -puN net/unix/af_unix.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create net/unix/af_unix.c
--- lxc/net/unix/af_unix.c~18-24-sys-mknodat-elevate-write-count-for-vfs-mknod-create 2007-02-09 14:26:57.000000000 -0800
+++ lxc-dave/net/unix/af_unix.c 2007-02-09 14:26:57.000000000 -0800
@@ -816,7 +816,11 @@ static int unix_bind(struct socket *sock
*/
mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current->fs->umask);
+ err = mnt_want_write(nd.mnt);
+ if (err)
+ goto out_mknod_dput;
err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
+ mnt_drop_write(nd.mnt);
if (err)
goto out_mknod_dput;
mutex_unlock(&nd.dentry->d_inode->i_mutex);
_
Some filesystems forego the use of normal vfs calls to create
struct files. Make sure that these users elevate the mnt writer
count. These probably don't have any real meaning because there
is no real backing store for these mounts, but it is here for
consistency.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/file_table.c | 4 ++++
1 file changed, 4 insertions(+)
diff -puN fs/file_table.c~22-24-elevate-writer-count-for-custom-struct-file fs/file_table.c
--- lxc/fs/file_table.c~22-24-elevate-writer-count-for-custom-struct-file 2007-02-09 14:26:59.000000000 -0800
+++ lxc-dave/fs/file_table.c 2007-02-09 14:26:59.000000000 -0800
@@ -171,6 +171,10 @@ int init_file(struct file *file, struct
file->f_mapping = dentry->d_inode->i_mapping;
file->f_mode = mode;
file->f_op = fop;
+ if (mode & FMODE_WRITE) {
+ error = mnt_want_write(mnt);
+ WARN_ON(error);
+ }
return error;
}
_
Now that we have the sb writer count, and all of the
writers marked with mnt_want_write(), we don't need to
go looking at all of the individual open files.
Kill the open files walk, and use the sb writer count.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/file_table.c | 25 -------------------------
lxc-dave/fs/super.c | 13 ++++++++++++-
lxc-dave/include/linux/fs.h | 2 --
3 files changed, 12 insertions(+), 28 deletions(-)
diff -puN fs/file_table.c~24-24-kill-open-files-traverse-on-remount-ro fs/file_table.c
--- lxc/fs/file_table.c~24-24-kill-open-files-traverse-on-remount-ro 2007-02-09 14:27:01.000000000 -0800
+++ lxc-dave/fs/file_table.c 2007-02-09 14:27:01.000000000 -0800
@@ -308,31 +308,6 @@ void file_kill(struct file *file)
}
}
-int fs_may_remount_ro(struct super_block *sb)
-{
- struct list_head *p;
-
- /* Check that no files are currently opened for writing. */
- file_list_lock();
- list_for_each(p, &sb->s_files) {
- struct file *file = list_entry(p, struct file, f_u.fu_list);
- struct inode *inode = file->f_path.dentry->d_inode;
-
- /* File with pending delete? */
- if (inode->i_nlink == 0)
- goto too_bad;
-
- /* Writeable file? */
- if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
- goto too_bad;
- }
- file_list_unlock();
- return 1; /* Tis' cool bro. */
-too_bad:
- file_list_unlock();
- return 0;
-}
-
void __init files_init(unsigned long mempages)
{
int n;
diff -puN fs/super.c~24-24-kill-open-files-traverse-on-remount-ro fs/super.c
--- lxc/fs/super.c~24-24-kill-open-files-traverse-on-remount-ro 2007-02-09 14:27:01.000000000 -0800
+++ lxc-dave/fs/super.c 2007-02-09 14:27:01.000000000 -0800
@@ -580,7 +580,18 @@ static void mark_files_ro(struct super_b
static int sb_remount_ro(struct super_block *sb)
{
- return fs_may_remount_ro(sb);
+ int ret = 0;
+
+ /*
+ * The r/o flag actually gets set
+ * by the caller.
+ */
+ spin_lock(&sb->s_mnt_writers_lock);
+ if (sb->s_writers)
+ ret = -EBUSY;
+ spin_unlock(&sb->s_mnt_writers_lock);
+
+ return ret;
}
/**
diff -puN include/linux/fs.h~24-24-kill-open-files-traverse-on-remount-ro include/linux/fs.h
--- lxc/include/linux/fs.h~24-24-kill-open-files-traverse-on-remount-ro 2007-02-09 14:27:01.000000000 -0800
+++ lxc-dave/include/linux/fs.h 2007-02-09 14:27:01.000000000 -0800
@@ -1657,8 +1657,6 @@ extern const struct file_operations read
extern const struct file_operations write_fifo_fops;
extern const struct file_operations rdwr_fifo_fops;
-extern int fs_may_remount_ro(struct super_block *);
-
#ifdef CONFIG_BLOCK
/*
* return READ, READA, or WRITE
_
This patch actually adds the mount and superblock writer
counts, and the mnt_want/drop_write() functions that use
them.
Before these can become useful, we must first cover each
place in the VFS where writes are performed with a
want/drop pair. When that is complete, we can actually
introduce code that will safely check the counts before
allowing r/w<->r/o transitions to occur.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namespace.c | 53 +++++++++++++++++++++++++++++++++++++++++
lxc-dave/fs/super.c | 18 ++++++++++---
lxc-dave/include/linux/fs.h | 2 +
lxc-dave/include/linux/mount.h | 21 ++++++++++++++++
4 files changed, 90 insertions(+), 4 deletions(-)
diff -puN fs/namespace.c~03-24-add-vfsmount-writer-count fs/namespace.c
--- lxc/fs/namespace.c~03-24-add-vfsmount-writer-count 2007-02-09 14:26:47.000000000 -0800
+++ lxc-dave/fs/namespace.c 2007-02-09 14:26:47.000000000 -0800
@@ -58,6 +58,7 @@ struct vfsmount *alloc_vfsmnt(const char
if (mnt) {
mnt->mnt_user_ns = get_user_ns(current->nsproxy->user_ns);
atomic_set(&mnt->mnt_count, 1);
+ mnt->mnt_writers = 0;
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -78,6 +79,56 @@ struct vfsmount *alloc_vfsmnt(const char
return mnt;
}
+int mnt_make_readonly(struct vfsmount *mnt)
+{
+ int ret = 0;
+
+ WARN_ON(__mnt_is_readonly(mnt));
+
+ /*
+ * This flag set is actually redundant with what
+ * happens in do_remount(), but since we do this
+ * under the lock, anyone attempting to get a write
+ * on it after this will fail.
+ */
+ spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+ if (!mnt->mnt_writers)
+ mnt->mnt_flags |= MNT_READONLY;
+ else
+ ret = -EBUSY;
+ spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+ return ret;
+}
+
+int mnt_want_write(struct vfsmount *mnt)
+{
+ int ret = 0;
+
+ spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+ if (mnt->mnt_writers)
+ goto out;
+
+ if (__mnt_is_readonly(mnt)) {
+ ret = -EROFS;
+ goto out;
+ }
+ mnt->mnt_sb->s_writers++;
+ mnt->mnt_writers++;
+out:
+ spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+void mnt_drop_write(struct vfsmount *mnt)
+{
+ spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+ mnt->mnt_sb->s_writers--;
+ mnt->mnt_writers--;
+ spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
@@ -1415,6 +1466,8 @@ long do_mount(char *dev_name, char *dir_
((char *)data_page)[PAGE_SIZE - 1] = 0;
/* Separate the per-mountpoint flags */
+ if (flags & MS_RDONLY)
+ mnt_flags |= MNT_READONLY;
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
diff -puN fs/super.c~03-24-add-vfsmount-writer-count fs/super.c
--- lxc/fs/super.c~03-24-add-vfsmount-writer-count 2007-02-09 14:26:47.000000000 -0800
+++ lxc-dave/fs/super.c 2007-02-09 14:26:47.000000000 -0800
@@ -93,6 +93,8 @@ static struct super_block *alloc_super(s
s->s_qcop = sb_quotactl_ops;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
+ s->s_writers = 0;
+ spin_lock_init(&s->s_mnt_writers_lock);
}
out:
return s;
@@ -576,6 +578,11 @@ static void mark_files_ro(struct super_b
file_list_unlock();
}
+static int sb_remount_ro(struct super_block *sb)
+{
+ return fs_may_remount_ro(sb);
+}
+
/**
* do_remount_sb - asks filesystem to change mount options.
* @sb: superblock in question
@@ -587,7 +594,8 @@ static void mark_files_ro(struct super_b
*/
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
- int retval;
+ int retval = 0;
+ int sb_started_ro = (sb->s_flags & MS_RDONLY);
#ifdef CONFIG_BLOCK
if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
@@ -600,11 +608,13 @@ int do_remount_sb(struct super_block *sb
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
- if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+ if ((flags & MS_RDONLY) && !sb_started_ro) {
if (force)
mark_files_ro(sb);
- else if (!fs_may_remount_ro(sb))
- return -EBUSY;
+ else
+ retval = sb_remount_ro(sb);
+ if (retval)
+ return retval;
}
if (sb->s_op->remount_fs) {
diff -puN include/linux/fs.h~03-24-add-vfsmount-writer-count include/linux/fs.h
--- lxc/include/linux/fs.h~03-24-add-vfsmount-writer-count 2007-02-09 14:26:47.000000000 -0800
+++ lxc-dave/include/linux/fs.h 2007-02-09 14:26:47.000000000 -0800
@@ -972,6 +972,8 @@ struct super_block {
struct list_head s_io; /* parked for writeback */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;
+ int s_writers; /* number of files open for write */
+ spinlock_t s_mnt_writers_lock; /* taken when mounts change rw state */
struct block_device *s_bdev;
struct list_head s_instances;
diff -puN include/linux/mount.h~03-24-add-vfsmount-writer-count include/linux/mount.h
--- lxc/include/linux/mount.h~03-24-add-vfsmount-writer-count 2007-02-09 14:26:47.000000000 -0800
+++ lxc-dave/include/linux/mount.h 2007-02-09 14:26:47.000000000 -0800
@@ -29,6 +29,7 @@ struct user_namespace;
#define MNT_NOATIME 0x08
#define MNT_NODIRATIME 0x10
#define MNT_RELATIME 0x20
+#define MNT_READONLY 0x40 /* does the user want this to be r/o? */
#define MNT_SHRINKABLE 0x100
@@ -56,6 +57,7 @@ struct vfsmount {
struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
struct user_namespace *mnt_user_ns; /* namespace for uid interpretation */
+ int mnt_writers; /* nr files open for write */
/*
* We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
* to let these frequently modified fields in a separate cache line
@@ -72,7 +74,26 @@ static inline struct vfsmount *mntget(st
atomic_inc(&mnt->mnt_count);
return mnt;
}
+/*
+ * This is temporary for now. We also don't want to check
+ * the SB in because it is already checked in other
+ * code paths. We'll have a better way to do this in
+ * the end of this series
+ */
+static inline int __mnt_is_readonly(struct vfsmount *mnt)
+{
+ return mnt->mnt_flags & MNT_READONLY;
+}
+
+static inline void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+ WARN_ON(!__mnt_is_readonly(mnt));
+ mnt->mnt_flags &= ~MNT_READONLY;
+}
+extern int mnt_make_readonly(struct vfsmount *mnt);
+extern int mnt_want_write(struct vfsmount *mnt);
+extern void mnt_drop_write(struct vfsmount *mnt);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
_
This basically audits the callers of xattr_permission(), which
calls permission() and can perform writes to the filesystem.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/nfsd/nfs4proc.c | 7 ++++++-
lxc-dave/fs/xattr.c | 14 ++++++++++++++
2 files changed, 20 insertions(+), 1 deletion(-)
diff -puN fs/nfsd/nfs4proc.c~10-24-elevate-mount-count-for-extended-attributes fs/nfsd/nfs4proc.c
--- lxc/fs/nfsd/nfs4proc.c~10-24-elevate-mount-count-for-extended-attributes 2007-02-09 14:26:51.000000000 -0800
+++ lxc-dave/fs/nfsd/nfs4proc.c 2007-02-09 14:26:51.000000000 -0800
@@ -626,14 +626,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, st
return status;
}
}
+ status = mnt_want_write(cstate->current_fh.fh_export->ex_mnt);
+ if (status)
+ return status;
status = nfs_ok;
if (setattr->sa_acl != NULL)
status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
setattr->sa_acl);
if (status)
- return status;
+ goto out;
status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
0, (time_t)0);
+out:
+ mnt_drop_write(cstate->current_fh.fh_export->ex_mnt);
return status;
}
diff -puN fs/xattr.c~10-24-elevate-mount-count-for-extended-attributes fs/xattr.c
--- lxc/fs/xattr.c~10-24-elevate-mount-count-for-extended-attributes 2007-02-09 14:26:51.000000000 -0800
+++ lxc-dave/fs/xattr.c 2007-02-09 14:26:51.000000000 -0800
@@ -12,6 +12,7 @@
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/xattr.h>
+#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -237,7 +238,11 @@ sys_setxattr(char __user *path, char __u
error = user_path_walk(path, &nd);
if (error)
return error;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ return error;
error = setxattr(nd.dentry, name, value, size, flags);
+ mnt_drop_write(nd.mnt);
path_release(&nd);
return error;
}
@@ -252,7 +257,11 @@ sys_lsetxattr(char __user *path, char __
error = user_path_walk_link(path, &nd);
if (error)
return error;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ return error;
error = setxattr(nd.dentry, name, value, size, flags);
+ mnt_drop_write(nd.mnt);
path_release(&nd);
return error;
}
@@ -268,9 +277,14 @@ sys_fsetxattr(int fd, char __user *name,
f = fget(fd);
if (!f)
return error;
+ error = mnt_want_write(f->f_vfsmnt);
+ if (error)
+ goto out_fput;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry->d_inode);
error = setxattr(dentry, name, value, size, flags);
+ mnt_drop_write(f->f_vfsmnt);
+out_fput:
fput(f);
return error;
}
_
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namei.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff -puN fs/namei.c~09-24-elevate-write-count-for-link-and-symlink-calls fs/namei.c
--- lxc/fs/namei.c~09-24-elevate-write-count-for-link-and-symlink-calls 2007-02-09 14:26:50.000000000 -0800
+++ lxc-dave/fs/namei.c 2007-02-09 14:26:50.000000000 -0800
@@ -2236,7 +2236,12 @@ asmlinkage long sys_symlinkat(const char
if (IS_ERR(dentry))
goto out_unlock;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_dput;
error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
+ mnt_drop_write(nd.mnt);
+out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
@@ -2331,7 +2336,12 @@ asmlinkage long sys_linkat(int olddfd, c
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out_unlock;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_dput;
error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+ mnt_drop_write(nd.mnt);
+out_dput:
dput(new_dentry);
out_unlock:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
_
Some ioctls need write access, but others don't. Make a helper
function to decide when write access is needed, and take it.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/ncpfs/ioctl.c | 55 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 54 insertions(+), 1 deletion(-)
diff -puN fs/ncpfs/ioctl.c~08-24-elevate-write-count-during-entire-ncp-ioctl fs/ncpfs/ioctl.c
--- lxc/fs/ncpfs/ioctl.c~08-24-elevate-write-count-during-entire-ncp-ioctl 2007-02-09 14:26:50.000000000 -0800
+++ lxc-dave/fs/ncpfs/ioctl.c 2007-02-09 14:26:50.000000000 -0800
@@ -14,6 +14,7 @@
#include <linux/ioctl.h>
#include <linux/time.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/highuid.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
@@ -260,7 +261,7 @@ ncp_get_charsets(struct ncp_server* serv
}
#endif /* CONFIG_NCPFS_NLS */
-int ncp_ioctl(struct inode *inode, struct file *filp,
+static int __ncp_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct ncp_server *server = NCP_SERVER(inode);
@@ -821,6 +822,58 @@ outrel:
return -EINVAL;
}
+static int ncp_ioctl_need_write(unsigned int cmd)
+{
+ switch (cmd) {
+ case NCP_IOC_GET_FS_INFO:
+ case NCP_IOC_GET_FS_INFO_V2:
+ case NCP_IOC_NCPREQUEST:
+ case NCP_IOC_SETDENTRYTTL:
+ case NCP_IOC_SIGN_INIT:
+ case NCP_IOC_LOCKUNLOCK:
+ case NCP_IOC_SET_SIGN_WANTED:
+ return 1;
+ case NCP_IOC_GETOBJECTNAME:
+ case NCP_IOC_SETOBJECTNAME:
+ case NCP_IOC_GETPRIVATEDATA:
+ case NCP_IOC_SETPRIVATEDATA:
+ case NCP_IOC_SETCHARSETS:
+ case NCP_IOC_GETCHARSETS:
+ case NCP_IOC_CONN_LOGGED_IN:
+ case NCP_IOC_GETDENTRYTTL:
+ case NCP_IOC_GETMOUNTUID2:
+ case NCP_IOC_SIGN_WANTED:
+ case NCP_IOC_GETROOT:
+ case NCP_IOC_SETROOT:
+ return 0;
+ default:
+ /* unkown IOCTL command, assume write */
+ WARN_ON(1);
+ }
+ return 1;
+}
+
+int ncp_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ int ret;
+
+ if (ncp_ioctl_need_write(cmd)) {
+ /*
+ * inside the ioctl(), any failures which
+ * are because of file_permission() are
+ * -EACCESS, so it seems consistent to keep
+ * that here.
+ */
+ if (mnt_want_write(filp->f_vfsmnt))
+ return -EACCES;
+ }
+ ret = __ncp_ioctl(inode, filp, cmd, arg);
+ if (ncp_ioctl_need_write(cmd))
+ mnt_drop_write(filp->f_vfsmnt);
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
_
chown/chmod,etc... don't call permission in the same way
that the normal "open for write" calls do. They still
write to the filesystem, so bump the write count during
these operations.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/open.c | 37 +++++++++++++++++++++++++++++++++----
1 file changed, 33 insertions(+), 4 deletions(-)
diff -puN fs/open.c~06-24-elevate-writer-count-for-chown-and-friends fs/open.c
--- lxc/fs/open.c~06-24-elevate-writer-count-for-chown-and-friends 2007-02-09 14:26:48.000000000 -0800
+++ lxc-dave/fs/open.c 2007-02-09 14:26:48.000000000 -0800
@@ -511,9 +511,12 @@ asmlinkage long sys_fchmod(unsigned int
err = -EROFS;
if (IS_RDONLY(inode))
goto out_putf;
+ err = mnt_want_write(file->f_vfsmnt);
+ if (err)
+ goto out_putf;
err = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out_putf;
+ goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
mode = inode->i_mode;
@@ -522,6 +525,8 @@ asmlinkage long sys_fchmod(unsigned int
err = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+out_drop_write:
+ mnt_drop_write(file->f_vfsmnt);
out_putf:
fput(file);
out:
@@ -541,13 +546,16 @@ asmlinkage long sys_fchmodat(int dfd, co
goto out;
inode = nd.dentry->d_inode;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto dput_and_out;
error = -EROFS;
if (IS_RDONLY(inode))
- goto dput_and_out;
+ goto out_drop_write;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto dput_and_out;
+ goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
@@ -557,6 +565,8 @@ asmlinkage long sys_fchmodat(int dfd, co
error = notify_change(nd.dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+out_drop_write:
+ mnt_drop_write(nd.mnt);
dput_and_out:
path_release(&nd);
out:
@@ -582,7 +592,7 @@ static int chown_common(struct dentry *
error = -EROFS;
if (IS_RDONLY(inode))
goto out;
- error = -EPERM;
+ error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
goto out;
newattrs.ia_valid = ATTR_CTIME;
@@ -611,7 +621,12 @@ asmlinkage long sys_chown(const char __u
error = user_path_walk(filename, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.dentry, user, group);
+ mnt_drop_write(nd.mnt);
+out_release:
path_release(&nd);
out:
return error;
@@ -631,7 +646,12 @@ asmlinkage long sys_fchownat(int dfd, co
error = __user_walk_fd(dfd, filename, follow, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.dentry, user, group);
+ mnt_drop_write(nd.mnt);
+out_release:
path_release(&nd);
out:
return error;
@@ -645,7 +665,11 @@ asmlinkage long sys_lchown(const char __
error = user_path_walk_link(filename, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.dentry, user, group);
+out_release:
path_release(&nd);
out:
return error;
@@ -662,9 +686,14 @@ asmlinkage long sys_fchown(unsigned int
if (!file)
goto out;
+ error = mnt_want_write(file->f_vfsmnt);
+ if (error)
+ goto out_fput;
dentry = file->f_path.dentry;
audit_inode(NULL, dentry->d_inode);
error = chown_common(dentry, user, group);
+ mnt_drop_write(file->f_vfsmnt);
+out_fput:
fput(file);
out:
return error;
_
On Fri, 09 Feb 2007 14:53:29 -0800
Dave Hansen <[email protected]> wrote:
> +/*
> + * Note: This is a crappy interface. It is here to make
> + * merging with the existing users of get_empty_filp()
> + * who have complex failure logic easier. All users
> + * of this should be moving to alloc_file().
> + */
> +int init_file(struct file *file, struct vfsmount *mnt,
> + struct dentry *dentry, mode_t mode,
> + const struct file_operations *fop)
crappy name too ;) At least two filesystems have defined their own
static-scope init_file() and so they'll explode if they somehow manage
to include file.h.
I guess we can cross that bridge when we fall off it, but sometime it might be
prudent to do s/init_file/configfs_init_file/ and ditto sysfs_init_file.
On Fri, 09 Feb 2007 14:53:44 -0800
Dave Hansen <[email protected]> wrote:
> This is the core of the read-only bind mount patch set.
Who wants read-only bind mounts, and for what reason?
On Fri, 2007-02-09 at 15:22 -0800, Andrew Morton wrote:
> On Fri, 09 Feb 2007 14:53:44 -0800
> Dave Hansen <[email protected]> wrote:
>
> > This is the core of the read-only bind mount patch set.
>
> Who wants read-only bind mounts, and for what reason?
The original desire came out of the linux-vserver project. It allows a
sysadmin to share directories between many vservers/containers and keep
those containers from writing to it, even though the users in that
vserver may have "root" privileges.
This also has the advantage of cleaning up the somewhat hackish "look
for writable-open-files during remount/ro operations". It should also
allow us to separate the concepts of the user wanting a filesystem to be
r/o and the filesystem _itself_ being r/o because of a r/o device or
some kind of corruption.
-- Dave
Dave Hansen a écrit :
> @@ -56,6 +57,7 @@ struct vfsmount {
> struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
> struct mnt_namespace *mnt_ns; /* containing namespace */
> struct user_namespace *mnt_user_ns; /* namespace for uid interpretation */
> + int mnt_writers; /* nr files open for write */
> /*
> * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
> * to let these frequently modified fields in a separate cache line
> @@ -72,7 +74,26 @@ static inline struct vfsmount *mntget(st
> atomic_inc(&mnt->mnt_count);
> return mnt;
Dave, please read again this comment in struct vfsmount definition.
If I understand your infrastructure, mnt_writers is going to be frequently
modified, so it should be placed at the end of struct vfsmount, in the same
cache line than mnt_count.
Thank you
Eric
On Sat, 2007-02-10 at 00:41 +0100, Eric Dumazet wrote:
> Dave, please read again this comment in struct vfsmount definition.
>
> If I understand your infrastructure, mnt_writers is going to be frequently
> modified, so it should be placed at the end of struct vfsmount, in the same
> cache line than mnt_count.
That's an excellent point, thanks for catching it. Here's an updated
patch.
-- Dave
This patch actually adds the mount and superblock writer
counts, and the mnt_want/drop_write() functions that use
them.
Before these can become useful, we must first cover each
place in the VFS where writes are performed with a
want/drop pair. When that is complete, we can actually
introduce code that will safely check the counts before
allowing r/w<->r/o transitions to occur.
Signed-off-by: Dave Hansen <[email protected]>
---
lxc-dave/fs/namespace.c | 53 +++++++++++++++++++++++++++++++++++++++++
lxc-dave/fs/super.c | 18 ++++++++++---
lxc-dave/include/linux/fs.h | 2 +
lxc-dave/include/linux/mount.h | 28 +++++++++++++++++++--
4 files changed, 94 insertions(+), 7 deletions(-)
diff -puN fs/namespace.c~03-24-add-vfsmount-writer-count fs/namespace.c
--- lxc/fs/namespace.c~03-24-add-vfsmount-writer-count 2007-02-09 16:04:40.000000000 -0800
+++ lxc-dave/fs/namespace.c 2007-02-09 16:04:40.000000000 -0800
@@ -58,6 +58,7 @@ struct vfsmount *alloc_vfsmnt(const char
if (mnt) {
mnt->mnt_user_ns = get_user_ns(current->nsproxy->user_ns);
atomic_set(&mnt->mnt_count, 1);
+ mnt->mnt_writers = 0;
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -78,6 +79,56 @@ struct vfsmount *alloc_vfsmnt(const char
return mnt;
}
+int mnt_make_readonly(struct vfsmount *mnt)
+{
+ int ret = 0;
+
+ WARN_ON(__mnt_is_readonly(mnt));
+
+ /*
+ * This flag set is actually redundant with what
+ * happens in do_remount(), but since we do this
+ * under the lock, anyone attempting to get a write
+ * on it after this will fail.
+ */
+ spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+ if (!mnt->mnt_writers)
+ mnt->mnt_flags |= MNT_READONLY;
+ else
+ ret = -EBUSY;
+ spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+ return ret;
+}
+
+int mnt_want_write(struct vfsmount *mnt)
+{
+ int ret = 0;
+
+ spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+ if (mnt->mnt_writers)
+ goto out;
+
+ if (__mnt_is_readonly(mnt)) {
+ ret = -EROFS;
+ goto out;
+ }
+ mnt->mnt_sb->s_writers++;
+ mnt->mnt_writers++;
+out:
+ spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+void mnt_drop_write(struct vfsmount *mnt)
+{
+ spin_lock(&mnt->mnt_sb->s_mnt_writers_lock);
+ mnt->mnt_sb->s_writers--;
+ mnt->mnt_writers--;
+ spin_unlock(&mnt->mnt_sb->s_mnt_writers_lock);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
@@ -1415,6 +1466,8 @@ long do_mount(char *dev_name, char *dir_
((char *)data_page)[PAGE_SIZE - 1] = 0;
/* Separate the per-mountpoint flags */
+ if (flags & MS_RDONLY)
+ mnt_flags |= MNT_READONLY;
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
diff -puN fs/super.c~03-24-add-vfsmount-writer-count fs/super.c
--- lxc/fs/super.c~03-24-add-vfsmount-writer-count 2007-02-09 16:04:40.000000000 -0800
+++ lxc-dave/fs/super.c 2007-02-09 16:04:40.000000000 -0800
@@ -93,6 +93,8 @@ static struct super_block *alloc_super(s
s->s_qcop = sb_quotactl_ops;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
+ s->s_writers = 0;
+ spin_lock_init(&s->s_mnt_writers_lock);
}
out:
return s;
@@ -576,6 +578,11 @@ static void mark_files_ro(struct super_b
file_list_unlock();
}
+static int sb_remount_ro(struct super_block *sb)
+{
+ return fs_may_remount_ro(sb);
+}
+
/**
* do_remount_sb - asks filesystem to change mount options.
* @sb: superblock in question
@@ -587,7 +594,8 @@ static void mark_files_ro(struct super_b
*/
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
- int retval;
+ int retval = 0;
+ int sb_started_ro = (sb->s_flags & MS_RDONLY);
#ifdef CONFIG_BLOCK
if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
@@ -600,11 +608,13 @@ int do_remount_sb(struct super_block *sb
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
- if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+ if ((flags & MS_RDONLY) && !sb_started_ro) {
if (force)
mark_files_ro(sb);
- else if (!fs_may_remount_ro(sb))
- return -EBUSY;
+ else
+ retval = sb_remount_ro(sb);
+ if (retval)
+ return retval;
}
if (sb->s_op->remount_fs) {
diff -puN include/linux/fs.h~03-24-add-vfsmount-writer-count include/linux/fs.h
--- lxc/include/linux/fs.h~03-24-add-vfsmount-writer-count 2007-02-09 16:04:40.000000000 -0800
+++ lxc-dave/include/linux/fs.h 2007-02-09 16:04:40.000000000 -0800
@@ -972,6 +972,8 @@ struct super_block {
struct list_head s_io; /* parked for writeback */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;
+ int s_writers; /* number of files open for write */
+ spinlock_t s_mnt_writers_lock; /* taken when mounts change rw state */
struct block_device *s_bdev;
struct list_head s_instances;
diff -puN include/linux/mount.h~03-24-add-vfsmount-writer-count include/linux/mount.h
--- lxc/include/linux/mount.h~03-24-add-vfsmount-writer-count 2007-02-09 16:04:40.000000000 -0800
+++ lxc-dave/include/linux/mount.h 2007-02-09 16:07:28.000000000 -0800
@@ -29,6 +29,7 @@ struct user_namespace;
#define MNT_NOATIME 0x08
#define MNT_NODIRATIME 0x10
#define MNT_RELATIME 0x20
+#define MNT_READONLY 0x40 /* does the user want this to be r/o? */
#define MNT_SHRINKABLE 0x100
@@ -57,12 +58,14 @@ struct vfsmount {
struct mnt_namespace *mnt_ns; /* containing namespace */
struct user_namespace *mnt_user_ns; /* namespace for uid interpretation */
/*
- * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
- * to let these frequently modified fields in a separate cache line
- * (so that reads of mnt_flags wont ping-pong on SMP machines)
+ * We put mnt_count, mnt_expiry_mark, and mnt_writers at the end of
+ * struct vfsmount to let these frequently modified fields in a
+ * separate cache line (so that reads of mnt_flags wont ping-pong
+ * on SMP machines)
*/
atomic_t mnt_count;
int mnt_expiry_mark; /* true if marked for expiry */
+ int mnt_writers; /* nr files open for write */
int mnt_pinned;
};
@@ -72,7 +75,26 @@ static inline struct vfsmount *mntget(st
atomic_inc(&mnt->mnt_count);
return mnt;
}
+/*
+ * This is temporary for now. We also don't want to check
+ * the SB in here because it is already checked in other
+ * code paths. We'll have a better way to do this by
+ * the end of this series.
+ */
+static inline int __mnt_is_readonly(struct vfsmount *mnt)
+{
+ return mnt->mnt_flags & MNT_READONLY;
+}
+
+static inline void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+ WARN_ON(!__mnt_is_readonly(mnt));
+ mnt->mnt_flags &= ~MNT_READONLY;
+}
+extern int mnt_make_readonly(struct vfsmount *mnt);
+extern int mnt_want_write(struct vfsmount *mnt);
+extern void mnt_drop_write(struct vfsmount *mnt);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
_
On 9 Feb 2007, at 23:22, Andrew Morton wrote:
> On Fri, 09 Feb 2007 14:53:44 -0800
> Dave Hansen <[email protected]> wrote:
>
>> This is the core of the read-only bind mount patch set.
>
> Who wants read-only bind mounts, and for what reason?
On our local mirror server (mirrors just under 3TiB worth of stuff)
we hold all data on r/w mounted storage in a private location in the
file tree. (Note the server runs Solaris 10 not Linux or the
following would not be possible at present...)
We then bind mount (i.e. loopback mount on Solaris) various
directories from inside the private paths to various other locations
so for example we create /export/ftp/pub/* where "*" are directories
we want to export via FTP and we do all of those as read-only bind
mounts. This gives us that little bit of extra confidence that no-
one from the outside can cause any writes to happen to our mirrored
data. We do similar for NFS by creating lots of read-only bind
mounts in /* that again point into the private locations.
It would be nice if the Linux box that we have that is a copy/backup
of the Solaris box could do the same rather than have all the bind
mounts be read-write because we need the storage in the private
locations to be writable.
Best regards,
Anton
On Feb 9 2007 15:22, Andrew Morton wrote:
>On Fri, 09 Feb 2007 14:53:44 -0800
>Dave Hansen <[email protected]> wrote:
>
>> This is the core of the read-only bind mount patch set.
>
>Who wants read-only bind mounts, and for what reason?
And another case could be, that some application modifies ~/.xyz, but the user
(with root's help) does not want that:
mount --bind -r ~/.xyz ~/.xyz
Clearing the w bits with chmod does not always work, as programs might tamper
with the permissions of ~/.xyz itself... so a ro mount seems to be best.
Jan
--
ft: http://freshmeat.net/p/chaostables/
On Fri, 09 Feb 2007 14:53:37 -0800 Dave Hansen <[email protected]> wrote:
> diff -puN fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/file_table.c
> --- lxc/fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed 2007-02-09 14:26:54.000000000 -0800
> +++ lxc-dave/fs/file_table.c 2007-02-09 14:26:54.000000000 -0800
> @@ -209,8 +209,11 @@ void fastcall __fput(struct file *file)
> if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
> cdev_put(inode->i_cdev);
> fops_put(file->f_op);
> - if (file->f_mode & FMODE_WRITE)
> + if (file->f_mode & FMODE_WRITE) {
> put_write_access(inode);
> + if(!special_file(inode->i_mode))
> + mnt_drop_write(mnt);
> + }
> put_pid(file->f_owner.pid);
> put_user_ns(file->f_owner.user_ns);
> file_kill(file);
> diff -puN fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/namei.c
> --- lxc/fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed 2007-02-09 14:26:54.000000000 -0800
> +++ lxc-dave/fs/namei.c 2007-02-09 14:26:54.000000000 -0800
> @@ -1548,8 +1548,17 @@ int may_open(struct nameidata *nd, int a
> return -EACCES;
>
> flag &= ~O_TRUNC;
> - } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
> - return -EROFS;
> + } else if (flag & FMODE_WRITE) {
> + /*
> + * effectively: !special_file()
> + * balanced by __fput()
> + */
> + error = mnt_want_write(nd->mnt);
> + if (error)
> + return error;
> + if (IS_RDONLY(inode))
> + return -EROFS;
> + }
yipes. A new mount-wide spin_lock/unlock for each for-writing open() and close().
Can we have a microbenchmark on this please?
Are you sure that fget_light() and fput_light() don't accidentally bypass this
new logic?
On Mon, 2007-02-12 at 21:11 -0800, Andrew Morton wrote:
> On Fri, 09 Feb 2007 14:53:37 -0800 Dave Hansen <[email protected]> wrote:
>
> > diff -puN fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/file_table.c
> > --- lxc/fs/file_table.c~14-24-tricky-elevate-write-count-files-are-open-ed 2007-02-09 14:26:54.000000000 -0800
> > +++ lxc-dave/fs/file_table.c 2007-02-09 14:26:54.000000000 -0800
> > @@ -209,8 +209,11 @@ void fastcall __fput(struct file *file)
> > if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
> > cdev_put(inode->i_cdev);
> > fops_put(file->f_op);
> > - if (file->f_mode & FMODE_WRITE)
> > + if (file->f_mode & FMODE_WRITE) {
> > put_write_access(inode);
> > + if(!special_file(inode->i_mode))
> > + mnt_drop_write(mnt);
> > + }
> > put_pid(file->f_owner.pid);
> > put_user_ns(file->f_owner.user_ns);
> > file_kill(file);
> > diff -puN fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed fs/namei.c
> > --- lxc/fs/namei.c~14-24-tricky-elevate-write-count-files-are-open-ed 2007-02-09 14:26:54.000000000 -0800
> > +++ lxc-dave/fs/namei.c 2007-02-09 14:26:54.000000000 -0800
> > @@ -1548,8 +1548,17 @@ int may_open(struct nameidata *nd, int a
> > return -EACCES;
> >
> > flag &= ~O_TRUNC;
> > - } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
> > - return -EROFS;
> > + } else if (flag & FMODE_WRITE) {
> > + /*
> > + * effectively: !special_file()
> > + * balanced by __fput()
> > + */
> > + error = mnt_want_write(nd->mnt);
> > + if (error)
> > + return error;
> > + if (IS_RDONLY(inode))
> > + return -EROFS;
> > + }
>
> yipes. A new mount-wide spin_lock/unlock for each for-writing open() and close().
> Can we have a microbenchmark on this please?
Yeah, I'll schedule some dbench time on a NUMA machine.
> Are you sure that fget_light() and fput_light() don't accidentally bypass this
> new logic?
Pretty sure. My code actually surrounds all of the permission() checks
in the VFS. To even use fget, you had to get a fd at some point, and to
do that you have to go through open, where both the mount and normal
filesystem checks are.
Is there something particular you had in mind?
-- Dave
> On Tue, 13 Feb 2007 08:58:16 -0800 Dave Hansen <[email protected]> wrote:
> > yipes. A new mount-wide spin_lock/unlock for each for-writing open() and close().
> > Can we have a microbenchmark on this please?
>
> Yeah, I'll schedule some dbench time on a NUMA machine.
dbench doesn't do open() a lot. To assess the worst-case we'd need one
process per cpu camping in an open/close loop.
On Tue, 2007-02-13 at 09:58 -0800, Andrew Morton wrote:
> > On Tue, 13 Feb 2007 08:58:16 -0800 Dave Hansen <[email protected]> wrote:
> > > yipes. A new mount-wide spin_lock/unlock for each for-writing open() and close().
> > > Can we have a microbenchmark on this please?
> >
> > Yeah, I'll schedule some dbench time on a NUMA machine.
>
> dbench doesn't do open() a lot. To assess the worst-case we'd need one
> process per cpu camping in an open/close loop.
This is definitely a worst-case scenario. A 32-way x86_64 NUMA machine
(with a pretty crappy interconnect) with a process-per-cpu all beating
on the same filesystem.
no patch:
real: 30.111s
user: 0.031s
sys: 2.685s
r/o bind mount patch:
real: 48.359s
user: 0.146s
sys: 47.984s
It definitely makes a huge difference in system time, although not a
fatal one. Christoph, what do you think? Back to caching the
superblock flag in the mount?
#!/bin/bash
# go.sh - run one CPU-pinned openbench instance per processor, five runs
# in a row, saving each instance's `time` output under
# <kernel-release>.run.<run>/openbench.time.<cpu>.
#
# bash is required: the C-style for ((...)) loops are not POSIX sh.
name=$(uname -r)
# Mount the ramfs scratch area only if it is not already mounted.
grep -q /mnt/ram /proc/mounts || mount -t ramfs ram /mnt/ram
# Stop here if the benchmark binary cannot be built.
make openbench || exit 1
nr_cpus=$(grep -c ^processor /proc/cpuinfo)
for ((run = 0; run < 5; run++)); do
	dir="$name.run.$run"
	mkdir -p "$dir"
	for ((i = 0; i < nr_cpus; i++)); do
		# Background each instance so all CPUs are loaded at once.
		{ time taskset -c "$i" ./openbench $((1<<16)) & } \
			> "$dir/openbench.time.$i" 2>&1
	done
	# Let every instance finish before starting the next run.
	wait
	echo "run $run done"
done
// openbench.c
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
/*
 * openbench: worst-case open()/close() microbenchmark.
 *
 * Repeatedly opens /mnt/ram/openbench.<pid> for writing (creating it if
 * needed), writes three bytes and closes it again, then unlinks the file.
 *
 * usage: openbench <loops>
 *
 * Returns 0 on success and 1 on a usage error or a failed open()/unlink().
 */
int main(int argc, char **argv)
{
	char buf[100];
	char *end;
	long loops;
	long i;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <loops>\n", argv[0]);
		return 1;
	}
	/* strtol() lets us reject garbage that atoi() would turn into 0 */
	loops = strtol(argv[1], &end, 10);
	if (end == argv[1] || *end != '\0' || loops < 0) {
		fprintf(stderr, "invalid loop count: %s\n", argv[1]);
		return 1;
	}

	/* one file per process so the instances do not share an inode */
	snprintf(buf, sizeof(buf), "/mnt/ram/openbench.%d", (int)getpid());
	for (i = 0; i < loops; i++) {
		/* O_CREAT requires the mode argument; omitting it is undefined */
		fd = open(buf, O_WRONLY|O_CREAT, 0644);
		if (fd < 0) {
			perror("open error");
			return 1;
		}
		/* the write is only payload; report a failure but keep going */
		if (write(fd, "foo", 3) != 3)
			perror("write error");
		close(fd);
	}
	if (unlink(buf)) {
		perror("unlink error");
		return 1;
	}
	return 0;
}
-- Dave