This is just RFC for now. I'm tracking down a wee bit of
list corruption. But, I wanted to send out so you could
compare to the last approach.
We need to make sure that all mounts get into the
sb list. So, require that new setters of mnt->mnt_sb
use simple_set_mnt() instead of doing it themselves.
NFS does the same thing a few times in a row, so give
it a nice little helper.
---
linux-2.6.git-dave/fs/namespace.c | 3 +--
linux-2.6.git-dave/fs/nfs/super.c | 31 ++++++++++++++++---------------
2 files changed, 17 insertions(+), 17 deletions(-)
diff -puN fs/namespace.c~make-mnt_sb-off-limits fs/namespace.c
--- linux-2.6.git/fs/namespace.c~make-mnt_sb-off-limits 2008-01-10 10:36:37.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-10 10:36:37.000000000 -0800
@@ -486,8 +486,7 @@ static struct vfsmount *clone_mnt(struct
if (mnt) {
mnt->mnt_flags = old->mnt_flags;
atomic_inc(&sb->s_active);
- mnt->mnt_sb = sb;
- mnt->mnt_root = dget(root);
+ simple_set_mnt(mnt, sb);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
diff -puN fs/nfs/super.c~make-mnt_sb-off-limits fs/nfs/super.c
--- linux-2.6.git/fs/nfs/super.c~make-mnt_sb-off-limits 2008-01-10 10:36:37.000000000 -0800
+++ linux-2.6.git-dave/fs/nfs/super.c 2008-01-10 10:36:37.000000000 -0800
@@ -1364,6 +1364,17 @@ static int nfs_compare_super(struct supe
return nfs_compare_mount_options(sb, server, mntflags);
}
+static void nfs_set_mnt(struct vfsmount *mnt, struct super_block *s)
+{
+ s->s_flags |= MS_ACTIVE;
+ simple_set_mnt(mnt, s);
+	/*
+	 * simple_set_mnt() takes its own dget() reference on
+	 * s->s_root, but nfs_get_root() already gave us one,
+	 * so drop the extra reference here.
+	 */
+ dput(s->s_root);
+}
+
static int nfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
{
@@ -1417,9 +1428,7 @@ static int nfs_get_sb(struct file_system
goto error_splat_super;
}
- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);
error = 0;
out:
@@ -1505,9 +1514,7 @@ static int nfs_xdev_get_sb(struct file_s
goto error_splat_super;
}
- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);
dprintk("<-- nfs_xdev_get_sb() = 0\n");
return 0;
@@ -1766,9 +1773,7 @@ static int nfs4_get_sb(struct file_syste
goto error_splat_super;
}
- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);
error = 0;
out:
@@ -1851,9 +1856,7 @@ static int nfs4_xdev_get_sb(struct file_
goto error_splat_super;
}
- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);
dprintk("<-- nfs4_xdev_get_sb() = 0\n");
return 0;
@@ -1925,9 +1928,7 @@ static int nfs4_referral_get_sb(struct f
goto error_splat_super;
}
- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);
dprintk("<-- nfs4_referral_get_sb() = 0\n");
return 0;
_
---
linux-2.6.git-dave/fs/namespace.c | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)
diff -puN fs/namespace.c~change-spinlock-to-mutex fs/namespace.c
--- linux-2.6.git/fs/namespace.c~change-spinlock-to-mutex 2008-01-10 10:45:47.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-10 10:46:01.000000000 -0800
@@ -118,7 +118,7 @@ struct mnt_writer {
* If holding multiple instances of this lock, they
* must be ordered by cpu number.
*/
- spinlock_t lock;
+ struct mutex lock;
struct lock_class_key lock_class; /* compiles out with !lockdep */
unsigned long count;
struct vfsmount *mnt;
@@ -130,7 +130,7 @@ static int __init init_mnt_writers(void)
int cpu;
for_each_possible_cpu(cpu) {
struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
+ mutex_init(&writer->lock);
lockdep_set_class(&writer->lock, &writer->lock_class);
writer->count = 0;
}
@@ -145,7 +145,7 @@ static void mnt_unlock_cpus(void)
for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ mutex_unlock(&cpu_writer->lock);
}
}
@@ -191,8 +191,8 @@ int mnt_want_write(struct vfsmount *mnt)
int ret = 0;
struct mnt_writer *cpu_writer;
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ cpu_writer = &__get_cpu_var(mnt_writers);
+ mutex_lock(&cpu_writer->lock);
if (__mnt_is_readonly(mnt)) {
ret = -EROFS;
goto out;
@@ -200,8 +200,7 @@ int mnt_want_write(struct vfsmount *mnt)
use_cpu_writer_for_mount(cpu_writer, mnt);
cpu_writer->count++;
out:
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
+ mutex_unlock(&cpu_writer->lock);
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
@@ -213,7 +212,7 @@ static void lock_and_coalesce_cpu_mnt_wr
for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
+ mutex_lock(&cpu_writer->lock);
__clear_mnt_count(cpu_writer);
cpu_writer->mnt = NULL;
}
@@ -269,7 +268,7 @@ void mnt_drop_write(struct vfsmount *mnt
retry:
cpu_writer = &__get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ mutex_lock(&cpu_writer->lock);
use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
@@ -286,14 +285,14 @@ retry:
*/
if (atomic_read(&mnt->__mnt_writers) <
MNT_WRITER_UNDERFLOW_LIMIT) {
- spin_unlock(&cpu_writer->lock);
+ mutex_unlock(&cpu_writer->lock);
goto retry;
}
atomic_dec(&mnt->__mnt_writers);
must_check_underflow = 1;
}
- spin_unlock(&cpu_writer->lock);
+ mutex_unlock(&cpu_writer->lock);
/*
* Logically, we could call this each time,
* but the __mnt_writers cacheline tends to
_
We're moving a big chunk of the burden of keeping people from
writing to r/o filesystems from the superblock into the
vfsmount. This requires that we keep track somehow of the
mounts that might allow writes to a superblock.
You could track this directly by keeping a count of such
mounts in the superblock, but I think this is much simpler.
Introduce a list of vfsmounts hanging off the superblock.
On remount rw->ro operations, consult each mount of the
superblock and examine its __mnt_writers. Lock out any
new writers to the mount until the remount operation is
complete.
Signed-off-by: Dave Hansen <[email protected]>
---
linux-2.6.git-dave/fs/namespace.c | 28 ++++++++++++++
linux-2.6.git-dave/fs/super.c | 58 ++++++++++++++++++++++++++++---
linux-2.6.git-dave/include/linux/fs.h | 1
linux-2.6.git-dave/include/linux/mount.h | 3 +
4 files changed, 85 insertions(+), 5 deletions(-)
diff -puN fs/namespace.c~25-24-reintroduce-list-of-vfsmounts-over-superblock fs/namespace.c
--- linux-2.6.git/fs/namespace.c~25-24-reintroduce-list-of-vfsmounts-over-superblock 2008-01-10 10:46:08.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-10 10:46:08.000000000 -0800
@@ -219,6 +219,21 @@ static void lock_and_coalesce_cpu_mnt_wr
}
/*
+ * These are just some shorter names for
+ * external users.
+ *
+ * Note that this nests outside lock_super().
+ */
+void lock_mnt_writers(void)
+{
+ lock_and_coalesce_cpu_mnt_writer_counts();
+}
+void unlock_mnt_writers(void)
+{
+ mnt_unlock_cpus();
+}
+
+/*
* These per-cpu write counts are not guaranteed to have
* matched increments and decrements on any given cpu.
* A file open()ed for write on one cpu and close()d on
@@ -331,10 +346,18 @@ static void __mnt_unmake_readonly(struct
mnt->mnt_flags &= ~MNT_READONLY;
}
+void add_mount_to_sb_list(struct vfsmount *mnt, struct super_block *sb)
+{
+ spin_lock(&vfsmount_lock);
+ list_add(&mnt->mnt_sb_list, &sb->s_vfsmounts);
+ spin_unlock(&vfsmount_lock);
+}
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
mnt->mnt_root = dget(sb->s_root);
+ add_mount_to_sb_list(mnt, sb);
return 0;
}
@@ -342,6 +365,10 @@ EXPORT_SYMBOL(simple_set_mnt);
void free_vfsmnt(struct vfsmount *mnt)
{
+ spin_lock(&vfsmount_lock);
+ if (mnt->mnt_sb)
+ list_del(&mnt->mnt_sb_list);
+ spin_unlock(&vfsmount_lock);
kfree(mnt->mnt_devname);
kmem_cache_free(mnt_cache, mnt);
}
@@ -494,6 +521,7 @@ static struct vfsmount *clone_mnt(struct
simple_set_mnt(mnt, sb);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
+ add_mount_to_sb_list(mnt, sb);
if (flag & CL_SLAVE) {
list_add(&mnt->mnt_slave, &old->mnt_slave_list);
diff -puN fs/super.c~25-24-reintroduce-list-of-vfsmounts-over-superblock fs/super.c
--- linux-2.6.git/fs/super.c~25-24-reintroduce-list-of-vfsmounts-over-superblock 2008-01-10 10:46:08.000000000 -0800
+++ linux-2.6.git-dave/fs/super.c 2008-01-10 10:46:08.000000000 -0800
@@ -36,6 +36,7 @@
#include <linux/writeback.h> /* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/kobject.h>
+#include <linux/list.h>
#include <linux/mutex.h>
#include <asm/uaccess.h>
@@ -65,6 +66,7 @@ static struct super_block *alloc_super(s
INIT_LIST_HEAD(&s->s_io);
INIT_LIST_HEAD(&s->s_more_io);
INIT_LIST_HEAD(&s->s_files);
+ INIT_LIST_HEAD(&s->s_vfsmounts);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
@@ -575,6 +577,34 @@ static void mark_files_ro(struct super_b
}
/**
+ * __sb_can_remount_ro - check a superblock for active writers
+ * @sb: superblock in question
+ *
+ * Must ensure that no new writers to the superblock can come
+ * in (must hold lock_mnt_writers()) and that the s_vfsmounts
+ * list can not change (must acquire lock_super()).
+ *
+ * Returns 0 on success.
+ */
+static int __sb_can_remount_ro(struct super_block *sb)
+{
+ struct list_head *entry;
+ int ret = 0;
+
+ lock_super(sb);
+ list_for_each(entry, &sb->s_vfsmounts) {
+ struct vfsmount *mnt;
+ mnt = list_entry(entry, struct vfsmount, mnt_sb_list);
+ if (!atomic_read(&mnt->__mnt_writers))
+ continue;
+ ret = -EBUSY;
+ break;
+ }
+ unlock_super(sb);
+ return ret;
+}
+
+/**
* do_remount_sb - asks filesystem to change mount options.
* @sb: superblock in question
* @flags: numeric part of options
@@ -585,6 +615,7 @@ static void mark_files_ro(struct super_b
*/
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
+ int mnt_writers_locked = 0;
int retval;
#ifdef CONFIG_BLOCK
@@ -599,10 +630,19 @@ int do_remount_sb(struct super_block *sb
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
- if (force)
+ if (force) {
mark_files_ro(sb);
- else if (!fs_may_remount_ro(sb))
- return -EBUSY;
+ } else {
+ /*
+ * keeps new mnt writers out until
+ * we can set the sb r/o flag
+ */
+ lock_mnt_writers();
+ mnt_writers_locked = 1;
+ retval = __sb_can_remount_ro(sb);
+ if (retval)
+ goto out;
+ }
}
if (sb->s_op->remount_fs) {
@@ -610,10 +650,18 @@ int do_remount_sb(struct super_block *sb
retval = sb->s_op->remount_fs(sb, &flags, data);
unlock_super(sb);
if (retval)
- return retval;
+ goto out;
}
sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
- return 0;
+out:
+ /*
+ * This must stay locked until after the s_flags are
+ * set because the flag will also lock out any new
+ * writers.
+ */
+ if (mnt_writers_locked)
+ unlock_mnt_writers();
+ return retval;
}
static void do_emergency_remount(unsigned long foo)
diff -puN include/linux/fs.h~25-24-reintroduce-list-of-vfsmounts-over-superblock include/linux/fs.h
--- linux-2.6.git/include/linux/fs.h~25-24-reintroduce-list-of-vfsmounts-over-superblock 2008-01-10 10:46:08.000000000 -0800
+++ linux-2.6.git-dave/include/linux/fs.h 2008-01-10 10:46:08.000000000 -0800
@@ -1012,6 +1012,7 @@ struct super_block {
struct list_head s_io; /* parked for writeback */
struct list_head s_more_io; /* parked for more writeback */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
+ struct list_head s_vfsmounts;
struct list_head s_files;
struct block_device *s_bdev;
diff -puN include/linux/mount.h~25-24-reintroduce-list-of-vfsmounts-over-superblock include/linux/mount.h
--- linux-2.6.git/include/linux/mount.h~25-24-reintroduce-list-of-vfsmounts-over-superblock 2008-01-10 10:46:08.000000000 -0800
+++ linux-2.6.git-dave/include/linux/mount.h 2008-01-10 10:46:08.000000000 -0800
@@ -44,6 +44,7 @@ struct vfsmount {
struct dentry *mnt_mountpoint; /* dentry of mountpoint */
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
+ struct list_head mnt_sb_list; /* list of all mounts on same sb */
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
int mnt_flags;
@@ -80,6 +81,8 @@ static inline struct vfsmount *mntget(st
extern int mnt_want_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
+extern void lock_mnt_writers(void);
+extern void unlock_mnt_writers(void);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
diff -puN fs/xfs/xfs_inode.c~25-24-reintroduce-list-of-vfsmounts-over-superblock fs/xfs/xfs_inode.c
diff -puN fs/dquot.c~25-24-reintroduce-list-of-vfsmounts-over-superblock fs/dquot.c
_
The comment tells most of the story. I want to make the
spinlock in this case into a mutex, and the current
underflow protection mechanism uses preempt disabling from
put/get_cpu_var(). I can't use that with a mutex.
Without the preempt disabling, there is no limit to the
number of cpus that might get to:
use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
cpu_writer->count--;
} else {
atomic_dec(&mnt->__mnt_writers);
}
spin_unlock(&cpu_writer->lock);
---->HERE
if (must_check_underflow)
handle_write_count_underflow(mnt);
because they get preempted once the spinlock is unlocked.
So, there's no limit on how many times __mnt_writers may
be decremented. (I know the limit is still the number of
tasks on the system, but that's a heck of a lot higher
than the number of cpus.) Doing the simple check in this
patch before the decrement and under a lock removes the
possibility that this can happen.
Since there are only NR_CPUS mnt_writer[]s, we can only
have NR_CPUS lock holders in the critical section at a
time, so __mnt_writers can only underflow by at most
MNT_WRITER_UNDERFLOW_LIMIT+NR_CPUS.
Signed-off-by: Dave Hansen <[email protected]>
---
linux-2.6.git-dave/fs/namespace.c | 28 +++++++++++++++++-----------
1 file changed, 17 insertions(+), 11 deletions(-)
diff -puN fs/namespace.c~change-underflow-protection-logic fs/namespace.c
--- linux-2.6.git/fs/namespace.c~change-underflow-protection-logic 2008-01-10 10:36:37.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-10 10:36:37.000000000 -0800
@@ -267,15 +267,30 @@ void mnt_drop_write(struct vfsmount *mnt
int must_check_underflow = 0;
struct mnt_writer *cpu_writer;
- cpu_writer = &get_cpu_var(mnt_writers);
+retry:
+ cpu_writer = &__get_cpu_var(mnt_writers);
spin_lock(&cpu_writer->lock);
use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
cpu_writer->count--;
} else {
- must_check_underflow = 1;
+ /* Without this check, it is theoretically
+ * possible to underflow __mnt_writers.
+ * An unlimited number of processes could
+ * all do this decrement, unlock, and then
+ * stall before the underflow handling.
+ * Doing this check limits the underflow
+ * to the number of cpu_writer->lock
+ * holders (NR_CPUS).
+ */
+ if (atomic_read(&mnt->__mnt_writers) <
+ MNT_WRITER_UNDERFLOW_LIMIT) {
+ spin_unlock(&cpu_writer->lock);
+ goto retry;
+ }
atomic_dec(&mnt->__mnt_writers);
+ must_check_underflow = 1;
}
spin_unlock(&cpu_writer->lock);
@@ -286,15 +301,6 @@ void mnt_drop_write(struct vfsmount *mnt
*/
if (must_check_underflow)
handle_write_count_underflow(mnt);
- /*
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- */
- put_cpu_var(mnt_writers);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
_
Missed the description on that one. Here it is:
We're shortly going to need to be able to block new
mnt_writers for long periods of time during a
superblock remount operation. Since this operation
can sleep, we can not use a spinlock. We opt for
a mutex instead.
These are very, very rarely contended, mostly because
they are per-cpu. So, this should be very close to
as fast as the spinlocks just with the added benefit
that we can sleep while holding them.
We also need to change the get_cpu_var() to use
__get_cpu_var() so that we don't disable preemption.
Otherwise, we'll be in_atomic() when we try to lock
the (sleepable) mutex. We only use the per-cpu data
for cache benefits and its per-cpuness is not part
of locking logic, so this is OK.
_
-- Dave
Quoting Dave Hansen ([email protected]):
> This is just RFC for now. I'm tracking down a wee bit of
> list corruption. But, I wanted to send out so you could
> compare to the last approach.
Looks reasonable to me, and quite readable.
So after this set, you'd be able to remove s_files altogether?
-serge