2008-01-10 19:08:16

by Dave Hansen

[permalink] [raw]
Subject: [RFC][PATCH 0/4] kill open files traverse on remount ro

This is just an RFC for now. I'm tracking down a wee bit of
list corruption. But, I wanted to send this out so you could
compare it to the last approach.


2008-01-10 19:07:38

by Dave Hansen

[permalink] [raw]
Subject: [RFC][PATCH 1/4] use helper to set mnt_sb


We need to make sure that all mounts get into the
sb list. So, require that new setters of mnt->mnt_sb
use simple_set_mnt() instead of doing it themselves.

NFS does the same thing a few times in a row, so give
it a nice little helper.

---

linux-2.6.git-dave/fs/namespace.c | 3 +--
linux-2.6.git-dave/fs/nfs/super.c | 31 ++++++++++++++++---------------
2 files changed, 17 insertions(+), 17 deletions(-)

diff -puN fs/namespace.c~make-mnt_sb-off-limits fs/namespace.c
--- linux-2.6.git/fs/namespace.c~make-mnt_sb-off-limits 2008-01-10 10:36:37.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-10 10:36:37.000000000 -0800
@@ -486,8 +486,7 @@ static struct vfsmount *clone_mnt(struct
if (mnt) {
mnt->mnt_flags = old->mnt_flags;
atomic_inc(&sb->s_active);
- mnt->mnt_sb = sb;
- mnt->mnt_root = dget(root);
+ simple_set_mnt(mnt, sb);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;

diff -puN fs/nfs/super.c~make-mnt_sb-off-limits fs/nfs/super.c
--- linux-2.6.git/fs/nfs/super.c~make-mnt_sb-off-limits 2008-01-10 10:36:37.000000000 -0800
+++ linux-2.6.git-dave/fs/nfs/super.c 2008-01-10 10:36:37.000000000 -0800
@@ -1364,6 +1364,17 @@ static int nfs_compare_super(struct supe
return nfs_compare_mount_options(sb, server, mntflags);
}

+static void nfs_set_mnt(struct vfsmount *mnt, struct super_block *s)
+{
+ s->s_flags |= MS_ACTIVE;
+ simple_set_mnt(mnt, s);
+ /*
+ * simple_set_mnt() does a dget, which we already did
+ * in nfs_get_root().
+ */
+ dput(s->s_root);
+}
+
static int nfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
{
@@ -1417,9 +1428,7 @@ static int nfs_get_sb(struct file_system
goto error_splat_super;
}

- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);
error = 0;

out:
@@ -1505,9 +1514,7 @@ static int nfs_xdev_get_sb(struct file_s
goto error_splat_super;
}

- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);

dprintk("<-- nfs_xdev_get_sb() = 0\n");
return 0;
@@ -1766,9 +1773,7 @@ static int nfs4_get_sb(struct file_syste
goto error_splat_super;
}

- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);
error = 0;

out:
@@ -1851,9 +1856,7 @@ static int nfs4_xdev_get_sb(struct file_
goto error_splat_super;
}

- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);

dprintk("<-- nfs4_xdev_get_sb() = 0\n");
return 0;
@@ -1925,9 +1928,7 @@ static int nfs4_referral_get_sb(struct f
goto error_splat_super;
}

- s->s_flags |= MS_ACTIVE;
- mnt->mnt_sb = s;
- mnt->mnt_root = mntroot;
+ nfs_set_mnt(mnt, s);

dprintk("<-- nfs4_referral_get_sb() = 0\n");
return 0;
_

2008-01-10 19:07:54

by Dave Hansen

[permalink] [raw]
Subject: [RFC][PATCH 3/4] change mnt_writers[] spinlock to mutex

---

linux-2.6.git-dave/fs/namespace.c | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)

diff -puN fs/namespace.c~change-spinlock-to-mutex fs/namespace.c
--- linux-2.6.git/fs/namespace.c~change-spinlock-to-mutex 2008-01-10 10:45:47.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-10 10:46:01.000000000 -0800
@@ -118,7 +118,7 @@ struct mnt_writer {
* If holding multiple instances of this lock, they
* must be ordered by cpu number.
*/
- spinlock_t lock;
+ struct mutex lock;
struct lock_class_key lock_class; /* compiles out with !lockdep */
unsigned long count;
struct vfsmount *mnt;
@@ -130,7 +130,7 @@ static int __init init_mnt_writers(void)
int cpu;
for_each_possible_cpu(cpu) {
struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
+ mutex_init(&writer->lock);
lockdep_set_class(&writer->lock, &writer->lock_class);
writer->count = 0;
}
@@ -145,7 +145,7 @@ static void mnt_unlock_cpus(void)

for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ mutex_unlock(&cpu_writer->lock);
}
}

@@ -191,8 +191,8 @@ int mnt_want_write(struct vfsmount *mnt)
int ret = 0;
struct mnt_writer *cpu_writer;

- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ cpu_writer = &__get_cpu_var(mnt_writers);
+ mutex_lock(&cpu_writer->lock);
if (__mnt_is_readonly(mnt)) {
ret = -EROFS;
goto out;
@@ -200,8 +200,7 @@ int mnt_want_write(struct vfsmount *mnt)
use_cpu_writer_for_mount(cpu_writer, mnt);
cpu_writer->count++;
out:
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
+ mutex_unlock(&cpu_writer->lock);
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
@@ -213,7 +212,7 @@ static void lock_and_coalesce_cpu_mnt_wr

for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
+ mutex_lock(&cpu_writer->lock);
__clear_mnt_count(cpu_writer);
cpu_writer->mnt = NULL;
}
@@ -269,7 +268,7 @@ void mnt_drop_write(struct vfsmount *mnt

retry:
cpu_writer = &__get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ mutex_lock(&cpu_writer->lock);

use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
@@ -286,14 +285,14 @@ retry:
*/
if (atomic_read(&mnt->__mnt_writers) <
MNT_WRITER_UNDERFLOW_LIMIT) {
- spin_unlock(&cpu_writer->lock);
+ mutex_unlock(&cpu_writer->lock);
goto retry;
}
atomic_dec(&mnt->__mnt_writers);
must_check_underflow = 1;
}

- spin_unlock(&cpu_writer->lock);
+ mutex_unlock(&cpu_writer->lock);
/*
* Logically, we could call this each time,
* but the __mnt_writers cacheline tends to
_

2008-01-10 19:08:33

by Dave Hansen

[permalink] [raw]
Subject: [RFC][PATCH 4/4] check mount writers at superblock remount


We're moving a big chunk of the burden of keeping people from
writing to r/o filesystems from the superblock into the
vfsmount. This requires that we keep track somehow of the
mounts that might allow writes to a superblock.

You could track this directly by keeping a count of such
mounts in the superblock, but I think this is much simpler.

Introduce a list of vfsmounts hanging off the superblock.
On remount rw->ro operations, consult each mount of the
superblock and examine its __mnt_writers. Lock out any
new writers to the mount until the remount operation is
complete.

Signed-off-by: Dave Hansen <[email protected]>
---

linux-2.6.git-dave/fs/namespace.c | 28 ++++++++++++++
linux-2.6.git-dave/fs/super.c | 58 ++++++++++++++++++++++++++++---
linux-2.6.git-dave/include/linux/fs.h | 1
linux-2.6.git-dave/include/linux/mount.h | 3 +
4 files changed, 85 insertions(+), 5 deletions(-)

diff -puN fs/namespace.c~25-24-reintroduce-list-of-vfsmounts-over-superblock fs/namespace.c
--- linux-2.6.git/fs/namespace.c~25-24-reintroduce-list-of-vfsmounts-over-superblock 2008-01-10 10:46:08.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-10 10:46:08.000000000 -0800
@@ -219,6 +219,21 @@ static void lock_and_coalesce_cpu_mnt_wr
}

/*
+ * These are just some shorter names for
+ * external users.
+ *
+ * Note that this nests outside lock_super().
+ */
+void lock_mnt_writers(void)
+{
+ lock_and_coalesce_cpu_mnt_writer_counts();
+}
+void unlock_mnt_writers(void)
+{
+ mnt_unlock_cpus();
+}
+
+/*
* These per-cpu write counts are not guaranteed to have
* matched increments and decrements on any given cpu.
* A file open()ed for write on one cpu and close()d on
@@ -331,10 +346,18 @@ static void __mnt_unmake_readonly(struct
mnt->mnt_flags &= ~MNT_READONLY;
}

+void add_mount_to_sb_list(struct vfsmount *mnt, struct super_block *sb)
+{
+ spin_lock(&vfsmount_lock);
+ list_add(&mnt->mnt_sb_list, &sb->s_vfsmounts);
+ spin_unlock(&vfsmount_lock);
+}
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
mnt->mnt_root = dget(sb->s_root);
+ add_mount_to_sb_list(mnt, sb);
return 0;
}

@@ -342,6 +365,10 @@ EXPORT_SYMBOL(simple_set_mnt);

void free_vfsmnt(struct vfsmount *mnt)
{
+ spin_lock(&vfsmount_lock);
+ if (mnt->mnt_sb)
+ list_del(&mnt->mnt_sb_list);
+ spin_unlock(&vfsmount_lock);
kfree(mnt->mnt_devname);
kmem_cache_free(mnt_cache, mnt);
}
@@ -494,6 +521,7 @@ static struct vfsmount *clone_mnt(struct
simple_set_mnt(mnt, sb);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
+ add_mount_to_sb_list(mnt, sb);

if (flag & CL_SLAVE) {
list_add(&mnt->mnt_slave, &old->mnt_slave_list);
diff -puN fs/super.c~25-24-reintroduce-list-of-vfsmounts-over-superblock fs/super.c
--- linux-2.6.git/fs/super.c~25-24-reintroduce-list-of-vfsmounts-over-superblock 2008-01-10 10:46:08.000000000 -0800
+++ linux-2.6.git-dave/fs/super.c 2008-01-10 10:46:08.000000000 -0800
@@ -36,6 +36,7 @@
#include <linux/writeback.h> /* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/kobject.h>
+#include <linux/list.h>
#include <linux/mutex.h>
#include <asm/uaccess.h>

@@ -65,6 +66,7 @@ static struct super_block *alloc_super(s
INIT_LIST_HEAD(&s->s_io);
INIT_LIST_HEAD(&s->s_more_io);
INIT_LIST_HEAD(&s->s_files);
+ INIT_LIST_HEAD(&s->s_vfsmounts);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
@@ -575,6 +577,34 @@ static void mark_files_ro(struct super_b
}

/**
+ * __sb_can_remount_ro - check a superblock for active writers
+ * @sb: superblock in question
+ *
+ * Must ensure that no new writers to the superblock can come
+ * in (must hold lock_mnt_writers()) and that the s_vfsmounts
+ * list can not change (must acquire lock_super()).
+ *
+ * Returns 0 on success.
+ */
+static int __sb_can_remount_ro(struct super_block *sb)
+{
+ struct list_head *entry;
+ int ret = 0;
+
+ lock_super(sb);
+ list_for_each(entry, &sb->s_vfsmounts) {
+ struct vfsmount *mnt;
+ mnt = list_entry(entry, struct vfsmount, mnt_sb_list);
+ if (!atomic_read(&mnt->__mnt_writers))
+ continue;
+ ret = -EBUSY;
+ break;
+ }
+ unlock_super(sb);
+ return ret;
+}
+
+/**
* do_remount_sb - asks filesystem to change mount options.
* @sb: superblock in question
* @flags: numeric part of options
@@ -585,6 +615,7 @@ static void mark_files_ro(struct super_b
*/
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
+ int mnt_writers_locked = 0;
int retval;

#ifdef CONFIG_BLOCK
@@ -599,10 +630,19 @@ int do_remount_sb(struct super_block *sb
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
- if (force)
+ if (force) {
mark_files_ro(sb);
- else if (!fs_may_remount_ro(sb))
- return -EBUSY;
+ } else {
+ /*
+ * keeps new mnt writers out until
+ * we can set the sb r/o flag
+ */
+ lock_mnt_writers();
+ mnt_writers_locked = 1;
+ retval = __sb_can_remount_ro(sb);
+ if (retval)
+ goto out;
+ }
}

if (sb->s_op->remount_fs) {
@@ -610,10 +650,18 @@ int do_remount_sb(struct super_block *sb
retval = sb->s_op->remount_fs(sb, &flags, data);
unlock_super(sb);
if (retval)
- return retval;
+ goto out;
}
sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
- return 0;
+out:
+ /*
+ * This must stay locked until after the s_flags are
+ * set because the flag will also lock out any new
+ * writers.
+ */
+ if (mnt_writers_locked)
+ unlock_mnt_writers();
+ return retval;
}

static void do_emergency_remount(unsigned long foo)
diff -puN include/linux/fs.h~25-24-reintroduce-list-of-vfsmounts-over-superblock include/linux/fs.h
--- linux-2.6.git/include/linux/fs.h~25-24-reintroduce-list-of-vfsmounts-over-superblock 2008-01-10 10:46:08.000000000 -0800
+++ linux-2.6.git-dave/include/linux/fs.h 2008-01-10 10:46:08.000000000 -0800
@@ -1012,6 +1012,7 @@ struct super_block {
struct list_head s_io; /* parked for writeback */
struct list_head s_more_io; /* parked for more writeback */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
+ struct list_head s_vfsmounts;
struct list_head s_files;

struct block_device *s_bdev;
diff -puN include/linux/mount.h~25-24-reintroduce-list-of-vfsmounts-over-superblock include/linux/mount.h
--- linux-2.6.git/include/linux/mount.h~25-24-reintroduce-list-of-vfsmounts-over-superblock 2008-01-10 10:46:08.000000000 -0800
+++ linux-2.6.git-dave/include/linux/mount.h 2008-01-10 10:46:08.000000000 -0800
@@ -44,6 +44,7 @@ struct vfsmount {
struct dentry *mnt_mountpoint; /* dentry of mountpoint */
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
+ struct list_head mnt_sb_list; /* list of all mounts on same sb */
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
int mnt_flags;
@@ -80,6 +81,8 @@ static inline struct vfsmount *mntget(st

extern int mnt_want_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
+extern void lock_mnt_writers(void);
+extern void unlock_mnt_writers(void);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
diff -puN fs/xfs/xfs_inode.c~25-24-reintroduce-list-of-vfsmounts-over-superblock fs/xfs/xfs_inode.c
diff -puN fs/dquot.c~25-24-reintroduce-list-of-vfsmounts-over-superblock fs/dquot.c
_

2008-01-10 19:08:51

by Dave Hansen

[permalink] [raw]
Subject: [RFC][PATCH 2/4] change mnt_writers underflow protection logic


The comment tells most of the story. I want to make the
spinlock in this case into a mutex, and the current
underflow protection mechanism uses preempt disabling from
put/get_cpu_var(). I can't use that with a mutex.

Without the preempt disabling, there is no limit to the
number of cpus that might get to:

use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
cpu_writer->count--;
} else {
atomic_dec(&mnt->__mnt_writers);
}
spin_unlock(&cpu_writer->lock);
---->HERE
if (must_check_underflow)
handle_write_count_underflow(mnt);

because they get preempted once the spinlock is unlocked.

So, there's no limit on how many times __mnt_writers may
be decremented. (I know the limit is still the number of
tasks on the system, but that's a heck of a lot higher
than the number of cpus.) Doing the simple check in this
patch before the decrement and under a lock removes the
possibility that this can happen.

Since there are only NR_CPUS mnt_writers[] entries, we can only
have NR_CPUS lock holders in the critical section at a
time, so __mnt_writers can only underflow by
MNT_WRITER_UNDERFLOW_LIMIT+NR_CPUS.

Signed-off-by: Dave Hansen <[email protected]>
---

linux-2.6.git-dave/fs/namespace.c | 28 +++++++++++++++++-----------
1 file changed, 17 insertions(+), 11 deletions(-)

diff -puN fs/namespace.c~change-underflow-protection-logic fs/namespace.c
--- linux-2.6.git/fs/namespace.c~change-underflow-protection-logic 2008-01-10 10:36:37.000000000 -0800
+++ linux-2.6.git-dave/fs/namespace.c 2008-01-10 10:36:37.000000000 -0800
@@ -267,15 +267,30 @@ void mnt_drop_write(struct vfsmount *mnt
int must_check_underflow = 0;
struct mnt_writer *cpu_writer;

- cpu_writer = &get_cpu_var(mnt_writers);
+retry:
+ cpu_writer = &__get_cpu_var(mnt_writers);
spin_lock(&cpu_writer->lock);

use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
cpu_writer->count--;
} else {
- must_check_underflow = 1;
+ /* Without this check, it is theoretically
+ * possible to underflow __mnt_writers.
+ * An unlimited number of processes could
+ * all do this decrement, unlock, and then
+ * stall before the underflow handling.
+ * Doing this check limits the underflow
+ * to the number of cpu_writer->lock
+ * holders (NR_CPUS).
+ */
+ if (atomic_read(&mnt->__mnt_writers) <
+ MNT_WRITER_UNDERFLOW_LIMIT) {
+ spin_unlock(&cpu_writer->lock);
+ goto retry;
+ }
atomic_dec(&mnt->__mnt_writers);
+ must_check_underflow = 1;
}

spin_unlock(&cpu_writer->lock);
@@ -286,15 +301,6 @@ void mnt_drop_write(struct vfsmount *mnt
*/
if (must_check_underflow)
handle_write_count_underflow(mnt);
- /*
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- */
- put_cpu_var(mnt_writers);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

_

2008-01-10 19:11:40

by Dave Hansen

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/4] change mnt_writers[] spinlock to mutex

Missed the description on that one. Here it is:

We're shortly going to need to be able to block new
mnt_writers for long periods of time during a
superblock remount operation. Since this operation
can sleep, we can not use a spinlock. We opt for
a mutex instead.

These are very, very rarely contended, mostly because
they are per-cpu. So, this should be very close to
as fast as the spinlocks, just with the added benefit
that we can sleep while holding them.

We also need to change the get_cpu_var() to use
__get_cpu_var() so that we don't disable preemption.
Otherwise, we'll be in_atomic() when we try to lock
the (sleepable) mutex. We only use the per-cpu data
for cache benefits and its per-cpuness is not part
of locking logic, so this is OK.
_
-- Dave

2008-01-10 21:47:39

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [RFC][PATCH 0/4] kill open files traverse on remount ro

Quoting Dave Hansen ([email protected]):
> This is just RFC for now. I'm tracking down a wee bit of
> list corruption. But, I wanted to send out so you could
> compare to the last approach.

Looks reasonable to me, and quite readable.

So after this set, you'd be able to remove s_files altogether?

-serge