2009-03-10 14:37:38

by Nick Piggin

Subject: [patch 1/2] fs: mnt_want_write speedup

Hi,

I wouldn't mind getting these going again. Improved comments explaining
the algorithm since last posting.

--

fs: mnt_want_write speedup

This patch speeds up the lmbench lat_mmap test by about 8%. lat_mmap basically
mmaps a 64MB file on tmpfs, faults in its pages, then unmaps it. A
microbenchmark, yes, but it exercises some important paths in the mm.
Most of the mnt_want_write calls come from page faults updating the inode's file times.
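
For reference, a rough sketch of the kind of loop lat_mmap runs (an assumption
based on the description above, not the actual lmbench source; the file path
is invented):

#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN (64UL * 1024 * 1024)

int main(void)
{
	/* assumed to live on tmpfs, e.g. /dev/shm */
	int fd = open("/dev/shm/lat_mmap.dat", O_RDWR | O_CREAT, 0600);
	char *p;
	size_t i;

	if (fd < 0 || ftruncate(fd, LEN) < 0)
		exit(1);
	p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		exit(1);
	/* fault in each page; the write faults are what end up in
	 * file_update_time -> mnt_want_write/mnt_drop_write */
	for (i = 0; i < LEN; i += 4096)
		p[i] = 1;
	munmap(p, LEN);
	close(fd);
	return 0;
}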

Before:
avg = 501.9
std = 14.7773

After:
avg = 462.286
std = 5.46106

(50 runs of each, stddev gives a reasonable confidence, but there is quite
a bit of variation there still)

It does this by removing the complex per-cpu locking and counter-cache and
replacing it with a percpu counter in struct vfsmount. This makes the code
much simpler, and avoids spinlocks (although the msync is still pretty
costly, unfortunately). It also results in about 900 bytes less code. It
does increase the size of a vfsmount, however.

It should also give a speedup on large systems if CPUs are frequently operating
on different mounts (because the existing scheme has to operate on an atomic in
the struct vfsmount when switching between mounts). But I'm most interested in
the single threaded path performance for the moment.

Cc: Dave Hansen <[email protected]>
Signed-off-by: Nick Piggin <[email protected]>
---
fs/namespace.c | 263 ++++++++++++++++----------------------------------
include/linux/mount.h | 21 ++-
2 files changed, 103 insertions(+), 181 deletions(-)

Index: linux-2.6/fs/namespace.c
===================================================================
--- linux-2.6.orig/fs/namespace.c
+++ linux-2.6/fs/namespace.c
@@ -130,10 +130,20 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
- atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+ mnt->mnt_writers = alloc_percpu(int);
+ if (!mnt->mnt_writers)
+ goto out_free_devname;
+#else
+ mnt->mnt_writers = 0;
+#endif
}
return mnt;

+#ifdef CONFIG_SMP
+out_free_devname:
+ kfree(mnt->mnt_devname);
+#endif
out_free_id:
mnt_free_id(mnt);
out_free_cache:
@@ -170,65 +180,38 @@ int __mnt_is_readonly(struct vfsmount *m
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

-struct mnt_writer {
- /*
- * If holding multiple instances of this lock, they
- * must be ordered by cpu number.
- */
- spinlock_t lock;
- struct lock_class_key lock_class; /* compiles out with !lockdep */
- unsigned long count;
- struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+ mnt->mnt_writers++;
+#endif
+}

-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
{
- int cpu;
- for_each_possible_cpu(cpu) {
- struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
- lockdep_set_class(&writer->lock, &writer->lock_class);
- writer->count = 0;
- }
- return 0;
+#ifdef CONFIG_SMP
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+ mnt->mnt_writers--;
+#endif
}
-fs_initcall(init_mnt_writers);

-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
{
+#ifdef CONFIG_SMP
+ unsigned int count = 0;
int cpu;
- struct mnt_writer *cpu_writer;

for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ count += *per_cpu_ptr(mnt->mnt_writers, cpu);
}
-}

-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
- if (!cpu_writer->mnt)
- return;
- /*
- * This is in case anyone ever leaves an invalid,
- * old ->mnt and a count of 0.
- */
- if (!cpu_writer->count)
- return;
- atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
- cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
- struct vfsmount *mnt)
-{
- if (cpu_writer->mnt == mnt)
- return;
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = mnt;
+ return count;
+#else
+ return mnt->mnt_writers;
+#endif
}

/*
@@ -252,75 +235,34 @@ static inline void use_cpu_writer_for_mo
int mnt_want_write(struct vfsmount *mnt)
{
int ret = 0;
- struct mnt_writer *cpu_writer;

- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ preempt_disable();
+ inc_mnt_writers(mnt);
+ /*
+ * The store from inc_mnt_writers must be visible before we pass the
+ * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set MNT_WRITE_HOLD.
+ */
+ smp_mb();
+ while (mnt->mnt_flags & MNT_WRITE_HOLD)
+ cpu_relax();
+ /*
+ * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+ * be set to match its requirements. So we must not load that until
+ * MNT_WRITE_HOLD is cleared.
+ */
+ smp_rmb();
if (__mnt_is_readonly(mnt)) {
+ dec_mnt_writers(mnt);
ret = -EROFS;
goto out;
}
- use_cpu_writer_for_mount(cpu_writer, mnt);
- cpu_writer->count++;
out:
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

-static void lock_mnt_writers(void)
-{
- int cpu;
- struct mnt_writer *cpu_writer;
-
- for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = NULL;
- }
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
- if (atomic_read(&mnt->__mnt_writers) >=
- MNT_WRITER_UNDERFLOW_LIMIT)
- return;
- /*
- * It isn't necessary to hold all of the locks
- * at the same time, but doing it this way makes
- * us share a lot more code.
- */
- lock_mnt_writers();
- /*
- * vfsmount_lock is for mnt_flags.
- */
- spin_lock(&vfsmount_lock);
- /*
- * If coalescing the per-cpu writer counts did not
- * get us back to a positive writer count, we have
- * a bug.
- */
- if ((atomic_read(&mnt->__mnt_writers) < 0) &&
- !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
- WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
- "count: %d\n",
- mnt, atomic_read(&mnt->__mnt_writers));
- /* use the flag to keep the dmesg spam down */
- mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
- }
- spin_unlock(&vfsmount_lock);
- unlock_mnt_writers();
-}
-
/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
@@ -331,37 +273,9 @@ static void handle_write_count_underflow
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- int must_check_underflow = 0;
- struct mnt_writer *cpu_writer;
-
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
-
- use_cpu_writer_for_mount(cpu_writer, mnt);
- if (cpu_writer->count > 0) {
- cpu_writer->count--;
- } else {
- must_check_underflow = 1;
- atomic_dec(&mnt->__mnt_writers);
- }
-
- spin_unlock(&cpu_writer->lock);
- /*
- * Logically, we could call this each time,
- * but the __mnt_writers cacheline tends to
- * be cold, and makes this expensive.
- */
- if (must_check_underflow)
- handle_write_count_underflow(mnt);
- /*
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- */
- put_cpu_var(mnt_writers);
+ preempt_disable();
+ dec_mnt_writers(mnt);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

@@ -369,24 +283,44 @@ static int mnt_make_readonly(struct vfsm
{
int ret = 0;

- lock_mnt_writers();
+ spin_lock(&vfsmount_lock);
+ mnt->mnt_flags |= MNT_WRITE_HOLD;
+ /*
+ * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * should be visible before we do.
+ */
+ smp_mb();
+
/*
- * With all the locks held, this value is stable
+ * With writers on hold, if this value is zero, then there are
+ * definitely no active writers (although held writers may subsequently
+ * increment the count, they'll have to wait, and decrement it after
+ * seeing MNT_READONLY).
+ *
+ * It is OK to have counter incremented on one CPU and decremented on
+ * another: the sum will add up correctly. The danger would be when we
+ * sum up each counter, if we read a counter before it is incremented,
+ * but then read another CPU's count which it has been subsequently
+ * decremented from -- we would see more decrements than we should.
+ * MNT_WRITE_HOLD protects against this scenario, because
+ * mnt_want_write first increments count, then smp_mb, then spins on
+ * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+ * we're counting up here.
*/
- if (atomic_read(&mnt->__mnt_writers) > 0) {
+ if (count_mnt_writers(mnt) > 0) {
ret = -EBUSY;
goto out;
}
- /*
- * nobody can do a successful mnt_want_write() with all
- * of the counts in MNT_DENIED_WRITE and the locks held.
- */
- spin_lock(&vfsmount_lock);
if (!ret)
mnt->mnt_flags |= MNT_READONLY;
- spin_unlock(&vfsmount_lock);
out:
- unlock_mnt_writers();
+ /*
+ * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * that become unheld will see MNT_READONLY.
+ */
+ smp_wmb();
+ mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ spin_unlock(&vfsmount_lock);
return ret;
}

@@ -410,6 +344,9 @@ void free_vfsmnt(struct vfsmount *mnt)
{
kfree(mnt->mnt_devname);
mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+ free_percpu(mnt->mnt_writers);
+#endif
kmem_cache_free(mnt_cache, mnt);
}

@@ -604,38 +541,14 @@ static struct vfsmount *clone_mnt(struct

static inline void __mntput(struct vfsmount *mnt)
{
- int cpu;
struct super_block *sb = mnt->mnt_sb;
/*
- * We don't have to hold all of the locks at the
- * same time here because we know that we're the
- * last reference to mnt and that no new writers
- * can come in.
- */
- for_each_possible_cpu(cpu) {
- struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- if (cpu_writer->mnt != mnt) {
- spin_unlock(&cpu_writer->lock);
- continue;
- }
- atomic_add(cpu_writer->count, &mnt->__mnt_writers);
- cpu_writer->count = 0;
- /*
- * Might as well do this so that no one
- * ever sees the pointer and expects
- * it to be valid.
- */
- cpu_writer->mnt = NULL;
- spin_unlock(&cpu_writer->lock);
- }
- /*
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
*/
- WARN_ON(atomic_read(&mnt->__mnt_writers));
+ WARN_ON(count_mnt_writers(mnt));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
Index: linux-2.6/include/linux/mount.h
===================================================================
--- linux-2.6.orig/include/linux/mount.h
+++ linux-2.6/include/linux/mount.h
@@ -29,7 +29,7 @@ struct mnt_namespace;
#define MNT_READONLY 0x40 /* does the user want this to be r/o? */

#define MNT_SHRINKABLE 0x100
-#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_WRITE_HOLD 0x200

#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -64,13 +64,22 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
- /*
- * This value is not stable unless all of the mnt_writers[] spinlocks
- * are held, and all mnt_writer[]s on this mount have 0 as their ->count
- */
- atomic_t __mnt_writers;
+#ifdef CONFIG_SMP
+ int *mnt_writers;
+#else
+ int mnt_writers;
+#endif
};

+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ return mnt->mnt_writers;
+#else
+ return &mnt->mnt_writers;
+#endif
+}
+
static inline struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)


2009-03-10 14:38:24

by Nick Piggin

Subject: [patch 2/2] fs: introduce mnt_clone_write


fs: introduce mnt_clone_write

This patch speeds up the lmbench lat_mmap test by about another 2% on top of
the first patch.

Before:
avg = 462.286
std = 5.46106

After:
avg = 453.12
std = 9.58257

(50 runs of each, stddev gives a reasonable confidence)

It does this by introducing mnt_clone_write, which avoids some of the
heavyweight operations of mnt_want_write when called on a vfsmount that we
know already holds a write count; and mnt_want_write_file, which can call
mnt_clone_write if the file is already open for write.

After these two patches, mnt_want_write and mnt_drop_write go from 7% on
the profile down to 1.3% (including mnt_clone_write).

Cc: Dave Hansen <[email protected]>
Signed-off-by: Nick Piggin <[email protected]>
---
fs/file_table.c | 3 +--
fs/inode.c | 2 +-
fs/namespace.c | 38 ++++++++++++++++++++++++++++++++++++++
fs/open.c | 4 ++--
fs/xattr.c | 4 ++--
include/linux/mount.h | 2 ++
6 files changed, 46 insertions(+), 7 deletions(-)

Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c
+++ linux-2.6/fs/file_table.c
@@ -213,8 +213,7 @@ int init_file(struct file *file, struct
*/
if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
file_take_write(file);
- error = mnt_want_write(mnt);
- WARN_ON(error);
+ mnt_clone_write(mnt);
}
return error;
}
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c
+++ linux-2.6/fs/inode.c
@@ -1362,7 +1362,7 @@ void file_update_time(struct file *file)
if (IS_NOCMTIME(inode))
return;

- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file->f_path.mnt, file);
if (err)
return;

Index: linux-2.6/fs/namespace.c
===================================================================
--- linux-2.6.orig/fs/namespace.c
+++ linux-2.6/fs/namespace.c
@@ -264,6 +264,44 @@ out:
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
+ * mnt_clone_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This is effectively like mnt_want_write, except
+ * it must only be used to take an extra write reference
+ * on a mountpoint that we already know has a write reference
+ * on it. This allows some optimisation.
+ *
+ * When finished, mnt_drop_write must be called as usual to
+ * drop the reference.
+ */
+void mnt_clone_write(struct vfsmount *mnt)
+{
+ preempt_disable();
+ inc_mnt_writers(mnt);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(mnt_clone_write);
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file whose mount to take a write on
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is already open for write.
+ */
+int mnt_want_write_file(struct vfsmount *mnt, struct file *file)
+{
+ if (!(file->f_mode & FMODE_WRITE))
+ return mnt_want_write(mnt);
+ else {
+ mnt_clone_write(mnt);
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
+
+/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
*
Index: linux-2.6/fs/open.c
===================================================================
--- linux-2.6.orig/fs/open.c
+++ linux-2.6/fs/open.c
@@ -611,7 +611,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd

audit_inode(NULL, dentry);

- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file->f_path.mnt, file);
if (err)
goto out_putf;
mutex_lock(&inode->i_mutex);
@@ -760,7 +760,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd
if (!file)
goto out;

- error = mnt_want_write(file->f_path.mnt);
+ error = mnt_want_write_file(file->f_path.mnt, file);
if (error)
goto out_fput;
dentry = file->f_path.dentry;
Index: linux-2.6/fs/xattr.c
===================================================================
--- linux-2.6.orig/fs/xattr.c
+++ linux-2.6/fs/xattr.c
@@ -301,7 +301,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, cons
return error;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry);
- error = mnt_want_write(f->f_path.mnt);
+ error = mnt_want_write_file(f->f_path.mnt, f);
if (!error) {
error = setxattr(dentry, name, value, size, flags);
mnt_drop_write(f->f_path.mnt);
@@ -528,7 +528,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, c
return error;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry);
- error = mnt_want_write(f->f_path.mnt);
+ error = mnt_want_write_file(f->f_path.mnt, f);
if (!error) {
error = removexattr(dentry, name);
mnt_drop_write(f->f_path.mnt);
Index: linux-2.6/include/linux/mount.h
===================================================================
--- linux-2.6.orig/include/linux/mount.h
+++ linux-2.6/include/linux/mount.h
@@ -89,6 +89,8 @@ static inline struct vfsmount *mntget(st
}

extern int mnt_want_write(struct vfsmount *mnt);
+extern int mnt_want_write_file(struct vfsmount *mnt, struct file *file);
+extern void mnt_clone_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);

2009-03-10 14:49:28

by Matthew Wilcox

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Tue, Mar 10, 2009 at 03:37:18PM +0100, Nick Piggin wrote:
> costly, unfortunately). It results in about 900 bytes smaller code too. It
> does increase the size of a vfsmount, however.

Only on 64-bit SMP systems, and then only by four bytes. And, best of
all, you can fix that if you care. Look at this:

/* --- cacheline 1 boundary (64 bytes) --- */
struct list_head mnt_child; /* 64 16 */
int mnt_flags; /* 80 4 */

/* XXX 4 bytes hole, try to pack */

const char * mnt_devname; /* 88 8 */
struct list_head mnt_list; /* 96 16 */
struct list_head mnt_expire; /* 112 16 */

So move mnt_flags to later in the struct (after the pointers), and move

> +#ifdef CONFIG_SMP
> + int *mnt_writers;
> +#else
> + int mnt_writers;
> +#endif

to be with the other pointers. Bonus points for putting it between
struct mnt_namespace * mnt_ns; /* 184 8 */
and
int mnt_id; /* 192 4 */

so that it doesn't become a new 4-byte hole for those incredibly common
64-bit uniprocessor builds. *cough*.
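
To make the suggestion concrete, here is a hedged sketch of the reordered
layout (just the shape of the idea, not a tested patch; elided fields
unchanged):

struct vfsmount {
	/* ... */
	struct list_head mnt_child;
	const char *mnt_devname;	/* the old 4-byte hole after
					 * mnt_flags is gone */
	struct list_head mnt_list;
	/* ... */
	struct mnt_namespace *mnt_ns;
#ifdef CONFIG_SMP
	int *mnt_writers;		/* a pointer, so it packs
					 * cleanly among pointers */
#else
	int mnt_writers;		/* 4 bytes: pairs with mnt_id,
					 * no new hole on 64-bit UP */
#endif
	int mnt_id;
	int mnt_flags;			/* moved below the pointers */
	/* ... */
};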

--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2009-03-10 14:56:20

by Matthew Wilcox

Subject: Re: [patch 2/2] fs: introduce mnt_clone_write

On Tue, Mar 10, 2009 at 03:38:01PM +0100, Nick Piggin wrote:
> +EXPORT_SYMBOL_GPL(mnt_clone_write);
> +EXPORT_SYMBOL_GPL(mnt_want_write_file);

You don't add any users of these functions in this patch ... is there a
good reason to export them?

--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2009-03-10 15:03:53

by Nick Piggin

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Tue, Mar 10, 2009 at 08:48:57AM -0600, Matthew Wilcox wrote:
> On Tue, Mar 10, 2009 at 03:37:18PM +0100, Nick Piggin wrote:
> > costly, unfortunately). It results in about 900 bytes smaller code too. It
> > does increase the size of a vfsmount, however.
>
> Only on 64-bit SMP systems, and then only by four bytes. And, best of
> all, you can fix that if you care. Look at this:
>
> /* --- cacheline 1 boundary (64 bytes) --- */
> struct list_head mnt_child; /* 64 16 */
> int mnt_flags; /* 80 4 */
>
> /* XXX 4 bytes hole, try to pack */
>
> const char * mnt_devname; /* 88 8 */
> struct list_head mnt_list; /* 96 16 */
> struct list_head mnt_expire; /* 112 16 */
>
> So move mnt_flags to later in the struct (after the pointers), and move
>
> > +#ifdef CONFIG_SMP
> > + int *mnt_writers;
> > +#else
> > + int mnt_writers;
> > +#endif
>
> to be with the other pointers. Bonus points for putting it between
> struct mnt_namespace * mnt_ns; /* 184 8 */
> and
> int mnt_id; /* 192 4 */
>
> so that it doesn't become a new 4-byte hole for those incredibly common
> 64-bit uniprocessor builds. *cough*.

Oh, good point, although yes, I was more worried about mnt_writers in
the SMP case (I didn't state it very well). Basically I would be
worried if huge machines have huge numbers of mounts... but I think
a) if they did, they would probably like the scalability improvements,
and b) the improvement on smaller systems is so significant that systems
with hundreds of CPUs will have to find a way to cut down memory if it
really is a problem for them.

2009-03-10 15:08:54

by Nick Piggin

Subject: Re: [patch 2/2] fs: introduce mnt_clone_write

On Tue, Mar 10, 2009 at 08:55:58AM -0600, Matthew Wilcox wrote:
> On Tue, Mar 10, 2009 at 03:38:01PM +0100, Nick Piggin wrote:
> > +EXPORT_SYMBOL_GPL(mnt_clone_write);
> > +EXPORT_SYMBOL_GPL(mnt_want_write_file);
>
> You don't add any users of these functions in this patch ... is there a
> good reason to export them?

Only to encourage modular code to use them (as a replacement for
mnt_want_write -- which itself is exported).

If that's not the right way to go, I don't mind waiting for a
user to turn up.


[I see many ioctls in btrfs (coming first in the alphabet) could
probably easily use mnt_want_write_file, as sketched below (OTOH, probably
none are so performance-critical).]
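
As a hedged illustration, such a conversion in an ioctl handler might look
like this (the function and its body are invented; only mnt_want_write_file
and mnt_drop_write come from the patch):

static long example_ioctl_setflags(struct file *file, unsigned long arg)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	int err;

	/* the file is already open; if FMODE_WRITE is set, this takes
	 * the cheap mnt_clone_write path instead of the full slowpath */
	err = mnt_want_write_file(file->f_path.mnt, file);
	if (err)
		return err;

	mutex_lock(&inode->i_mutex);
	/* ... modify the inode under i_mutex ... */
	mutex_unlock(&inode->i_mutex);

	mnt_drop_write(file->f_path.mnt);
	return 0;
}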

2009-03-10 15:31:33

by Nick Piggin

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Tue, Mar 10, 2009 at 03:37:18PM +0100, Nick Piggin wrote:
> It does this by removing the complex per-cpu locking and counter-cache and
> replaces it with a percpu counter in struct vfsmount. This makes the code
> much simpler, and avoids spinlocks (although the msync is still pretty
> costly, unfortunately).

Hmm, it's stupid to say msync when I mean smp_mb() (which turns out to
be mfence on x86). Not least because we have an msync syscall.

Anyway, the following is just an RFC at this stage (and I think exploration
in this area should not hold up the proposed patch 1/2). Using seqcounts
does remove some barriers, but as we can see it potentially opens a hole,
and it is not exactly a trivial exercise when the read side is performing
stores as well. So I don't have a magic bullet to avoid thinking about
barriers yet, I'm afraid.

--
OK, this is a way we could use seqcounts in order to reduce the open-coded
barriers. However, one problem with using seqcounts like this is that the
seqcount write side only has an smp_wmb; it subsequently loads each of the
percpu counters, and those loads could pass the store to the seqcount,
which would allow both mnt_make_readonly and a mnt_want_write()r to succeed.

One could argue that seqlocks should have acquire/release semantics
(especially on the write side), although that would add weight to these
primitives. I prefer explicit barriers... although smp_mb() is actually
heavier than the barriers present in the seqlock read side, so possibly
open-coding a seqlock with the required barriers would be even better again?

But for now I think the previous patch is still an improvement on the old
scheme.

---
fs/namespace.c | 48 ++++++++++++++----------------------------------
include/linux/mount.h | 4 +++-
2 files changed, 17 insertions(+), 35 deletions(-)

Index: linux-2.6/fs/namespace.c
===================================================================
--- linux-2.6.orig/fs/namespace.c
+++ linux-2.6/fs/namespace.c
@@ -235,28 +235,19 @@ static unsigned int count_mnt_writers(st
int mnt_want_write(struct vfsmount *mnt)
{
int ret = 0;
+ unsigned seq;

preempt_disable();
+again:
+ seq = read_seqcount_begin(&mnt->mnt_seqcount);
inc_mnt_writers(mnt);
- /*
- * The store to inc_mnt_writers must be visible before we pass
- * MNT_WRITE_HOLD loop below, so that the slowpath can see our
- * incremented count after it has set MNT_WRITE_HOLD.
- */
- smp_mb();
- while (mnt->mnt_flags & MNT_WRITE_HOLD)
- cpu_relax();
- /*
- * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
- * be set to match its requirements. So we must not load that until
- * MNT_WRITE_HOLD is cleared.
- */
- smp_rmb();
if (__mnt_is_readonly(mnt)) {
dec_mnt_writers(mnt);
ret = -EROFS;
goto out;
}
+ if (read_seqcount_retry(&mnt->mnt_seqcount, seq))
+ goto again;
out:
preempt_enable();
return ret;
@@ -284,28 +275,22 @@ static int mnt_make_readonly(struct vfsm
int ret = 0;

spin_lock(&vfsmount_lock);
- mnt->mnt_flags |= MNT_WRITE_HOLD;
- /*
- * After storing MNT_WRITE_HOLD, we'll read the counters. This store
- * should be visible before we do.
- */
- smp_mb();
+ /* vfsmount_lock protects mnt_seqcount */
+ write_seqcount_begin(&mnt->mnt_seqcount);

/*
- * With writers on hold, if this value is zero, then there are
- * definitely no active writers (although held writers may subsequently
- * increment the count, they'll have to wait, and decrement it after
- * seeing MNT_READONLY).
+ * Writers will be held in mnt_want_write (although they will be
+ * wildly incrementing and decrementing their write counters). But if
+ * this value is zero, then there are _definitely_ no active writers,
+ * so we can proceed.
*
* It is OK to have counter incremented on one CPU and decremented on
* another: the sum will add up correctly. The danger would be when we
* sum up each counter, if we read a counter before it is incremented,
* but then read another CPU's count which it has been subsequently
* decremented from -- we would see more decrements than we should.
- * MNT_WRITE_HOLD protects against this scenario, because
- * mnt_want_write first increments count, then smp_mb, then spins on
- * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
- * we're counting up here.
+ * However the seqlock in mnt_want_write ensures that increments will
+ * not be decremented by another CPU until we drop the seqcount.
*/
if (count_mnt_writers(mnt) > 0) {
ret = -EBUSY;
@@ -314,12 +299,7 @@ static int mnt_make_readonly(struct vfsm
if (!ret)
mnt->mnt_flags |= MNT_READONLY;
out:
- /*
- * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
- * that become unheld will see MNT_READONLY.
- */
- smp_wmb();
- mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ write_seqcount_end(&mnt->mnt_seqcount);
spin_unlock(&vfsmount_lock);
return ret;
}
Index: linux-2.6/include/linux/mount.h
===================================================================
--- linux-2.6.orig/include/linux/mount.h
+++ linux-2.6/include/linux/mount.h
@@ -13,6 +13,7 @@
#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
+#include <linux/seqlock.h>
#include <asm/atomic.h>

struct super_block;
@@ -29,7 +30,6 @@ struct mnt_namespace;
#define MNT_READONLY 0x40 /* does the user want this to be r/o? */

#define MNT_SHRINKABLE 0x100
-#define MNT_WRITE_HOLD 0x200

#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -64,6 +64,8 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
+
+ seqcount_t mnt_seqcount; /* protects mnt_writers */
#ifdef CONFIG_SMP
int *mnt_writers;
#else

2009-03-11 22:11:32

by Dave Hansen

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

I'm feeling a bit better about these, although I am still honestly quite
afraid of the barriers. I also didn't like all the #ifdefs much, but
here's some help on that.

How about this on top of what you have as a bit of a cleanup? It gets
rid of all the new #ifdefs in .c files?

Did I miss the use of get_mnt_writers_ptr()? I don't think I actually
saw it used anywhere in this pair of patches, so I've stolen it. I
think gcc should compile all this new stuff down to be basically the
same as you had before. The one thing I'm not horribly sure of is the
"out_free_devname:" label. It shouldn't be reachable in the !SMP case.

I could also consolidate the header #ifdefs into a single one if you
think that looks better.

This is just compile tested, btw.

---

linux-2.6.git-dave/fs/namespace.c | 35 ++++++-------------------------
linux-2.6.git-dave/include/linux/mount.h | 30 ++++++++++++++++++++++++--
2 files changed, 35 insertions(+), 30 deletions(-)

diff -puN include/linux/mount.h~move-ifdefs-take2 include/linux/mount.h
--- linux-2.6.git/include/linux/mount.h~move-ifdefs-take2 2009-03-11 15:01:10.000000000 -0700
+++ linux-2.6.git-dave/include/linux/mount.h 2009-03-11 15:02:01.000000000 -0700
@@ -71,15 +71,41 @@ struct vfsmount {
#endif
};

-static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+static inline int *get_mnt_writers_ptr_cpu(struct vfsmount *mnt,
+ int cpu)
{
#ifdef CONFIG_SMP
- return mnt->mnt_writers;
+ return per_cpu_ptr(mnt->mnt_writers, cpu);
#else
return &mnt->mnt_writers;
#endif
}

+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+ return get_mnt_writers_ptr_cpu(mnt, smp_processor_id());
+}
+
+static inline int alloc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ mnt->mnt_writers = alloc_percpu(int);
+ if (!mnt->mnt_writers)
+ return -ENOMEM;
+#else
+ mnt->mnt_writers = 0;
+#endif
+ return 0;
+}
+
+static inline void free_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ free_percpu(mnt->mnt_writers);
+#endif
+}
+
+
static inline struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)
diff -puN fs/namespace.c~move-ifdefs-take2 fs/namespace.c
--- linux-2.6.git/fs/namespace.c~move-ifdefs-take2 2009-03-11 15:01:10.000000000 -0700
+++ linux-2.6.git-dave/fs/namespace.c 2009-03-11 15:04:05.000000000 -0700
@@ -130,20 +130,14 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
-#ifdef CONFIG_SMP
- mnt->mnt_writers = alloc_percpu(int);
- if (!mnt->mnt_writers)
+ err = alloc_mnt_writers(mnt);
+ if (err)
goto out_free_devname;
-#else
- mnt->mnt_writers = 0;
-#endif
}
return mnt;

-#ifdef CONFIG_SMP
out_free_devname:
kfree(mnt->mnt_devname);
-#endif
out_free_id:
mnt_free_id(mnt);
out_free_cache:
@@ -182,36 +176,23 @@ EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void inc_mnt_writers(struct vfsmount *mnt)
{
-#ifdef CONFIG_SMP
- (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
-#else
- mnt->mnt_writers++;
-#endif
+ (*get_mnt_writers_ptr(mnt))++;
}

static inline void dec_mnt_writers(struct vfsmount *mnt)
{
-#ifdef CONFIG_SMP
- (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
-#else
- mnt->mnt_writers--;
-#endif
+ (*get_mnt_writers_ptr(mnt))--;
}

static unsigned int count_mnt_writers(struct vfsmount *mnt)
{
-#ifdef CONFIG_SMP
unsigned int count = 0;
int cpu;

- for_each_possible_cpu(cpu) {
- count += *per_cpu_ptr(mnt->mnt_writers, cpu);
- }
+ for_each_possible_cpu(cpu)
+ count += *get_mnt_writers_ptr_cpu(mnt, cpu);

return count;
-#else
- return mnt->mnt_writers;
-#endif
}

/*
@@ -344,9 +325,7 @@ void free_vfsmnt(struct vfsmount *mnt)
{
kfree(mnt->mnt_devname);
mnt_free_id(mnt);
-#ifdef CONFIG_SMP
- free_percpu(mnt->mnt_writers);
-#endif
+ free_mnt_writers(mnt);
kmem_cache_free(mnt_cache, mnt);
}

_


-- Dave

2009-03-12 04:13:49

by Nick Piggin

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> I'm feeling a bit better about these, although I am still honestly quite
> afraid of the barriers. I also didn't like all the #ifdefs much, but
> here's some help on that.

FWIW, we have this in suse kernels because page fault performance was
so bad compared with SLES10. mnt_want_write & co was I think the 2nd
biggest offender for file backed mappings (after pvops). I think we're
around parity again even with pvops.

Basically I think we have to improve this one way or another in mainline
too. Is there any way to make you feel better about the barriers? More
comments?

mnt_make_readonly()                     mnt_want_write()
1. mnt_flags |= MNT_WRITE_HOLD          A. mnt_writers[x]++
2. smp_mb()                             B. smp_mb()
3. count += mnt_writers[0]              C. while (mnt_flags & MNT_WRITE_HOLD) ;
   ...                                  D. smp_rmb()
   count += mnt_writers[N]              E. if (mnt_flags & MNT_READONLY)
4. if (count == 0)                      F.     mnt_writers[x]-- /* fail */
5.     mnt_flags |= MNT_READONLY        G. else /* success */
6. else /* fail */
7. smp_wmb()
8. mnt_flags &= ~MNT_WRITE_HOLD

* 2 ensures that 1 is visible before 3 is loaded
* B ensures that A is visible before C is loaded
* Therefore, either count != 0 at 4, or C will loop (or both)
* If count == 0
* (make_readonly success)
* C will loop until 8
* D ensures E is not loaded until loop ends
* 7 ensures 5 is visible before 8 is
* Therefore E will find MNT_READONLY (want_write fail)
* If C does not loop
* 4 will find count != 0 (make_readonly fail)
* Therefore 5 is not executed.
* Therefore E will not find MNT_READONLY (want_write success)
* If count != 0 and C loops
* (make_readonly fail)
* 5 will not be executed
* Therefore E will not find MNT_READONLY (want_write success)

I don't know if that helps (I should reference which statements rely
on which). I think it shows that only one or the other can succeed.

It does not illustrate how the loop in the want_write side prevents
the summation from getting confused by decrementing the count on a
different CPU than the one it was incremented on, but I've commented
that case in the code fairly well, I think.
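
The 1/2/3 vs A/B/C interplay is the classic store-buffering pattern. As a
standalone userspace illustration (a model of the kernel barriers using C11
seq_cst fences; all names invented), the fences forbid the outcome where both
sides miss each other's store:

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

atomic_int write_hold, writers;
int r_counters, r_flags;

static void *make_readonly(void *arg)
{
	atomic_store_explicit(&write_hold, 1, memory_order_relaxed); /* 1. */
	atomic_thread_fence(memory_order_seq_cst);                   /* 2. */
	r_counters = atomic_load_explicit(&writers,
					  memory_order_relaxed);     /* 3. */
	return NULL;
}

static void *want_write(void *arg)
{
	atomic_store_explicit(&writers, 1, memory_order_relaxed);    /* A. */
	atomic_thread_fence(memory_order_seq_cst);                   /* B. */
	r_flags = atomic_load_explicit(&write_hold,
				       memory_order_relaxed);        /* C. */
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, make_readonly, NULL);
	pthread_create(&t1, NULL, want_write, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	/* the fences make r_counters == 0 && r_flags == 0 impossible:
	 * either make_readonly sees the writer, or want_write sees
	 * MNT_WRITE_HOLD (or both) */
	assert(r_counters == 1 || r_flags == 1);
	return 0;
}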


> How about this on top of what you have as a bit of a cleanup? It gets
> rid of all the new #ifdefs in .c files?
>
> Did I miss the use of get_mnt_writers_ptr()? I don't think I actually
> saw it used anywhere in this pair of patches, so I've stolen it. I
> think gcc should compile all this new stuff down to be basically the
> same as you had before. The one thing I'm not horribly sure of is the
> "out_free_devname:" label. It shouldn't be reachable in the !SMP case.
>
> I could also consolidate the header #ifdefs into a single one if you
> think that looks better.

I don't like get_mnt_writers_ptr much. The *_mnt_writers functions
are quite primitive and just happen to be in the .c file because they're
private to it. The alloc/free_mnt_writers helpers are good (they could be
in the .c file too?).

Another thing I should probably do is slash away most of the crap from
mnt_want_write in the UP case. It only needs to do a preempt_disable, test
MNT_READONLY, and increment mnt_writers (and similarly for mnt_make_readonly).
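
For the UP case, something like this minimal variant, perhaps (a sketch only,
assuming the plain int mnt_writers from patch 1):

#ifndef CONFIG_SMP
int mnt_want_write(struct vfsmount *mnt)
{
	int ret = 0;

	/* no MNT_WRITE_HOLD spinning and no barriers needed: with
	 * preemption disabled, mnt_make_readonly() cannot run
	 * concurrently on a UP kernel */
	preempt_disable();
	if (__mnt_is_readonly(mnt))
		ret = -EROFS;
	else
		mnt->mnt_writers++;
	preempt_enable();
	return ret;
}
#endif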

2009-03-18 19:13:58

by Dave Hansen

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, 2009-03-12 at 05:13 +0100, Nick Piggin wrote:
> On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> > I'm feeling a bit better about these, although I am still honestly quite
> > afraid of the barriers. I also didn't like all the #ifdefs much, but
> > here's some help on that.
>
> FWIW, we have this in suse kernels because page fault performance was
> so bad compared with SLES10. mnt_want_write & co was I think the 2nd
> biggest offender for file backed mappings (after pvops). I think we're
> around parity again even with pvops.

Page faults themselves? Which path was that from?

> Basically I think we have to improve this one way or another in mainline
> too. Is there any way to make you feel better about the barriers? More
> comments?
>
> mnt_make_readonly()                     mnt_want_write()
> 1. mnt_flags |= MNT_WRITE_HOLD          A. mnt_writers[x]++
> 2. smp_mb()                             B. smp_mb()
> 3. count += mnt_writers[0]              C. while (mnt_flags & MNT_WRITE_HOLD) ;
>    ...                                  D. smp_rmb()
>    count += mnt_writers[N]              E. if (mnt_flags & MNT_READONLY)
> 4. if (count == 0)                      F.     mnt_writers[x]-- /* fail */
> 5.     mnt_flags |= MNT_READONLY        G. else /* success */
> 6. else /* fail */
> 7. smp_wmb()
> 8. mnt_flags &= ~MNT_WRITE_HOLD
>
> * 2 ensures that 1 is visible before 3 is loaded
> * B ensures that A is visible before C is loaded
> * Therefore, either count != 0 at 4, or C will loop (or both)
> * If count == 0
> * (make_readonly success)
> * C will loop until 8
> * D ensures E is not loaded until loop ends
> * 7 ensures 5 is visible before 8 is
> * Therefore E will find MNT_READONLY (want_write fail)
> * If C does not loop
> * 4 will find count != 0 (make_readonly fail)
> * Therefore 5 is not executed.
> * Therefore E will not find MNT_READONLY (want_write success)
> * If count != 0 and C loops
> * (make_readonly fail)
> * 5 will not be executed
> * Therefore E will not find MNT_READONLY (want_write success)

It is great to spell it out like that. But, honestly, I think the code
and comments that are there are probably better than looking at an
out-of-line description like that.

> I don't know if that helps (I should reference which statements rely
> on which). I think it shows that only one or the other can succeed.
>
> It does not illustrate how the loop in the want_write side prevents
> the summation from getting confused by decrementing the count on a different
> CPU than the one it was incremented on, but I've commented that case in the code
> fairly well I think.

I think you mentioned a seqlock being a possibility here. That would
slot in as a replacement for MNT_WRITE_HOLD, right? mnt_make_readonly()
takes a seq_write, mnt_want_write() takes a seq_read and doesn't
consider MNT_READONLY valid until it gets a clear read of it. It would
internalize all the barriers into the seqlock implementation except for
the mnt_writers[]-related ones.

As for mnt_writers, you'd need an smp_rmb() in mnt_make_readonly()
before reading the counters and an smp_wmb() after writing the counter
in mnt_want_write().

I see that those are effectively consolidated down in your version into
a single smp_mb() at both call sites when combined with the
MNT_WRITE_HOLD barriers.

If we can get this down to two explicit calls to the barrier functions
that are paired and very clear, I think I'd be much more OK with it.

> > How about this on top of what you have as a bit of a cleanup? It gets
> > rid of all the new #ifdefs in .c files?
> >
> > Did I miss the use of get_mnt_writers_ptr()? I don't think I actually
> > saw it used anywhere in this pair of patches, so I've stolen it. I
> > think gcc should compile all this new stuff down to be basically the
> > same as you had before. The one thing I'm not horribly sure of is the
> > "out_free_devname:" label. It shouldn't be reachable in the !SMP case.
> >
> > I could also consolidate the header #ifdefs into a single one if you
> > think that looks better.
>
> I don't like the get_mnt_writers_ptr terribly. The *_mnt_writers functions
> are quite primitive and just happen to be in the .c file because they're
> private to it. The alloc/free_mnt_writers is good (they could be
> in the .c file too?).

Yeah, it could all move to the .c file.

> Another thing I should probably do is slash away most of the crap from
> mnt_want_write in the UP case. It only needs to do a preempt_disable,
> test MNT_READONLY, increment mnt_writers (and similarly for mnt_make_readonly)

Yeah, that's true. I probably tried to prematurely optimize it.

-- Dave

2009-04-02 18:11:41

by Andrew Morton

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Wed, 11 Mar 2009 15:11:17 -0700
Dave Hansen <[email protected]> wrote:

> I'm feeling a bit better about these, although I am still honestly quite
> afraid of the barriers. I also didn't like all the #ifdefs much, but
> here's some help on that.

Do we need the ifdefs at all?

> How about this on top of what you have as a bit of a cleanup? It gets
> rid of all the new #ifdefs in .c files?
>
> Did I miss the use of get_mnt_writers_ptr()? I don't think I actually
> saw it used anywhere in this pair of patches, so I've stolen it. I
> think gcc should compile all this new stuff down to be basically the
> same as you had before. The one thing I'm not horribly sure of is the
> "out_free_devname:" label. It shouldn't be reachable in the !SMP case.
>
> I could also consolidate the header #ifdefs into a single one if you
> think that looks better.
>
> This is just compile tested, btw.
>
> ---
>
> linux-2.6.git-dave/fs/namespace.c | 35 ++++++-------------------------
> linux-2.6.git-dave/include/linux/mount.h | 30 ++++++++++++++++++++++++--
> 2 files changed, 35 insertions(+), 30 deletions(-)
>
> diff -puN include/linux/mount.h~move-ifdefs-take2 include/linux/mount.h
> --- linux-2.6.git/include/linux/mount.h~move-ifdefs-take2 2009-03-11 15:01:10.000000000 -0700
> +++ linux-2.6.git-dave/include/linux/mount.h 2009-03-11 15:02:01.000000000 -0700
> @@ -71,15 +71,41 @@ struct vfsmount {
> #endif
> };
>
> -static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
> +static inline int *get_mnt_writers_ptr_cpu(struct vfsmount *mnt,
> + int cpu)
> {
> #ifdef CONFIG_SMP
> - return mnt->mnt_writers;
> + return per_cpu_ptr(mnt->mnt_writers, cpu);
> #else
> return &mnt->mnt_writers;
> #endif
> }
>
> +static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
> +{
> + return get_mnt_writers_ptr_cpu(mnt, smp_processor_id());
> +}
> +
> +static inline int alloc_mnt_writers(struct vfsmount *mnt)
> +{
> +#ifdef CONFIG_SMP
> + mnt->mnt_writers = alloc_percpu(int);
> + if (!mnt->mnt_writers)
> + return -ENOMEM;
> +#else
> + mnt->mnt_writers = 0;
> +#endif
> + return 0;
> +}

If the CONFIG_SMP=n code is just removed, the percpu code should do the
right thing with CONFIG_SMP=n. There is the additional pointer indirection,
though, I guess. Do we do it often enough to be concerned about the cost?

2009-04-02 18:22:26

by Nick Piggin

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Wed, Mar 18, 2009 at 12:13:43PM -0700, Dave Hansen wrote:
> On Thu, 2009-03-12 at 05:13 +0100, Nick Piggin wrote:
> > On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> > > I'm feeling a bit better about these, although I am still honestly quite
> > > afraid of the barriers. I also didn't like all the #ifdefs much, but
> > > here's some help on that.
> >
> > FWIW, we have this in suse kernels because page fault performance was
> > so bad compared with SLES10. mnt_want_write & co was I think the 2nd
> > biggest offender for file backed mappings (after pvops). I think we're
> > around parity again even with pvops.
>
> Page faults themselves? Which path was that from?

Yes. file_update_time.


> > Basically I think we have to improve this one way or another in mainline
> > too. Is there any way to make you feel better about the barriers? More
> > comments?
> >
> > mnt_make_readonly()                     mnt_want_write()
> > 1. mnt_flags |= MNT_WRITE_HOLD          A. mnt_writers[x]++
> > 2. smp_mb()                             B. smp_mb()
> > 3. count += mnt_writers[0]              C. while (mnt_flags & MNT_WRITE_HOLD) ;
> >    ...                                  D. smp_rmb()
> >    count += mnt_writers[N]              E. if (mnt_flags & MNT_READONLY)
> > 4. if (count == 0)                      F.     mnt_writers[x]-- /* fail */
> > 5.     mnt_flags |= MNT_READONLY        G. else /* success */
> > 6. else /* fail */
> > 7. smp_wmb()
> > 8. mnt_flags &= ~MNT_WRITE_HOLD
> >
> > * 2 ensures that 1 is visible before 3 is loaded
> > * B ensures that A is visible before C is loaded
> > * Therefore, either count != 0 at 4, or C will loop (or both)
> > * If count == 0
> > * (make_readonly success)
> > * C will loop until 8
> > * D ensures E is not loaded until loop ends
> > * 7 ensures 5 is visible before 8 is
> > * Therefore E will find MNT_READONLY (want_write fail)
> > * If C does not loop
> > * 4 will find count != 0 (make_readonly fail)
> > * Therefore 5 is not executed.
> > * Therefore E will not find MNT_READONLY (want_write success)
> > * If count != 0 and C loops
> > * (make_readonly fail)
> > * 5 will not be executed
> > * Therefore E will not find MNT_READONLY (want_write success)
>
> It is great to spell it out like that. But, honestly, I think the code
> and comments that are there are probably better than looking at an out
> of line description like that.

Well, I think the above gives higher confidence of correctness
than the code and comments posted (at least, until the reader
works through similar steps themselves). If you don't like it,
it could just go into the patch changelog.


> > I don't know if that helps (I should reference which statements rely
> > on which). I think it shows that either one or the other only must
> > succeed.
> >
> > It does not illustrate how the loop in the want_write side prevents
> > the sumation from getting confused by decrementing count on a different
> > CPU than it was incremented, but I've commented that case in the code
> > fairly well I think.
>
> I think you mentioned a seqlock being a possibility here. That would
> slot in as a replacement for MNT_WRITE_HOLD, right? mnt_make_readonly()
> takes a seq_write, mnt_want_write() takes a seq_read and doesn't
> consider MNT_READONLY valid until it gets a clear read of it. It would
> internalize all the barriers into the seqlock implementation except for
> the mnt_writers[]-related ones.
>
> As for mnt_writers, you'd need an smp_rmb() in mnt_make_readonly()
> before reading the counters and an smp_wmb() after writing the counter
> in mnt_want_write().
>
> Is see that those are effectively consolidated down in your version into
> a single smp_mb() at both call sites when combined with the
> MNT_WRITE_HOLD barriers.
>
> If we can get this down to two explicit calls to the barrier functions
> that paired and very clear, I think I'd be much more OK with it.

seqlock, as I said, doesn't get its barriers quite right for having
stores in the read-protected section. I was musing on the possibility,
but I think it ends up being less clear if you just end up having
to a) look at the seqlock implementation, b) work out that it is
insufficient, and c) add barriers to make up for it. May as well just
open-code it here with sufficient comments to justify correctness
(which I think I have come up with).

2009-04-02 18:37:42

by Dave Hansen

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, 2009-04-02 at 20:22 +0200, Nick Piggin wrote:
> On Wed, Mar 18, 2009 at 12:13:43PM -0700, Dave Hansen wrote:
> > On Thu, 2009-03-12 at 05:13 +0100, Nick Piggin wrote:
> > > On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> > > > I'm feeling a bit better about these, although I am still honestly quite
> > > > afraid of the barriers. I also didn't like all the #ifdefs much, but
> > > > here's some help on that.
> > >
> > > FWIW, we have this in suse kernels because page fault performance was
> > > so bad compared with SLES10. mnt_want_write & co was I think the 2nd
> > > biggest offender for file backed mappings (after pvops). I think we're
> > > around parity again even with pvops.
> >
> > Page faults themselves? Which path was that from?
>
> Yes. file_update_time.

We should be able to use your mnt_clone_write() optimization separately
from the mnt_want_write() speedup here, right?

-- Dave

2009-04-02 18:43:35

by Al Viro

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, Apr 02, 2009 at 08:22:10PM +0200, Nick Piggin wrote:
> On Wed, Mar 18, 2009 at 12:13:43PM -0700, Dave Hansen wrote:
> > On Thu, 2009-03-12 at 05:13 +0100, Nick Piggin wrote:
> > > On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> > > > I'm feeling a bit better about these, although I am still honestly quite
> > > > afraid of the barriers. I also didn't like all the #ifdefs much, but
> > > > here's some help on that.
> > >
> > > FWIW, we have this in suse kernels because page fault performance was
> > > so bad compared with SLES10. mnt_want_write & co was I think the 2nd
> > > biggest offender for file backed mappings (after pvops). I think we're
> > > around parity again even with pvops.
> >
> > Page faults themselves? Which path was that from?
>
> Yes. file_update_time.

FWIW, I'm not sure that this optimization is valid. We might eventually
want to go for "don't allow any new writers, remount r/o when existing
ones expire" functionality, so nested mnt_want_write() might eventually
be allowed to fail.

2009-04-02 18:48:58

by Al Viro

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, Apr 02, 2009 at 07:43:05PM +0100, Al Viro wrote:

> FWIW, I'm not sure that this optimization is valid. We might eventually
> want to go for "don't allow any new writers, remount r/o when existing
> ones expire" functionality, so nested mnt_want_write() might eventually
> be allowed to fail.

BTW, I think that 1/2 as of Mar 10 is worth merging. There's an interesting
question, though - do we want mnt_writers int or long? On 64bit boxen it
can get serious...

2009-04-02 19:08:31

by Dave Hansen

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, 2009-04-02 at 19:43 +0100, Al Viro wrote:
> On Thu, Apr 02, 2009 at 08:22:10PM +0200, Nick Piggin wrote:
> > On Wed, Mar 18, 2009 at 12:13:43PM -0700, Dave Hansen wrote:
> > > On Thu, 2009-03-12 at 05:13 +0100, Nick Piggin wrote:
> > > > On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> > > > > I'm feeling a bit better about these, although I am still honestly quite
> > > > > afraid of the barriers. I also didn't like all the #ifdefs much, but
> > > > > here's some help on that.
> > > >
> > > > FWIW, we have this in suse kernels because page fault performance was
> > > > so bad compared with SLES10. mnt_want_write & co was I think the 2nd
> > > > biggest offender for file backed mappings (after pvops). I think we're
> > > > around parity again even with pvops.
> > >
> > > Page faults themselves? Which path was that from?
> >
> > Yes. file_update_time.
>
> FWIW, I'm not sure that this optimization is valid. We might eventually
> want to go for "don't allow any new writers, remount r/o when existing
> ones expire" functionality, so nested mnt_want_write() might eventually
> be allowed to fail.

That makes sense on a larger scale definitely.

But I do wonder about file_update_time() specifically, especially since
its mnt_want_write() is never persistent and it is always done under the
cover of a FMODE_WRITE 'struct file'. Do we strictly even need the
mnt_want/drop_write() pair in here at all right now?

-- Dave

2009-04-02 20:32:15

by Christoph Hellwig

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, Apr 02, 2009 at 11:37:20AM -0700, Dave Hansen wrote:
> > > Page faults themselves? Which path was that from?
> >
> > Yes. file_update_time.
>
> We should be able to use your mnt_clone_write() optimization separately
> from the mnt_want_write() speedup here, right?

Does file_update_time even need a separate mnt_want_write? It should
always be after a mnt_want_write which we need for the actual data
write.

2009-04-03 01:30:18

by Nick Piggin

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, Apr 02, 2009 at 11:37:20AM -0700, Dave Hansen wrote:
> On Thu, 2009-04-02 at 20:22 +0200, Nick Piggin wrote:
> > On Wed, Mar 18, 2009 at 12:13:43PM -0700, Dave Hansen wrote:
> > > On Thu, 2009-03-12 at 05:13 +0100, Nick Piggin wrote:
> > > > On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> > > > > I'm feeling a bit better about these, although I am still honestly quite
> > > > > afraid of the barriers. I also didn't like all the #ifdefs much, but
> > > > > here's some help on that.
> > > >
> > > > FWIW, we have this in suse kernels because page fault performance was
> > > > so bad compared with SLES10. mnt_want_write & co was I think the 2nd
> > > > biggest offender for file backed mappings (after pvops). I think we're
> > > > around parity again even with pvops.
> > >
> > > Page faults themselves? Which path was that from?
> >
> > Yes. file_update_time.
>
> We should be able to use your mnt_clone_write() optimization separate
> from the mnt_want_write() speedup here, right?

Yes that should work here.

2009-04-03 01:31:59

by Nick Piggin

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, Apr 02, 2009 at 07:43:05PM +0100, Al Viro wrote:
> On Thu, Apr 02, 2009 at 08:22:10PM +0200, Nick Piggin wrote:
> > On Wed, Mar 18, 2009 at 12:13:43PM -0700, Dave Hansen wrote:
> > > On Thu, 2009-03-12 at 05:13 +0100, Nick Piggin wrote:
> > > > On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> > > > > I'm feeling a bit better about these, although I am still honestly quite
> > > > > afraid of the barriers. I also didn't like all the #ifdefs much, but
> > > > > here's some help on that.
> > > >
> > > > FWIW, we have this in suse kernels because page fault performance was
> > > > so bad compared with SLES10. mnt_want_write & co was I think the 2nd
> > > > biggest offender for file backed mappings (after pvops). I think we're
> > > > around parity again even with pvops.
> > >
> > > Page faults themselves? Which path was that from?
> >
> > Yes. file_update_time.
>
> FWIW, I'm not sure that this optimization is valid. We might eventually
> want to go for "don't allow any new writers, remount r/o when existing
> ones expire" functionality, so nested mnt_want_write() might eventually
> be allowed to fail.

I guess then it would need another flag to check besides
RDONLY (BECOMING_RDONLY or something).

2009-04-03 10:31:21

by Al Viro

Subject: Re: [patch 1/2] fs: mnt_want_write speedup

On Thu, Apr 02, 2009 at 12:08:07PM -0700, Dave Hansen wrote:
> On Thu, 2009-04-02 at 19:43 +0100, Al Viro wrote:
> > On Thu, Apr 02, 2009 at 08:22:10PM +0200, Nick Piggin wrote:
> > > On Wed, Mar 18, 2009 at 12:13:43PM -0700, Dave Hansen wrote:
> > > > On Thu, 2009-03-12 at 05:13 +0100, Nick Piggin wrote:
> > > > > On Wed, Mar 11, 2009 at 03:11:17PM -0700, Dave Hansen wrote:
> > > > > > I'm feeling a bit better about these, although I am still honestly quite
> > > > > > afraid of the barriers. I also didn't like all the #ifdefs much, but
> > > > > > here's some help on that.
> > > > >
> > > > > FWIW, we have this in suse kernels because page fault performance was
> > > > > so bad compared with SLES10. mnt_want_write & co was I think the 2nd
> > > > > biggest offender for file backed mappings (after pvops). I think we're
> > > > > around parity again even with pvops.
> > > >
> > > > Page faults themselves? Which path was that from?
> > >
> > > Yes. file_update_time.
> >
> > FWIW, I'm not sure that this optimization is valid. We might eventually
> > want to go for "don't allow any new writers, remount r/o when existing
> > ones expire" functionality, so nested mnt_want_write() might eventually
> > be allowed to fail.
>
> That makes sense on a larger scale definitely.
>
> But I do wonder about file_update_time() specifically, especially since
> its mnt_want_write() is never persistent and it is always done under the
> cover of a FMODE_WRITE 'struct file'. Do we strictly even need the
> mnt_want/drop_write() pair in here at all right now?

mnt_want_write() checks for the superblock having gone read-only as well, and
we want to preserve that in some form. OTOH, we probably want to move
that upstream from that place anyway.

BTW, mmap()/dirty a page/have the filesystem forcibly go r/o on its own and
see how messy it gets is a useful regression test...