2013-10-03 06:20:20

by Al Viro

Subject: [PATCH 17/17] RCU'd vfsmounts


_very_ preliminary, barely tested.

Signed-off-by: Al Viro <[email protected]>

diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 0ff4bae..3b79d15 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -123,6 +123,7 @@ static void adfs_put_super(struct super_block *sb)
for (i = 0; i < asb->s_map_size; i++)
brelse(asb->s_map[i].dm_bh);
kfree(asb->s_map);
+ synchronize_rcu();
kfree(asb);
sb->s_fs_info = NULL;
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index b104726..07599e2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -62,6 +62,7 @@ void autofs4_kill_sb(struct super_block *sb)
/* Free wait queues, close pipe */
autofs4_catatonic_mode(sbi);

+ synchronize_rcu();
sb->s_fs_info = NULL;
kfree(sbi);

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a279ffc..e0305ae 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3779,6 +3779,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb)

bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb->mountdata);
+ synchronize_rcu();
unload_nls(cifs_sb->local_nls);
kfree(cifs_sb);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index d888223..ae74923 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1075,116 +1075,6 @@ void shrink_dcache_sb(struct super_block *sb)
EXPORT_SYMBOL(shrink_dcache_sb);

/*
- * destroy a single subtree of dentries for unmount
- * - see the comments on shrink_dcache_for_umount() for a description of the
- * locking
- */
-static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
-{
- struct dentry *parent;
-
- BUG_ON(!IS_ROOT(dentry));
-
- for (;;) {
- /* descend to the first leaf in the current subtree */
- while (!list_empty(&dentry->d_subdirs))
- dentry = list_entry(dentry->d_subdirs.next,
- struct dentry, d_u.d_child);
-
- /* consume the dentries from this leaf up through its parents
- * until we find one with children or run out altogether */
- do {
- struct inode *inode;
-
- /*
- * inform the fs that this dentry is about to be
- * unhashed and destroyed.
- */
- if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
- !d_unhashed(dentry))
- dentry->d_op->d_prune(dentry);
-
- dentry_lru_del(dentry);
- __d_shrink(dentry);
-
- if (dentry->d_lockref.count != 0) {
- printk(KERN_ERR
- "BUG: Dentry %p{i=%lx,n=%s}"
- " still in use (%d)"
- " [unmount of %s %s]\n",
- dentry,
- dentry->d_inode ?
- dentry->d_inode->i_ino : 0UL,
- dentry->d_name.name,
- dentry->d_lockref.count,
- dentry->d_sb->s_type->name,
- dentry->d_sb->s_id);
- BUG();
- }
-
- if (IS_ROOT(dentry)) {
- parent = NULL;
- list_del(&dentry->d_u.d_child);
- } else {
- parent = dentry->d_parent;
- parent->d_lockref.count--;
- list_del(&dentry->d_u.d_child);
- }
-
- inode = dentry->d_inode;
- if (inode) {
- dentry->d_inode = NULL;
- hlist_del_init(&dentry->d_alias);
- if (dentry->d_op && dentry->d_op->d_iput)
- dentry->d_op->d_iput(dentry, inode);
- else
- iput(inode);
- }
-
- d_free(dentry);
-
- /* finished when we fall off the top of the tree,
- * otherwise we ascend to the parent and move to the
- * next sibling if there is one */
- if (!parent)
- return;
- dentry = parent;
- } while (list_empty(&dentry->d_subdirs));
-
- dentry = list_entry(dentry->d_subdirs.next,
- struct dentry, d_u.d_child);
- }
-}
-
-/*
- * destroy the dentries attached to a superblock on unmounting
- * - we don't need to use dentry->d_lock because:
- * - the superblock is detached from all mountings and open files, so the
- * dentry trees will not be rearranged by the VFS
- * - s_umount is write-locked, so the memory pressure shrinker will ignore
- * any dentries belonging to this superblock that it comes across
- * - the filesystem itself is no longer permitted to rearrange the dentries
- * in this superblock
- */
-void shrink_dcache_for_umount(struct super_block *sb)
-{
- struct dentry *dentry;
-
- if (down_read_trylock(&sb->s_umount))
- BUG();
-
- dentry = sb->s_root;
- sb->s_root = NULL;
- dentry->d_lockref.count--;
- shrink_dcache_for_umount_subtree(dentry);
-
- while (!hlist_bl_empty(&sb->s_anon)) {
- dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
- shrink_dcache_for_umount_subtree(dentry);
- }
-}
-
-/*
* This tries to ascend one level of parenthood, but
* we can race with renaming, so we need to re-check
* the parenthood after dropping the lock and check
@@ -1478,6 +1368,90 @@ void shrink_dcache_parent(struct dentry *parent)
}
EXPORT_SYMBOL(shrink_dcache_parent);

+static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry)
+{
+ struct select_data *data = _data;
+ enum d_walk_ret ret = D_WALK_CONTINUE;
+
+ if (dentry->d_lockref.count) {
+ dentry_lru_del(dentry);
+ if (likely(!list_empty(&dentry->d_subdirs)))
+ goto out;
+ if (dentry == data->start && dentry->d_lockref.count == 1)
+ goto out;
+ printk(KERN_ERR
+ "BUG: Dentry %p{i=%lx,n=%s}"
+ " still in use (%d)"
+ " [unmount of %s %s]\n",
+ dentry,
+ dentry->d_inode ?
+ dentry->d_inode->i_ino : 0UL,
+ dentry->d_name.name,
+ dentry->d_lockref.count,
+ dentry->d_sb->s_type->name,
+ dentry->d_sb->s_id);
+ BUG();
+ } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+ /*
+ * We can't use d_lru_shrink_move() because we
+ * need to get the global LRU lock and do the
+ * LRU accounting.
+ */
+ d_lru_del(dentry);
+ d_shrink_add(dentry, &data->dispose);
+ data->found++;
+ ret = D_WALK_NORETRY;
+ }
+out:
+ if (data->found && need_resched())
+ ret = D_WALK_QUIT;
+ return ret;
+}
+
+/*
+ * destroy the dentries attached to a superblock on unmounting
+ */
+void shrink_dcache_for_umount(struct super_block *sb)
+{
+ struct dentry *dentry;
+
+ if (down_read_trylock(&sb->s_umount))
+ BUG();
+
+ dentry = sb->s_root;
+ sb->s_root = NULL;
+ for (;;) {
+ struct select_data data;
+
+ INIT_LIST_HEAD(&data.dispose);
+ data.start = dentry;
+ data.found = 0;
+
+ d_walk(dentry, &data, umount_collect, NULL);
+ if (!data.found)
+ break;
+
+ shrink_dentry_list(&data.dispose);
+ cond_resched();
+ }
+ d_drop(dentry);
+ dput(dentry);
+
+ while (!hlist_bl_empty(&sb->s_anon)) {
+ struct select_data data;
+ dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
+
+ INIT_LIST_HEAD(&data.dispose);
+ data.start = NULL;
+ data.found = 0;
+
+ d_walk(dentry, &data, umount_collect, NULL);
+ if (data.found)
+ shrink_dentry_list(&data.dispose);
+ cond_resched();
+ }
+}
+
static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
{
struct select_data *data = _data;
@@ -2885,24 +2859,28 @@ static int prepend_path(const struct path *path,
struct vfsmount *vfsmnt = path->mnt;
struct mount *mnt = real_mount(vfsmnt);
int error = 0;
- unsigned seq = 0;
+ unsigned seq, m_seq = 0;
char *bptr;
int blen;

- br_read_lock(&vfsmount_lock);
rcu_read_lock();
+restart_mnt:
+ read_seqbegin_or_lock(&mount_lock, &m_seq);
+ seq = 0;
restart:
bptr = *buffer;
blen = *buflen;
+ error = 0;
read_seqbegin_or_lock(&rename_lock, &seq);
while (dentry != root->dentry || vfsmnt != root->mnt) {
struct dentry * parent;

if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+ struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
/* Global root? */
- if (mnt_has_parent(mnt)) {
- dentry = mnt->mnt_mountpoint;
- mnt = mnt->mnt_parent;
+ if (mnt != parent) {
+ dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+ mnt = parent;
vfsmnt = &mnt->mnt;
continue;
}
@@ -2936,7 +2914,11 @@ restart:
goto restart;
}
done_seqretry(&rename_lock, seq);
- br_read_unlock(&vfsmount_lock);
+ if (need_seqretry(&mount_lock, m_seq)) {
+ m_seq = 1;
+ goto restart_mnt;
+ }
+ done_seqretry(&mount_lock, m_seq);

if (error >= 0 && bptr == *buffer) {
if (--blen < 0)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0062da2..3d297e6 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -557,6 +557,7 @@ static void fat_put_super(struct super_block *sb)
iput(sbi->fsinfo_inode);
iput(sbi->fat_inode);

+ synchronize_rcu();
unload_nls(sbi->nls_disk);
unload_nls(sbi->nls_io);

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a8ce6da..2dfd2b4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -387,6 +387,7 @@ static void fuse_put_super(struct super_block *sb)
mutex_unlock(&fuse_mutex);
fuse_bdi_destroy(fc);

+ synchronize_rcu();
fuse_conn_put(fc);
}

diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 4334cda..2946c6b 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -109,6 +109,7 @@ static void hpfs_put_super(struct super_block *s)
unmark_dirty(s);
hpfs_unlock(s);

+ synchronize_rcu();
kfree(sbi->sb_cp_table);
kfree(sbi->sb_bmp_dir);
s->s_fs_info = NULL;
diff --git a/fs/mount.h b/fs/mount.h
index f086607..d64c594 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,7 +1,6 @@
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
-#include <linux/lglock.h>

struct mnt_namespace {
atomic_t count;
@@ -30,6 +29,7 @@ struct mount {
struct mount *mnt_parent;
struct dentry *mnt_mountpoint;
struct vfsmount mnt;
+ struct rcu_head mnt_rcu;
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
#else
@@ -80,21 +80,23 @@ static inline int is_mounted(struct vfsmount *mnt)
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);

+extern bool legitimize_mnt(struct vfsmount *, unsigned);
+
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
atomic_inc(&ns->count);
}

-extern struct lglock vfsmount_lock;
+extern seqlock_t mount_lock;

static inline void lock_mount_hash(void)
{
- br_write_lock(&vfsmount_lock);
+ write_seqlock(&mount_lock);
}

static inline void unlock_mount_hash(void)
{
- br_write_unlock(&vfsmount_lock);
+ write_sequnlock(&mount_lock);
}

struct proc_mounts {
diff --git a/fs/namei.c b/fs/namei.c
index 1f844fb..4b4310a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -482,18 +482,6 @@ EXPORT_SYMBOL(path_put);
* to restart the path walk from the beginning in ref-walk mode.
*/

-static inline void lock_rcu_walk(void)
-{
- br_read_lock(&vfsmount_lock);
- rcu_read_lock();
-}
-
-static inline void unlock_rcu_walk(void)
-{
- rcu_read_unlock();
- br_read_unlock(&vfsmount_lock);
-}
-
/**
* unlazy_walk - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
@@ -512,26 +500,23 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
BUG_ON(!(nd->flags & LOOKUP_RCU));

/*
- * Get a reference to the parent first: we're
- * going to make "path_put(nd->path)" valid in
- * non-RCU context for "terminate_walk()".
- *
- * If this doesn't work, return immediately with
- * RCU walking still active (and then we will do
- * the RCU walk cleanup in terminate_walk()).
+ * After legitimizing the bastards, terminate_walk()
+ * will do the right thing for non-RCU mode, and all our
+ * subsequent exit cases should rcu_read_unlock()
+ * before returning. Do vfsmount first; if dentry
+ * can't be legitimized, just set nd->path.dentry to NULL
+ * and rely on dput(NULL) being a no-op.
*/
- if (!lockref_get_not_dead(&parent->d_lockref))
+ if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
return -ECHILD;
-
- /*
- * After the mntget(), we terminate_walk() will do
- * the right thing for non-RCU mode, and all our
- * subsequent exit cases should unlock_rcu_walk()
- * before returning.
- */
- mntget(nd->path.mnt);
nd->flags &= ~LOOKUP_RCU;

+ if (!lockref_get_not_dead(&parent->d_lockref)) {
+ nd->path.dentry = NULL;
+ rcu_read_unlock();
+ return -ECHILD;
+ }
+
/*
* For a negative lookup, the lookup sequence point is the parents
* sequence point, and it only needs to revalidate the parent dentry.
@@ -566,17 +551,17 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
spin_unlock(&fs->lock);
}

- unlock_rcu_walk();
+ rcu_read_unlock();
return 0;

unlock_and_drop_dentry:
spin_unlock(&fs->lock);
drop_dentry:
- unlock_rcu_walk();
+ rcu_read_unlock();
dput(dentry);
goto drop_root_mnt;
out:
- unlock_rcu_walk();
+ rcu_read_unlock();
drop_root_mnt:
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
@@ -608,17 +593,22 @@ static int complete_walk(struct nameidata *nd)
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;

+ if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+ rcu_read_unlock();
+ return -ECHILD;
+ }
if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
- unlock_rcu_walk();
+ rcu_read_unlock();
+ mntput(nd->path.mnt);
return -ECHILD;
}
if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
- unlock_rcu_walk();
+ rcu_read_unlock();
dput(dentry);
+ mntput(nd->path.mnt);
return -ECHILD;
}
- mntget(nd->path.mnt);
- unlock_rcu_walk();
+ rcu_read_unlock();
}

if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -909,15 +899,15 @@ int follow_up(struct path *path)
struct mount *parent;
struct dentry *mountpoint;

- br_read_lock(&vfsmount_lock);
+ read_seqlock_excl(&mount_lock);
parent = mnt->mnt_parent;
if (parent == mnt) {
- br_read_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
return 0;
}
mntget(&parent->mnt);
mountpoint = dget(mnt->mnt_mountpoint);
- br_read_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
dput(path->dentry);
path->dentry = mountpoint;
mntput(path->mnt);
@@ -1048,8 +1038,8 @@ static int follow_managed(struct path *path, unsigned flags)

/* Something is mounted on this dentry in another
* namespace and/or whatever was mounted there in this
- * namespace got unmounted before we managed to get the
- * vfsmount_lock */
+ * namespace got unmounted before lookup_mnt() could
+ * get it */
}

/* Handle an automount point */
@@ -1174,7 +1164,7 @@ failed:
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
- unlock_rcu_walk();
+ rcu_read_unlock();
return -ECHILD;
}

@@ -1501,7 +1491,7 @@ static void terminate_walk(struct nameidata *nd)
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
- unlock_rcu_walk();
+ rcu_read_unlock();
}
}

@@ -1862,7 +1852,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
- lock_rcu_walk();
+ rcu_read_lock();
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
path_get(&nd->path);
@@ -1872,9 +1862,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,

nd->root.mnt = NULL;

+ nd->m_seq = read_seqbegin(&mount_lock);
if (*name=='/') {
if (flags & LOOKUP_RCU) {
- lock_rcu_walk();
+ rcu_read_lock();
set_root_rcu(nd);
} else {
set_root(nd);
@@ -1886,7 +1877,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
struct fs_struct *fs = current->fs;
unsigned seq;

- lock_rcu_walk();
+ rcu_read_lock();

do {
seq = read_seqcount_begin(&fs->seq);
@@ -1918,7 +1909,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (f.need_put)
*fp = f.file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
- lock_rcu_walk();
+ rcu_read_lock();
} else {
path_get(&nd->path);
fdput(f);
diff --git a/fs/namespace.c b/fs/namespace.c
index 8ae16b9f..1711536 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(fs_kobj);
* It should be taken for write in all cases where the vfsmount
* tree or hash is modified or when a vfsmount structure is modified.
*/
-DEFINE_BRLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -547,16 +547,38 @@ static void free_vfsmnt(struct mount *mnt)
kmem_cache_free(mnt_cache, mnt);
}

+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+ struct mount *mnt;
+ if (read_seqretry(&mount_lock, seq))
+ return false;
+ if (bastard == NULL)
+ return true;
+ mnt = real_mount(bastard);
+ mnt_add_count(mnt, 1);
+ if (likely(!read_seqretry(&mount_lock, seq)))
+ return true;
+ if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+ mnt_add_count(mnt, -1);
+ return false;
+ }
+ rcu_read_unlock();
+ mntput(bastard);
+ rcu_read_lock();
+ return false;
+}
+
/*
* find the first mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * call under rcu_read_lock()
*/
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
struct list_head *head = mount_hashtable + hash(mnt, dentry);
struct mount *p;

- list_for_each_entry(p, head, mnt_hash)
+ list_for_each_entry_rcu(p, head, mnt_hash)
if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
return p;
return NULL;
@@ -564,7 +586,7 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)

/*
* find the last mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * mount_lock must be held.
*/
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -596,17 +618,17 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
struct vfsmount *lookup_mnt(struct path *path)
{
struct mount *child_mnt;
+ struct vfsmount *m;
+ unsigned seq;

- br_read_lock(&vfsmount_lock);
- child_mnt = __lookup_mnt(path->mnt, path->dentry);
- if (child_mnt) {
- mnt_add_count(child_mnt, 1);
- br_read_unlock(&vfsmount_lock);
- return &child_mnt->mnt;
- } else {
- br_read_unlock(&vfsmount_lock);
- return NULL;
- }
+ rcu_read_lock();
+ do {
+ seq = read_seqbegin(&mount_lock);
+ child_mnt = __lookup_mnt(path->mnt, path->dentry);
+ m = child_mnt ? &child_mnt->mnt : NULL;
+ } while (!legitimize_mnt(m, seq));
+ rcu_read_unlock();
+ return m;
}

static struct mountpoint *new_mountpoint(struct dentry *dentry)
@@ -874,38 +896,47 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
return ERR_PTR(err);
}

+static void delayed_free(struct rcu_head *head)
+{
+ struct mount *mnt = container_of(head, struct mount, mnt_rcu);
+ kfree(mnt->mnt_devname);
+#ifdef CONFIG_SMP
+ free_percpu(mnt->mnt_pcp);
+#endif
+ kmem_cache_free(mnt_cache, mnt);
+}
+
static void mntput_no_expire(struct mount *mnt)
{
put_again:
-#ifdef CONFIG_SMP
- br_read_lock(&vfsmount_lock);
- if (likely(mnt->mnt_ns)) {
- /* shouldn't be the last one */
- mnt_add_count(mnt, -1);
- br_read_unlock(&vfsmount_lock);
+ rcu_read_lock();
+ mnt_add_count(mnt, -1);
+ smp_mb();
+ if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+ rcu_read_unlock();
return;
}
- br_read_unlock(&vfsmount_lock);
-
lock_mount_hash();
- mnt_add_count(mnt, -1);
if (mnt_get_count(mnt)) {
+ rcu_read_unlock();
unlock_mount_hash();
return;
}
-#else
- mnt_add_count(mnt, -1);
- if (likely(mnt_get_count(mnt)))
- return;
- lock_mount_hash();
-#endif
if (unlikely(mnt->mnt_pinned)) {
mnt_add_count(mnt, mnt->mnt_pinned + 1);
mnt->mnt_pinned = 0;
+ rcu_read_unlock();
unlock_mount_hash();
acct_auto_close_mnt(&mnt->mnt);
goto put_again;
}
+ if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+ rcu_read_unlock();
+ unlock_mount_hash();
+ return;
+ }
+ mnt->mnt.mnt_flags |= MNT_DOOMED;
+ rcu_read_unlock();

list_del(&mnt->mnt_instance);
unlock_mount_hash();
@@ -924,7 +955,8 @@ put_again:
fsnotify_vfsmount_delete(&mnt->mnt);
dput(mnt->mnt.mnt_root);
deactivate_super(mnt->mnt.mnt_sb);
- free_vfsmnt(mnt);
+ mnt_free_id(mnt);
+ call_rcu(&mnt->mnt_rcu, delayed_free);
}

void mntput(struct vfsmount *mnt)
@@ -1137,6 +1169,8 @@ static void namespace_unlock(void)
list_splice_init(&unmounted, &head);
up_write(&namespace_sem);

+ synchronize_rcu();
+
while (!list_empty(&head)) {
mnt = list_first_entry(&head, struct mount, mnt_hash);
list_del_init(&mnt->mnt_hash);
@@ -1152,10 +1186,13 @@ static inline void namespace_lock(void)
}

/*
- * vfsmount lock must be held for write
+ * mount_lock must be held
* namespace_sem must be held for write
+ * how = 0 => just this tree, don't propagate
+ * how = 1 => propagate; we know that nobody else has reference to any victims
+ * how = 2 => lazy umount
*/
-void umount_tree(struct mount *mnt, int propagate)
+void umount_tree(struct mount *mnt, int how)
{
LIST_HEAD(tmp_list);
struct mount *p;
@@ -1163,7 +1200,7 @@ void umount_tree(struct mount *mnt, int propagate)
for (p = mnt; p; p = next_mnt(p, mnt))
list_move(&p->mnt_hash, &tmp_list);

- if (propagate)
+ if (how)
propagate_umount(&tmp_list);

list_for_each_entry(p, &tmp_list, mnt_hash) {
@@ -1171,6 +1208,8 @@ void umount_tree(struct mount *mnt, int propagate)
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
p->mnt_ns = NULL;
+ if (how < 2)
+ p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
list_del_init(&p->mnt_child);
if (mnt_has_parent(p)) {
put_mountpoint(p->mnt_mp);
@@ -1262,14 +1301,18 @@ static int do_umount(struct mount *mnt, int flags)
lock_mount_hash();
event++;

- if (!(flags & MNT_DETACH))
- shrink_submounts(mnt);
-
- retval = -EBUSY;
- if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
+ if (flags & MNT_DETACH) {
if (!list_empty(&mnt->mnt_list))
- umount_tree(mnt, 1);
+ umount_tree(mnt, 2);
retval = 0;
+ } else {
+ shrink_submounts(mnt);
+ retval = -EBUSY;
+ if (!propagate_mount_busy(mnt, 2)) {
+ if (!list_empty(&mnt->mnt_list))
+ umount_tree(mnt, 1);
+ retval = 0;
+ }
}
unlock_mount_hash();
namespace_unlock();
@@ -1955,7 +1998,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
struct mount *parent;
int err;

- mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+ mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);

mp = lock_mount(path);
if (IS_ERR(mp))
@@ -2172,7 +2215,7 @@ resume:
* process a list of expirable mountpoints with the intent of discarding any
* submounts of a specific parent mountpoint
*
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
*/
static void shrink_submounts(struct mount *mnt)
{
@@ -2558,7 +2601,7 @@ out_type:
/*
* Return true if path is reachable from root
*
- * namespace_sem or vfsmount_lock is held
+ * namespace_sem or mount_lock is held
*/
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
const struct path *root)
@@ -2573,9 +2616,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
int path_is_under(struct path *path1, struct path *path2)
{
int res;
- br_read_lock(&vfsmount_lock);
+ read_seqlock_excl(&mount_lock);
res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
- br_read_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
return res;
}
EXPORT_SYMBOL(path_is_under);
@@ -2748,8 +2791,6 @@ void __init mnt_init(void)
for (u = 0; u < HASH_SIZE; u++)
INIT_LIST_HEAD(&mountpoint_hashtable[u]);

- br_lock_init(&vfsmount_lock);
-
err = sysfs_init();
if (err)
printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2783,6 +2824,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
* we unmount before file sys is unregistered
*/
real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
+ smp_wmb();
}
return mnt;
}
@@ -2792,9 +2834,8 @@ void kern_unmount(struct vfsmount *mnt)
{
/* release long term mount so mount point can be released */
if (!IS_ERR_OR_NULL(mnt)) {
- lock_mount_hash();
real_mount(mnt)->mnt_ns = NULL;
- unlock_mount_hash();
+ smp_mb();
mntput(mnt);
}
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 4659da6..5539b5b 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -792,6 +792,7 @@ static void ncp_put_super(struct super_block *sb)

ncp_stop_tasks(server);

+ synchronize_rcu();
#ifdef CONFIG_NCPFS_NLS
/* unload the NLS charsets */
unload_nls(server->nls_vol);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 87dbcbe..8d1e094 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -148,6 +148,7 @@ static void proc_kill_sb(struct super_block *sb)
if (ns->proc_self)
dput(ns->proc_self);
kill_anon_super(sb);
+ synchronize_rcu(); /* might be an overkill */
put_pid_ns(ns);
}

diff --git a/include/linux/mount.h b/include/linux/mount.h
index 38cd98f..371d346 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -49,6 +49,8 @@ struct mnt_namespace;

#define MNT_LOCK_READONLY 0x400000
#define MNT_LOCKED 0x800000
+#define MNT_DOOMED 0x1000000
+#define MNT_SYNC_UMOUNT 0x2000000

struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 8e47bc7..492de72 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -16,7 +16,7 @@ struct nameidata {
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags;
- unsigned seq;
+ unsigned seq, m_seq;
int last_type;
unsigned depth;
char *saved_names[MAX_NESTED_LINKS + 1];


2013-10-03 19:06:08

by Linus Torvalds

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 3, 2013 at 10:44 AM, Al Viro <[email protected]> wrote:
>
> Anyway, I've done nicer variants of that protection for everything except
> fuse (hadn't gotten around to it yet); see vfs.git#experimental now:

Ok, I did a quick test, and it looks ok here, so looking good for 3.13.

However, the new smp_mb() in mntput_no_expire() is quite noticeable in
the path lookup stress-test profiles. And I'm not seeing what that
allegedly protects against, especially if mnt_ns is NULL (ie all the
common important cases).

Linus

2013-10-03 19:43:54

by Al Viro

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 12:06:04PM -0700, Linus Torvalds wrote:
> On Thu, Oct 3, 2013 at 10:44 AM, Al Viro <[email protected]> wrote:
> >
> > Anyway, I've done nicer variants of that protection for everything except
> > fuse (hadn't gotten around to it yet); see vfs.git#experimental now:
>
> Ok, I did a quick test, and it looks ok here, so looking good for 3.13.
>
> However, the new smp_mb() in mntput_no_expire() is quite noticeable in
> the path lookup stress-test profiles. And I'm not seeing what that
> allegedly protects against, especially if mnt_ns is NULL (ie all the
> common important cases).

In the common case ->mnt_ns is *not* NULL; that's what we get if
the damn thing is still mounted.

What we need to avoid is this:

	mnt_ns non-NULL, mnt_count is 2
CPU1: umount -l				CPU2: mntput
umount_tree() clears mnt_ns
drop mount_lock.lock
namespace_unlock() calls mntput()
decrement mnt_count
see that mnt_ns is NULL
grab mount_lock.lock
check mnt_count
					decrement mnt_count
					see old value of mnt_ns
					decide to bugger off
see it equal to 1 (i.e. miss decrement on CPU2)
decide to bugger off

The barrier in mntput() is to prevent that combination, so that either CPU2
would see mnt_ns cleared by CPU1, or CPU1 would see mnt_count decrement done
by CPU2. Its counterpart on CPU1 is provided by spin_unlock/spin_lock we've
done between clearing mnt_ns and checking mnt_count. Note that
synchronize_rcu() in namespace_unlock() and rcu_read_lock() in mntput() are
irrelevant here - the latter on CPU2 might very well have happened after the
former on CPU1, so umount -l did *not* wait for CPU2 to do anything.

Any suggestions re getting rid of that barrier?
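
For reference, a condensed sketch of the two paths involved, paraphrased from the mntput_no_expire() and namespace_unlock() hunks in the patch above (labels, pinning and teardown elided); it is only meant to make the barrier pairing visible, not to reproduce the exact code:

/* mntput() side (CPU2 above), condensed from the patch: */
static void mntput_no_expire(struct mount *mnt)
{
	rcu_read_lock();
	mnt_add_count(mnt, -1);		/* drop our reference */
	smp_mb();			/* the barrier under discussion; pairs with
					 * the unlock/lock the umount side does
					 * between clearing ->mnt_ns and rechecking
					 * the count */
	if (likely(mnt->mnt_ns)) {	/* still mounted => shouldn't be the last one */
		rcu_read_unlock();
		return;
	}
	lock_mount_hash();		/* write_seqlock(&mount_lock) */
	if (mnt_get_count(mnt)) {	/* somebody else still holds a reference */
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	/* ... mark MNT_DOOMED, drop locks, free after a grace period ... */
}

/* umount side (CPU1 above): umount_tree() clears ->mnt_ns with mount_lock
 * held for write; the caller drops mount_lock, then namespace_unlock()
 * releases namespace_sem, waits out a grace period and does the final
 * mntput() on each victim, which goes through the slow path above. */
static void namespace_unlock(void)
{
	/* ... grab the list of victims ... */
	up_write(&namespace_sem);

	synchronize_rcu();		/* lockless path walks may still be
					 * looking at the victims */

	/* ... mntput() each victim ... */
}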

2013-10-03 20:19:19

by Linus Torvalds

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 3, 2013 at 12:43 PM, Al Viro <[email protected]> wrote:
>
> In the common case ->mnt_ns is *not* NULL; that's what we get if
> the damn thing is still mounted.

Yeah, I misread the profile assembly code. The point being that the
nice fast case now has the smp_mb() in it, and it accounts for about
60% of the cost of that function on my performance profile.

> What we need to avoid is this:
>
> 	mnt_ns non-NULL, mnt_count is 2
> CPU1: umount -l				CPU2: mntput
> umount_tree() clears mnt_ns
> drop mount_lock.lock
> namespace_unlock() calls mntput()
> decrement mnt_count
> see that mnt_ns is NULL
> grab mount_lock.lock
> check mnt_count
> 					decrement mnt_count
> 					see old value of mnt_ns
> 					decide to bugger off
> see it equal to 1 (i.e. miss decrement on CPU2)
> decide to bugger off
>
> The barrier in mntput() is to prevent that combination, so that either CPU2
> would see mnt_ns cleared by CPU1, or CPU1 would see mnt_count decrement done
> by CPU2. Its counterpart on CPU1 is provided by spin_unlock/spin_lock we've
> done between clearing mnt_ns and checking mnt_count. Note that
> synchronize_rcu() in namespace_unlock() and rcu_read_lock() in mntput() are
> irrelevant here - the latter on CPU2 might very well have happened after the
> former on CPU1, so umount -l did *not* wait for CPU2 to do anything.
>
> Any suggestions re getting rid of that barrier?

Hmm. The CPU2 mntput can only happen under RCU readlock, right? After
the RCU grace period _and_ if the umount is going ahead, nothing
should have a mnt pointer, right?

So I'm wondering if you couldn't just have a synchronize_rcu() in that
umount path, after clearing mnt_ns. At that point you _know_ you're
the only one that should have access to the mnt.

You'd need to drop the mount-hash lock for that. But I think you can
do it in umount_tree(), right? IOW, you could make the rule be that
umount_tree() must be called with the namespace lock and the
mount-hash lock, and it will drop both. Or does that get too painful
too?

Linus

2013-10-03 20:41:44

by Al Viro

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 01:19:16PM -0700, Linus Torvalds wrote:

> Hmm. The CPU2 mntput can only happen under RCU readlock, right? After
> the RCU grace period _and_ if the umount is going ahead, nothing
> should have a mnt pointer, right?

umount -l doesn't care.

> So I'm wondering if you couldn't just have a synchronize_rcu() in that
> umount path, after clearing mnt_ns. At that point you _know_ you're
> the only one that should have access to the mnt.

We have it there. See namespace_unlock(). And you are right about the
locking rules for umount_tree(), except that caller is responsible
for dropping those. With (potentially final) mntput() happening after
both (well, as part of namespace_unlock(), done after synchronize_rcu()).

The problem is this:
A = 1, B = 1
CPU1:
A = 0
<full barrier>
synchronize_rcu()
read B

CPU2:
rcu_read_lock()
B = 0
read A

Are we guaranteed that we won't get both of them seeing ones, in a situation
when that rcu_read_lock() comes too late to be noticed by synchronize_rcu()?
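
To make the abstract pattern concrete, here is a user-space rendition written against liburcu (assuming its stock API — rcu_register_thread(), rcu_read_lock(), synchronize_rcu() — behaves like the kernel primitives being discussed); it only illustrates the shape of the question, a single run obviously proves nothing:

/* Illustration only; build with something like:
 *	cc -o rcu-sb rcu-sb.c -lurcu -lpthread
 * The question above is whether r1 == 1 && r2 == 1 is possible. */
#define _LGPL_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <urcu.h>

static volatile int A = 1, B = 1;	/* plain accesses, for brevity */
static int r1 = -1, r2 = -1;

static void *cpu1(void *arg)		/* the synchronize_rcu() side */
{
	rcu_register_thread();
	A = 0;
	synchronize_rcu();		/* waits for any reader that saw A == 1 */
	r1 = B;				/* "read B" */
	rcu_unregister_thread();
	return NULL;
}

static void *cpu2(void *arg)		/* the rcu_read_lock() side */
{
	rcu_register_thread();
	rcu_read_lock();
	B = 0;
	r2 = A;				/* "read A" */
	rcu_read_unlock();
	rcu_unregister_thread();
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, cpu1, NULL);
	pthread_create(&t2, NULL, cpu2, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("r1=%d r2=%d\n", r1, r2);	/* claim: at least one is 0 */
	return 0;
}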

2013-10-03 20:52:48

by Linus Torvalds

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 3, 2013 at 1:41 PM, Al Viro <[email protected]> wrote:
>
> The problem is this:
> A = 1, B = 1
> CPU1:
> A = 0
> <full barrier>
> synchronize_rcu()
> read B
>
> CPU2:
> rcu_read_lock()
> B = 0
> read A
>
> Are we guaranteed that we won't get both of them seeing ones, in situation
> when that rcu_read_lock() comes too late to be noticed by synchronize_rcu()?

Yeah, I think we should be guaranteed that, because the
synchronize_rcu() will guarantee that all other CPU's go through an
idle period. So the "read A" on CPU2 cannot possibly see a 1 _unless_
it happens so early that synchronize_rcu() definitely sees it (ie it's
a "preexisting reader" by definition), in which case synchronize_rcu()
will be waiting for a subsequent idle period, in which case the B=0 on
CPU2 is not only guaranteed to happen but also be visible out, so the
"read B" on CPU1 will see 0. And that's true even if CPU2 doesn't have
an explicit memory barrier, because the "RCU idle" state implies that
it has gone through a barrier.

So I don't see how they could possibly see ones. Modulo terminal bugs
in synchronize_barrier() (which can be very slow, but for umount I
wouldn't worry). Or modulo my brain being fried.

Linus

2013-10-03 21:14:51

by Al Viro

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 01:52:45PM -0700, Linus Torvalds wrote:

> Yeah, I think we should be guaranteed that, because the
> synchronize_rcu() will guarantee that all other CPU's go through an
> idle period. So the "read A" on CPU2 cannot possibly see a 1 _unless_
> it happens so early that synchronize_rcu() definitely sees it (ie it's
> a "preexisting reader" by definition), in which case synchronize_rcu()
> will be waiting for a subsequent idle period, in which case the B=0 on
> CPU2 is not only guaranteed to happen but also be visible out, so the
> "read B" on CPU1 will see 0. And that's true even if CPU2 doesn't have
> an explicit memory barrier, because the "RCU idle" state implies that
> it has gone through a barrier.
>
> So I don't see how they could possibly see ones. Modulo terminal bugs
> in synchronize_barrier() (which can be very slow, but for umount I
> wouldn't worry). Or modulo my brain being fried.

There's one more place similar to that - kern_unmount(). There we also
go from "longterm vfsmount, mntput() doesn't need to bother checking"
to NULL ->mnt_ns. We can, of course, slap synchronize_rcu() there as
well, but that might make pid_ns and ipc_ns destruction slow...

2013-10-03 23:28:37

by Josh Triplett

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 01:52:45PM -0700, Linus Torvalds wrote:
> On Thu, Oct 3, 2013 at 1:41 PM, Al Viro <[email protected]> wrote:
> >
> > The problem is this:
> > A = 1, B = 1
> > CPU1:
> > A = 0
> > <full barrier>
> > synchronize_rcu()
> > read B
> >
> > CPU2:
> > rcu_read_lock()
> > B = 0
> > read A
> >
> > Are we guaranteed that we won't get both of them seeing ones, in situation
> > when that rcu_read_lock() comes too late to be noticed by synchronize_rcu()?
>
> Yeah, I think we should be guaranteed that, because the
> synchronize_rcu() will guarantee that all other CPU's go through an
> idle period. So the "read A" on CPU2 cannot possibly see a 1 _unless_
> it happens so early that synchronize_rcu() definitely sees it (ie it's
> a "preexisting reader" by definition), in which case synchronize_rcu()
> will be waiting for a subsequent idle period, in which case the B=0 on
> CPU2 is not only guaranteed to happen but also be visible out, so the
> "read B" on CPU1 will see 0. And that's true even if CPU2 doesn't have
> an explicit memory barrier, because the "RCU idle" state implies that
> it has gone through a barrier.

I think the reasoning in one direction is actually quite a bit less
obvious than that.

rcu_read_unlock() does *not* necessarily imply a memory barrier (so the
B=0 can actually move logically outside the rcu_read_unlock()), but
synchronize_rcu() *does* imply (and enforce) that a memory barrier has
occurred on all CPUs as part of quiescence. However, likewise,
rcu_read_lock() doesn't imply anything in particular about writes; it
does enforce either that reads can't leak earlier or that, if they do, a
synchronize_rcu() will still wait for them, but I don't think the safety
interaction between a *write* in the RCU reader and a *read* in the RCU
writer necessarily follows from that enforcement.

(Also, to the best of my knowledge, you don't even need a barrier on
CPU1; synchronize_rcu() should imply one.)

If synchronize_rcu() on CPU1 sees rcu_read_lock() on CPU2, then
synchronize_rcu() will wait for CPU2's read-side critical section and a
memory barrier before reading B, so CPU1 will see B==0.

The harder direction: If synchronize_rcu() on CPU1 does not see
rcu_read_lock() on CPU2, then it won't necessarily wait for anything,
and since rcu_read_lock() itself does not imply any CPU write barriers,
it's not at all obvious that rcu_read_lock() prevents B=0 from occurring
before CPU1's read of B.

In short, the interaction between RCU's ordering guarantees and CPU
memory barriers in the presence of writes on the read side and reads on
the write side does not seem sufficiently clear to support the portable
use of the above pattern without an smp_wmb() on CPU2 between
rcu_read_lock() and B=0. I think it might happen to work with the
current implementations of RCU (with which synchronize_rcu() won't
actually notice a quiescent state and return until after either
the rcu_read_unlock() or a preemption point), but by the strict semantic
guarantees of the RCU primitives I think you could write a legitimate
RCU implementation that would break the above code.

That said, I believe this pattern *will* work with every existing
implementation of RCU. Thus, I'd suggest documenting it as a warning to
prospective RCU optimizers to avoid breaking the above pattern.

- Josh Triplett

2013-10-03 23:51:03

by Linus Torvalds

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 3, 2013 at 4:28 PM, Josh Triplett <[email protected]> wrote:
> On Thu, Oct 03, 2013 at 01:52:45PM -0700, Linus Torvalds wrote:
>> On Thu, Oct 3, 2013 at 1:41 PM, Al Viro <[email protected]> wrote:
>> >
>> > The problem is this:
>> > A = 1, B = 1
>> > CPU1:
>> > A = 0
>> > <full barrier>
>> > synchronize_rcu()
>> > read B
>> >
>> > CPU2:
>> > rcu_read_lock()
>> > B = 0
>> > read A
>> >
>> > Are we guaranteed that we won't get both of them seeing ones, in situation
>> > when that rcu_read_lock() comes too late to be noticed by synchronize_rcu()?
>>
>> Yeah, I think we should be guaranteed that, because the
>> synchronize_rcu() will guarantee that all other CPU's go through an
>> idle period. So the "read A" on CPU2 cannot possibly see a 1 _unless_
>> it happens so early that synchronize_rcu() definitely sees it (ie it's
>> a "preexisting reader" by definition), in which case synchronize_rcu()
>> will be waiting for a subsequent idle period, in which case the B=0 on
>> CPU2 is not only guaranteed to happen but also be visible out, so the
>> "read B" on CPU1 will see 0. And that's true even if CPU2 doesn't have
>> an explicit memory barrier, because the "RCU idle" state implies that
>> it has gone through a barrier.
>
> I think the reasoning in one direction is actually quite a bit less
> obvious than that.
>
> rcu_read_unlock() does *not* necessarily imply a memory barrier

Don't think of it in those terms.

The only thing that matters is semantics. The semantics of
synchronize_rcu() is that it needs to wait for all RCU users. It's
that simple. By definition, anything inside a "rcu_read_lock()" is a
RCU user, so if we have a read of memory (memory barrier or not), then
synchronize_rcu() needs to wait for it. Otherwise synchronize_rcu() is
clearly totally broken.

Now, the fact that the normal rcu_read_lock() is just a compiler
barrier may make you think "oh, it cannot work", but the thing is, the
way things happen is that synchronize_rcu() ends up relying on the
_scheduler_ data structures, rather than anything else. It requires
seeing an idle scheduler state for each CPU after being called.

Linus

2013-10-04 00:41:17

by Josh Triplett

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 04:51:00PM -0700, Linus Torvalds wrote:
> On Thu, Oct 3, 2013 at 4:28 PM, Josh Triplett <[email protected]> wrote:
> > On Thu, Oct 03, 2013 at 01:52:45PM -0700, Linus Torvalds wrote:
> >> On Thu, Oct 3, 2013 at 1:41 PM, Al Viro <[email protected]> wrote:
> >> >
> >> > The problem is this:
> >> > A = 1, B = 1
> >> > CPU1:
> >> > A = 0
> >> > <full barrier>
> >> > synchronize_rcu()
> >> > read B
> >> >
> >> > CPU2:
> >> > rcu_read_lock()
> >> > B = 0
> >> > read A
> >> >
> >> > Are we guaranteed that we won't get both of them seeing ones, in situation
> >> > when that rcu_read_lock() comes too late to be noticed by synchronize_rcu()?
> >>
> >> Yeah, I think we should be guaranteed that, because the
> >> synchronize_rcu() will guarantee that all other CPU's go through an
> >> idle period. So the "read A" on CPU2 cannot possibly see a 1 _unless_
> >> it happens so early that synchronize_rcu() definitely sees it (ie it's
> >> a "preexisting reader" by definition), in which case synchronize_rcu()
> >> will be waiting for a subsequent idle period, in which case the B=0 on
> >> CPU2 is not only guaranteed to happen but also be visible out, so the
> >> "read B" on CPU1 will see 0. And that's true even if CPU2 doesn't have
> >> an explicit memory barrier, because the "RCU idle" state implies that
> >> it has gone through a barrier.
> >
> > I think the reasoning in one direction is actually quite a bit less
> > obvious than that.
> >
> > rcu_read_unlock() does *not* necessarily imply a memory barrier
>
> Don't think of it in those terms.
>
> The only thing that matters is semantics. The semantics of
> synchronize_rcu() is that it needs to wait for all RCU users. It's
> that simple. By definition, anything inside a "rcu_read_lock()" is a
> RCU user, so if we have a read of memory (memory barrier or not), then
> synchronize_rcu() needs to wait for it. Otherwise synchronize_rcu() is
> clearly totally broken.

Read, yes, but I don't think that's enough to force your example above
to work in all cases. That requires semantics beyond what RCU's
primitives guarantee, and I don't think you can draw conclusions about
those semantics without talking about CPU memory barriers.

synchronize_rcu() says it'll wait for any in-progress reader, which
includes whatever that reader might be doing. That guarantees one
direction of your example above. It's a tool for controlling what the
*reader* can observe. The vast majority of RCU usage models assume the
writers will use a lock to guard against each other, and that readers
don't make modifications. synchronize_rcu() doesn't make any particular
guarantees in the other direction about what the *writer* can observe,
especially in the case where synchronize_rcu() does not observe the
rcu_read_lock() and doesn't need to count that reader. (Also note that
B=0 is a blind write, with no read.)

The example above will work on x86 with any reasonable implementation
(due to write ordering guarantees), and it should work on any
architecture with all the current implementations of RCU in the Linux
kernel. If we want to require that all future implementations of RCU
allow the above example to work, we should document that example and
associated assumption for future reference, because I don't think the
semantics of the RCU primitives inherently guarantee it.

> Now, the fact that the normal rcu_read_lock() is just a compiler
> barrier may make you think "oh, it cannot work", but the thing is, the
> way things happen is that synchronize_rcu() ends up relying on the
> _scheduler_ data structures, rather than anything else. It requires
> seeing an idle scheduler state for each CPU after being called.

That's a detail of implementation. In practice, the required semantics
of synchronize_rcu() would be perfectly satisfied by an implementation
that can divine that no readers are currently running without waiting
for everyone to pass through the scheduler. synchronize_sched() means
"everyone has scheduled"; synchronize_rcu() *only* means "any readers
in-progress when I called synchronize_rcu() have finished when
synchronize_rcu() returns", which does not have to imply a trip through
the scheduler.

- Josh Triplett

2013-10-04 00:45:25

by Linus Torvalds

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 3, 2013 at 5:41 PM, Josh Triplett <[email protected]> wrote:
>
> Read, yes, but I don't think that's enough to force your example above
> to work in all cases. That requires semantics beyond what RCU's
> primitives guarantee, and I don't think you can draw conclusions about
> those semantics without talking about CPU memory barriers.

We seriously depend on nothing leaking out. Not just reads. The "U" in
RCU is "update". So it's reads, and it's writes. The fact that it says
"read" in "rcu_read_lock()" doesn't mean that only reads would be
affected.

And no, this still has nothing to do with memory barriers. Every
single RCU user depends on the memory freeing being delayed by RCU,
for example. And again, that's not just reads. It's people taking
spinlocks on things that are RCU-protected etc.

So no, there is no question about this. The only question would be
whether we have some RCU mode that is _buggy_, not whether you need
extra memory barriers. And that is certainly possible.

Linus

2013-10-04 02:53:56

by Al Viro

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 10:14:48PM +0100, Al Viro wrote:

> > So I don't see how they could possibly see ones. Modulo terminal bugs
> > in synchronize_barrier() (which can be very slow, but for umount I
> > wouldn't worry). Or modulo my brain being fried.
>
> There's one more place similar to that - kern_unmount(). There we also
> go from "longterm vfsmount, mntput() doesn't need to bother checking"
> to NULL ->mnt_ns. We can, of course, slap synchronize_rcu() there as
> well, but that might make pid_ns and ipc_ns destruction slow...

OK, fuse side of things done, smp_mb() in mntput_no_expire() dropped,
kern_umount() got synchronize_rcu() (I'm not happy about the last one,
but... hell knows; I want to see profiles before deciding what to do
about it).
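
Presumably something along these lines, adapting the kern_unmount() hunk from the patch above with the smp_mb() replaced by a grace-period wait (a sketch of the direction described, not necessarily what the branch actually carries):

void kern_unmount(struct vfsmount *mnt)
{
	/* release long term mount so mount point can be released */
	if (!IS_ERR_OR_NULL(mnt)) {
		real_mount(mnt)->mnt_ns = NULL;
		synchronize_rcu();	/* wait out mntput_no_expire() fast paths
					 * that might still see the old ->mnt_ns */
		mntput(mnt);
	}
}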

Updated branch force-pushed. BTW, brlock defines can go after that;
we still have two instances of lg_lock, but they spell the primitives
out instead of using br_{read,write}_lock aliases.

Speaking of those two - I really want to see file_table.c one killed.
Christoph, do you have anything along the lines of getting rid of the
mark_files_ro() nonsense? After all, a combination of r/w vfsmount
and a superblock with MS_RDONLY in flags should do about the right thing
these days... I can probably knock something together tomorrow, but
you've brought that thing up quite a few times, so if you happen to have
a patch more or less ready...

2013-10-04 05:30:07

by Paul E. McKenney

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 04:28:27PM -0700, Josh Triplett wrote:
> On Thu, Oct 03, 2013 at 01:52:45PM -0700, Linus Torvalds wrote:
> > On Thu, Oct 3, 2013 at 1:41 PM, Al Viro <[email protected]> wrote:
> > >
> > > The problem is this:
> > > A = 1, B = 1
> > > CPU1:
> > > A = 0
> > > <full barrier>
> > > synchronize_rcu()
> > > read B
> > >
> > > CPU2:
> > > rcu_read_lock()
> > > B = 0
> > > read A

/me scratches his head...

OK, for CPU2 to see 1 from its read from A, the corresponding RCU
read-side critical section must have started before CPU1 did A=0. This
means that this same RCU read-side critical section must have started
before CPU1's synchronize_rcu(), which means that it must complete
before that synchronize_rcu() returns. Therefore, CPU2's B=0 must
execute before CPU1's read of B, hence that read of B must return zero.

Conversely, if CPU1's read from B returns 1, we know that CPU2's
RCU read-side critical section must not have completed until after
CPU1's synchronize_rcu() returned, which means that the RCU read-side
critical section must have started after that synchronize_rcu() started,
so CPU1's assignment to A must also have already happened. Therefore,
CPU2's read from A must return zero.

> > > Are we guaranteed that we won't get both of them seeing ones, in situation
> > > when that rcu_read_lock() comes too late to be noticed by synchronize_rcu()?

Yes, at least one of CPU1's and CPU2's reads must return zero.

For whatever it is worth, it is easier to talk about if you write it
this way (easier for me, anyway):

A = 1, B = 1

CPU1:
A = 0
<full barrier>
synchronize_rcu()
r1 = B

CPU2:
rcu_read_lock()
B = 0
r2 = A

Then we are guaranteed that r1==0||r2==0.

> > Yeah, I think we should be guaranteed that, because the
> > synchronize_rcu() will guarantee that all other CPU's go through an
> > idle period. So the "read A" on CPU2 cannot possibly see a 1 _unless_
> > it happens so early that synchronize_rcu() definitely sees it (ie it's
> > a "preexisting reader" by definition), in which case synchronize_rcu()
> > will be waiting for a subsequent idle period, in which case the B=0 on
> > CPU2 is not only guaranteed to happen but also be visible out, so the
> > "read B" on CPU1 will see 0. And that's true even if CPU2 doesn't have
> > an explicit memory barrier, because the "RCU idle" state implies that
> > it has gone through a barrier.

I agree that the "<full barrier>" has no effect in this case.

The memory-barrier guarantees of RCU's grace-period primitives are
documented in the excessively long header comment for synchronize_sched(),
which documents an extended LKML conversation between Oleg Nesterov
and myself. Here is a summary, which applies on systems with more than
one CPU:

o When synchronize_sched() returns, each CPU is guaranteed to have
executed a full memory barrier since the end of its last RCU-sched
read-side critical section whose beginning preceded the call
to synchronize_sched().

o Each CPU having an RCU read-side critical section that extends
beyond the return from synchronize_sched() is guaranteed
to have executed a full memory barrier after the beginning
of synchronize_sched() and before the beginning of that RCU
read-side critical section.

o If CPU A invoked synchronize_sched(), which returned to its
caller on CPU B, then both CPU A and CPU B are guaranteed to
have executed a full memory barrier during the execution of
synchronize_sched(). This also applies when CPUs A and B are
the same CPU.

These guarantees apply to -all- CPUs, even those that are currently
offline or idle. Analogous guarantees of course apply to all flavors
of RCU.

> I think the reasoning in one direction is actually quite a bit less
> obvious than that.
>
> rcu_read_unlock() does *not* necessarily imply a memory barrier (so the
> B=0 can actually move logically outside the rcu_read_unlock()), but
> synchronize_rcu() *does* imply (and enforce) that a memory barrier has
> occurred on all CPUs as part of quiescence. However, likewise,
> rcu_read_lock() doesn't imply anything in particular about writes; it
> does enforce either that reads can't leak earlier or that if they do a
> synchronize_rcu() will still wait for them, but I don't think the safety
> interaction between a *write* in the RCU reader and a *read* in the RCU
> writer necessarily follows from that enforcement.
>
> (Also, to the best of my knowledge, you don't even need a barrier on
> CPU1; synchronize_rcu() should imply one.)
>
> If synchronize_rcu() on CPU1 sees rcu_read_lock() on CPU2, then
> synchronize_rcu() will wait for CPU2's read-side critical section and a
> memory barrier before reading B, so CPU1 will see B==0.
>
> The harder direction: If synchronize_rcu() on CPU1 does not see
> rcu_read_lock() on CPU2, then it won't necessarily wait for anything,
> and since rcu_read_lock() itself does not imply any CPU write barriers,
> it's not at all obvious that rcu_read_lock() prevents B=0 from occurring
> before CPU1's read of B.
>
> In short, the interaction between RCU's ordering guarantees and CPU
> memory barriers in the presence of writes on the read side and reads on
> the write side does not seem sufficiently clear to support the portable
> use of the above pattern without an smp_wmb() on CPU2 between
> rcu_read_lock() and B=0. I think it might happen to work with the
> current implementations of RCU (with which synchronize_rcu() won't
> actually notice a quiescent state and return until after either
> the rcu_read_unlock() or a preemption point), but by the strict semantic
> guarantees of the RCU primitives I think you could write a legitimate
> RCU implementation that would break the above code.
>
> That said, I believe this pattern *will* work with every existing
> implementation of RCU. Thus, I'd suggest documenting it as a warning to
> prospective RCU optimizers to avoid breaking the above pattern.

I believe that the comment headers for synchronize_sched() and call_rcu()
do document the required restrictions. Could you please take a look at
them and let me know if I left some wiggle room that needs to be taken
care of?

Thanx, Paul

2013-10-04 06:03:15

by Josh Triplett

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 10:29:59PM -0700, Paul E. McKenney wrote:
> On Thu, Oct 03, 2013 at 04:28:27PM -0700, Josh Triplett wrote:
> > On Thu, Oct 03, 2013 at 01:52:45PM -0700, Linus Torvalds wrote:
> > > On Thu, Oct 3, 2013 at 1:41 PM, Al Viro <[email protected]> wrote:
> > > >
> > > > The problem is this:
> > > > A = 1, B = 1
> > > > CPU1:
> > > > A = 0
> > > > <full barrier>
> > > > synchronize_rcu()
> > > > read B
> > > >
> > > > CPU2:
> > > > rcu_read_lock()
> > > > B = 0
> > > > read A
>
> /me scratches his head...
>
> OK, for CPU2 to see 1 from its read from A, the corresponding RCU
> read-side critical section must have started before CPU1 did A=0. This
> means that this same RCU read-side critical section must have started
> before CPU1's synchronize_rcu(), which means that it must complete
> before that synchronize_rcu() returns. Therefore, CPU2's B=0 must
> execute before CPU1's read of B, hence that read of B must return zero.
>
> Conversely, if CPU1's read from B returns 1, we know that CPU2's
> RCU read-side critical section must not have completed until after
> CPU1's synchronize_rcu() returned, which means that the RCU read-side
> critical section must have started after that synchronize_rcu() started,
> so CPU1's assignment to A must also have already happened. Therefore,
> CPU2's read from A must return zero.

Yeah, that makes sense.

I think too much time spent staring at the *implementation* of RCU and
the exciting assumptions it has to make about barriers or memory
operations leaking out of the implementations of the RCU primitives (for
instance, the fun needed to guarantee a memory barrier on all CPUs, or
to safely use non-atomic operations inside RCU itself) makes it entirely
too difficult to look at a perfectly ordinary *use* of RCU primitives
and see the obvious. :)

- Josh Triplett

2013-10-04 06:15:11

by Paul E. McKenney

Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 11:03:05PM -0700, Josh Triplett wrote:
> On Thu, Oct 03, 2013 at 10:29:59PM -0700, Paul E. McKenney wrote:
> > On Thu, Oct 03, 2013 at 04:28:27PM -0700, Josh Triplett wrote:
> > > On Thu, Oct 03, 2013 at 01:52:45PM -0700, Linus Torvalds wrote:
> > > > On Thu, Oct 3, 2013 at 1:41 PM, Al Viro <[email protected]> wrote:
> > > > >
> > > > > The problem is this:
> > > > > A = 1, B = 1
> > > > > CPU1:
> > > > > A = 0
> > > > > <full barrier>
> > > > > synchronize_rcu()
> > > > > read B
> > > > >
> > > > > CPU2:
> > > > > rcu_read_lock()
> > > > > B = 0
> > > > > read A
> >
> > /me scratches his head...
> >
> > OK, for CPU2 to see 1 from its read from A, the corresponding RCU
> > read-side critical section must have started before CPU1 did A=0. This
> > means that this same RCU read-side critical section must have started
> > before CPU1's synchronize_rcu(), which means that it must complete
> > before that synchronize_rcu() returns. Therefore, CPU2's B=0 must
> > execute before CPU1's read of B, hence that read of B must return zero.
> >
> > Conversely, if CPU1's read from B returns 1, we know that CPU2's
> > RCU read-side critical section must not have completed until after
> > CPU1's synchronize_rcu() returned, which means that the RCU read-side
> > critical section must have started after that synchronize_rcu() started,
> > so CPU1's assignment to A must also have already happened. Therefore,
> > CPU2's read from A must return zero.
>
> Yeah, that makes sense.
>
> I think too much time spent staring at the *implementation* of RCU and
> the exciting assumptions it has to make about barriers or memory
> operations leaking out of the implementations of the RCU primitives (for
> instance, the fun needed to guarantee a memory barrier on all CPUs, or
> to safely use non-atomic operations inside RCU itself) makes it entirely
> too difficult to look at a perfectly ordinary *use* of RCU primitives
> and see the obvious. :)

I must confess that my first thought upon seeing Al's example was "but
of course CPU2's write to B and read from A can be reordered by either
the compiler or the CPU!" I had to look again myself. ;-)

Thanx, Paul

2013-10-04 06:41:12

by Ingo Molnar

Subject: Re: [PATCH 17/17] RCU'd vfsmounts


* Linus Torvalds <[email protected]> wrote:

> On Thu, Oct 3, 2013 at 5:41 PM, Josh Triplett <[email protected]> wrote:
> >
> > Read, yes, but I don't think that's enough to force your example above
> > to work in all cases. That requires semantics beyond what RCU's
> > primitives guarantee, and I don't think you can draw conclusions about
> > those semantics without talking about CPU memory barriers.
>
> We seriously depend on nothing leaking out. Not just reads. The "U" in
> RCU is "update". So it's reads, and it's writes. The fact that it says
> "read" in "rcu_read_lock()" doesn't mean that only reads would be
> affected.
>
> And no, this still has nothing to do with memory barriers. Every single
> RCU user depends on the memory freeing being delayed by RCU, for
> example. And again, that's not just reads. It's people taking spinlocks
> on things that are RCU-protected etc.
>
> So no, there is no question about this. The only question would be
> whether we have some RCU mode that is _buggy_, not whether you need
> extra memory barriers. And that is certainly possible.

Broken RCU modes are not unheard of, but Paul is extremely methodical
about testing and reviewing all the details - including formal proof
methods. There are lots of high-profile, high-frequency RCU users in
the kernel that exercise every aspect of RCU semantics, and any
breakage would affect them as well.

There are over 2000 RCU critical sections in the kernel today, so the
likelihood of the VFS triggering an unknown bug, without other users
breaking already, is fairly low. If it happens, it will be fixed like other
RCU bugs.

So I really wouldn't worry about it too much. If you don't mind the
additional sys_umount() delay, then RCU is goodness.

Thanks,

Ingo
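
To put a concrete (purely illustrative, not-from-the-thread) shape on the
point Linus is making above: read-side critical sections routinely *write* to
RCU-protected objects, for example by taking a spinlock embedded in them, and
the only thing that makes this safe is that the free is deferred until every
such section has finished - no reader-side barriers are involved. That
deferred-free property is the same one the RCU'd-vfsmount code leans on when
it grabs references under rcu_read_lock().

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct obj {
	spinlock_t lock;
	unsigned long hits;
	struct rcu_head rcu;
};

static struct obj __rcu *global_obj;

/* Reader: performs plain writes (the lock word, the counter) inside the
 * read-side critical section.  Safe only because the kfree_rcu() below
 * cannot take effect until rcu_read_unlock(). */
static void touch_obj(void)
{
	struct obj *p;

	rcu_read_lock();
	p = rcu_dereference(global_obj);
	if (p) {
		spin_lock(&p->lock);
		p->hits++;
		spin_unlock(&p->lock);
	}
	rcu_read_unlock();
}

/* Updater: unpublish the object, then let RCU delay the actual free past
 * any critical section that may still hold p->lock. */
static void retire_obj(struct obj *p)
{
	rcu_assign_pointer(global_obj, NULL);
	kfree_rcu(p, rcu);
}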

2013-10-04 07:04:57

by Josh Triplett

[permalink] [raw]
Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Thu, Oct 03, 2013 at 11:15:03PM -0700, Paul E. McKenney wrote:
> On Thu, Oct 03, 2013 at 11:03:05PM -0700, Josh Triplett wrote:
> > On Thu, Oct 03, 2013 at 10:29:59PM -0700, Paul E. McKenney wrote:
> > > On Thu, Oct 03, 2013 at 04:28:27PM -0700, Josh Triplett wrote:
> > > > On Thu, Oct 03, 2013 at 01:52:45PM -0700, Linus Torvalds wrote:
> > > > > On Thu, Oct 3, 2013 at 1:41 PM, Al Viro <[email protected]> wrote:
> > > > > >
> > > > > > The problem is this:
> > > > > > A = 1, B = 1
> > > > > > CPU1:
> > > > > > A = 0
> > > > > > <full barrier>
> > > > > > synchronize_rcu()
> > > > > > read B
> > > > > >
> > > > > > CPU2:
> > > > > > rcu_read_lock()
> > > > > > B = 0
> > > > > > read A
> > >
> > > /me scratches his head...
> > >
> > > OK, for CPU2 to see 1 from its read from A, the corresponding RCU
> > > read-side critical section must have started before CPU1 did A=0. This
> > > means that this same RCU read-side critical section must have started
> > > before CPU1's synchronize_rcu(), which means that it must complete
> > > before that synchronize_rcu() returns. Therefore, CPU2's B=0 must
> > > execute before CPU1's read of B, hence that read of B must return zero.
> > >
> > > Conversely, if CPU1's read from B returns 1, we know that CPU2's
> > > RCU read-side critical section must not have completed until after
> > > CPU1's synchronize_rcu() returned, which means that the RCU read-side
> > > critical section must have started after that synchronize_rcu() started,
> > > so CPU1's assignment to A must also have already happened. Therefore,
> > > CPU2's read from A must return zero.
> >
> > Yeah, that makes sense.
> >
> > I think too much time spent staring at the *implementation* of RCU and
> > the exciting assumptions it has to make about barriers or memory
> > operations leaking out of the implementations of the RCU primitives (for
> > instance, the fun needed to guarantee a memory barrier on all CPUs, or
> > to safely use non-atomic operations inside RCU itself) makes it entirely
> > too difficult to look at a perfectly ordinary *use* of RCU primitives
> > and see the obvious. :)
>
> I must confess that my first thought upon seeing Al's example was "but
> of course CPU2's write to B and read from A can be reordered by either
> the compiler or the CPU!" I had to look again myself. ;-)

Exactly.

- Josh Triplett

2013-10-04 08:37:37

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Fri, Oct 04, 2013 at 03:53:51AM +0100, Al Viro wrote:
> Speaking of those two - I really want to see file_table.c one killed.
> Christoph, do you have anything along the lines of getting rid of the
> mark_files_ro() nonsense? After all, a combination of r/w vfsmount
> and a superblock with MS_RDONLY in flags should do about the right thing
> these days... I can probably knock something together tomorrow, but
> you've brought that thing up quite a few times, so if you happen to have
> a patch more or less ready...

I used to have a patch for it that was also sent to the list long ago,
but it got rid of the sysrq emergency remount r/o, which people didn't like.

2013-10-04 12:58:50

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 17/17] RCU'd vfsmounts

On Fri, Oct 04, 2013 at 10:37:29AM +0200, Christoph Hellwig wrote:
> On Fri, Oct 04, 2013 at 03:53:51AM +0100, Al Viro wrote:
> > Speaking of those two - I really want to see file_table.c one killed.
> > Christoph, do you have anything along the lines of getting rid of the
> > mark_files_ro() nonsense? After all, a combination of r/w vfsmount
> > and a superblock with MS_RDONLY in flags should do about the right thing
> > these days... I can probably knock something together tomorrow, but
> > you've brought that thing up quite a few times, so if you happen to have
> > a patch more or less ready...
>
> I used to have a patch for it that was also sent to the list long ago,
> but it got rid of the sysrq emergency remount r/o, which people didn't like.

Well... What I had in mind is less drastic; something like this (on top
of #experimental); does anybody see any problems with it?

diff --git a/fs/file_table.c b/fs/file_table.c
index e61e552..23b6dca 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -36,8 +36,6 @@ struct files_stat_struct files_stat = {
.max_files = NR_FILE
};

-DEFINE_STATIC_LGLOCK(files_lglock);
-
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;

@@ -134,7 +132,6 @@ struct file *get_empty_filp(void)
return ERR_PTR(error);
}

- INIT_LIST_HEAD(&f->f_u.fu_list);
atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
@@ -304,7 +301,6 @@ void fput(struct file *file)
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;

- file_sb_list_del(file);
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
init_task_work(&file->f_u.fu_rcuhead, ____fput);
if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
@@ -333,7 +329,6 @@ void __fput_sync(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
- file_sb_list_del(file);
BUG_ON(!(task->flags & PF_KTHREAD));
__fput(file);
}
@@ -345,129 +340,10 @@ void put_filp(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
security_file_free(file);
- file_sb_list_del(file);
file_free(file);
}
}

-static inline int file_list_cpu(struct file *file)
-{
-#ifdef CONFIG_SMP
- return file->f_sb_list_cpu;
-#else
- return smp_processor_id();
-#endif
-}
-
-/* helper for file_sb_list_add to reduce ifdefs */
-static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
-{
- struct list_head *list;
-#ifdef CONFIG_SMP
- int cpu;
- cpu = smp_processor_id();
- file->f_sb_list_cpu = cpu;
- list = per_cpu_ptr(sb->s_files, cpu);
-#else
- list = &sb->s_files;
-#endif
- list_add(&file->f_u.fu_list, list);
-}
-
-/**
- * file_sb_list_add - add a file to the sb's file list
- * @file: file to add
- * @sb: sb to add it to
- *
- * Use this function to associate a file with the superblock of the inode it
- * refers to.
- */
-void file_sb_list_add(struct file *file, struct super_block *sb)
-{
- if (likely(!(file->f_mode & FMODE_WRITE)))
- return;
- if (!S_ISREG(file_inode(file)->i_mode))
- return;
- lg_local_lock(&files_lglock);
- __file_sb_list_add(file, sb);
- lg_local_unlock(&files_lglock);
-}
-
-/**
- * file_sb_list_del - remove a file from the sb's file list
- * @file: file to remove
- * @sb: sb to remove it from
- *
- * Use this function to remove a file from its superblock.
- */
-void file_sb_list_del(struct file *file)
-{
- if (!list_empty(&file->f_u.fu_list)) {
- lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
- list_del_init(&file->f_u.fu_list);
- lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
- }
-}
-
-#ifdef CONFIG_SMP
-
-/*
- * These macros iterate all files on all CPUs for a given superblock.
- * files_lglock must be held globally.
- */
-#define do_file_list_for_each_entry(__sb, __file) \
-{ \
- int i; \
- for_each_possible_cpu(i) { \
- struct list_head *list; \
- list = per_cpu_ptr((__sb)->s_files, i); \
- list_for_each_entry((__file), list, f_u.fu_list)
-
-#define while_file_list_for_each_entry \
- } \
-}
-
-#else
-
-#define do_file_list_for_each_entry(__sb, __file) \
-{ \
- struct list_head *list; \
- list = &(sb)->s_files; \
- list_for_each_entry((__file), list, f_u.fu_list)
-
-#define while_file_list_for_each_entry \
-}
-
-#endif
-
-/**
- * mark_files_ro - mark all files read-only
- * @sb: superblock in question
- *
- * All files are marked read-only. We don't care about pending
- * delete files so this should be used in 'force' mode only.
- */
-void mark_files_ro(struct super_block *sb)
-{
- struct file *f;
-
- lg_global_lock(&files_lglock);
- do_file_list_for_each_entry(sb, f) {
- if (!file_count(f))
- continue;
- if (!(f->f_mode & FMODE_WRITE))
- continue;
- spin_lock(&f->f_lock);
- f->f_mode &= ~FMODE_WRITE;
- spin_unlock(&f->f_lock);
- if (file_check_writeable(f) != 0)
- continue;
- __mnt_drop_write(f->f_path.mnt);
- file_release_write(f);
- } while_file_list_for_each_entry;
- lg_global_unlock(&files_lglock);
-}
-
void __init files_init(unsigned long mempages)
{
unsigned long n;
@@ -483,6 +359,5 @@ void __init files_init(unsigned long mempages)
n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
files_defer_init();
- lg_lock_init(&files_lglock, "files_lglock");
percpu_counter_init(&nr_files, 0);
}
diff --git a/fs/internal.h b/fs/internal.h
index 4a11e75..4657424 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,9 +73,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
/*
* file_table.c
*/
-extern void file_sb_list_add(struct file *f, struct super_block *sb);
-extern void file_sb_list_del(struct file *f);
-extern void mark_files_ro(struct super_block *);
extern struct file *get_empty_filp(void);

/*
diff --git a/fs/open.c b/fs/open.c
index a1465b1..fffbed4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -685,7 +685,6 @@ static int do_dentry_open(struct file *f,
}

f->f_mapping = inode->i_mapping;
- file_sb_list_add(f, inode->i_sb);

if (unlikely(f->f_mode & FMODE_PATH)) {
f->f_op = &empty_fops;
@@ -724,7 +723,6 @@ static int do_dentry_open(struct file *f,

cleanup_all:
fops_put(f->f_op);
- file_sb_list_del(f);
if (f->f_mode & FMODE_WRITE) {
put_write_access(inode);
if (!special_file(inode->i_mode)) {
diff --git a/fs/super.c b/fs/super.c
index efa6e48..539d9bc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -140,9 +140,6 @@ static void destroy_super(struct super_block *s)
int i;
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
-#ifdef CONFIG_SMP
- free_percpu(s->s_files);
-#endif
for (i = 0; i < SB_FREEZE_LEVELS; i++)
percpu_counter_destroy(&s->s_writers.counter[i]);
security_sb_free(s);
@@ -172,15 +169,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
if (security_sb_alloc(s))
goto fail;

-#ifdef CONFIG_SMP
- s->s_files = alloc_percpu(struct list_head);
- if (!s->s_files)
- goto fail;
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
-#else
- INIT_LIST_HEAD(&s->s_files);
-#endif
for (i = 0; i < SB_FREEZE_LEVELS; i++) {
if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
goto fail;
@@ -722,7 +710,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
make sure there are no rw files opened */
if (remount_ro) {
if (force) {
- mark_files_ro(sb);
+ sb->s_readonly_remount = 1;
+ smp_wmb();
} else {
retval = sb_prepare_remount_readonly(sb);
if (retval)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b09e4e1..e6b0109 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -764,12 +764,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
#define FILE_MNT_WRITE_RELEASED 2

struct file {
- /*
- * fu_list becomes invalid after file_free is called and queued via
- * fu_rcuhead for RCU freeing
- */
union {
- struct list_head fu_list;
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
@@ -783,9 +778,6 @@ struct file {
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
-#ifdef CONFIG_SMP
- int f_sb_list_cpu;
-#endif
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
@@ -1264,11 +1256,6 @@ struct super_block {

struct list_head s_inodes; /* all inodes */
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
-#ifdef CONFIG_SMP
- struct list_head __percpu *s_files;
-#else
- struct list_head s_files;
-#endif
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
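
For context on why setting ->s_readonly_remount plus the smp_wmb() is enough
here: instead of force-flipping FMODE_WRITE on every open struct file, the
superblock is simply marked, and anyone who asks for write access to the
mount afterwards backs off with -EROFS. The check already lives in
fs/namespace.c; roughly, in simplified form (a paraphrase, not part of this
patch):

/* Simplified paraphrase of existing helpers in fs/namespace.c, not part
 * of the patch above. */
static inline int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* pairs with the smp_wmb() added in do_remount_sb() above */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	smp_rmb();
	if (mnt_is_readonly(m)) {	/* sees s_readonly_remount once set */
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();
	return ret;
}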

2013-10-04 14:00:58

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 17/17] RCU'd vfsmounts

That patch looks fine to me. Having the s_readonly_remount infrastructure
around certainly makes things easier.