From: Dave Hansen <[email protected]>
I have a _tiny_ microbenchmark that sits in a loop and writes
single bytes to a file. Writing one byte to a tmpfs file is
around 2x slower than reading one byte from a file, which is a
_bit_ more than I expected. This is a dumb benchmark, but I think
it's hard to deny that write() is a hot path and we should avoid
unnecessary overhead there.
I did a 'perf record' of 30-second samples of read and write.
The top item in a diffprofile is srcu_read_lock() from
fsnotify(). There are active inotify fd's from systemd, but
nothing is actually listening to the file or to that part of
the filesystem.
I *think* we can avoid taking the srcu_read_lock() for the
common case where there are no actual marks on the file.
This means that there will both be nothing to notify for
*and* implies that there is no need for clearing the ignore
mask.
This patch gave a 13.8% speedup in writes/second on my test,
which is an improvement from the 10.8% that I saw with the
last version.
Signed-off-by: Dave Hansen <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Jan Kara <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Eric Paris <[email protected]>
Cc: John McCutchan <[email protected]>
Cc: Robert Love <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: [email protected]
---
b/fs/notify/fsnotify.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff -puN fs/notify/fsnotify.c~optimize-fsnotify fs/notify/fsnotify.c
--- a/fs/notify/fsnotify.c~optimize-fsnotify 2015-06-24 17:14:34.573109264 -0700
+++ b/fs/notify/fsnotify.c 2015-06-24 17:14:34.576109399 -0700
@@ -213,6 +213,16 @@ int fsnotify(struct inode *to_tell, __u3
!(test_mask & to_tell->i_fsnotify_mask) &&
!(mnt && test_mask & mnt->mnt_fsnotify_mask))
return 0;
+ /*
+ * Optimization: srcu_read_lock() has a memory barrier which can
+ * be expensive. It protects walking the *_fsnotify_marks lists.
+ * However, if we do not walk the lists, we do not have to do
+ * SRCU because we have no references to any objects and do not
+ * need SRCU to keep them "alive".
+ */
+ if (!to_tell->i_fsnotify_marks.first &&
+ (!mnt || !mnt->mnt_fsnotify_marks.first))
+ return 0;
idx = srcu_read_lock(&fsnotify_mark_srcu);
_
From: Dave Hansen <[email protected]>
Currently, __sb_start_write() and freeze_super() can race with
each other. __sb_start_write() uses a smp_mb() to ensure that
freeze_super() can see its write to sb->s_writers.counter and
that it can see freeze_super()'s update to sb->s_writers.frozen.
This all seems to work fine.
But, this smp_mb() makes __sb_start_write() the single hottest
function in the kernel if I sit in a loop and do tiny write()s to
tmpfs over and over. This is on a very small 2-core system, so
it will only get worse on larger systems.
This _seems_ like an ideal case for RCU. __sb_start_write() is
the RCU read-side and is in a very fast, performance-sensitive
path. freeze_super() is the RCU writer and is in an extremely
rare non-performance-sensitive path.
Instead of doing an smp_mb() in __sb_start_write(), we do
rcu_read_lock(). This ensures that a CPU doing freeze_super()
can not proceed past its synchronize_rcu() until the grace
period has ended and the 's_writers.frozen = SB_FREEZE_WRITE'
is visible to __sb_start_write().
One question here: Does the work that __sb_start_write() does in
a previous grace period become visible to freeze_super() after
its call to synchronize_rcu()? It _seems_ like it should, but it
seems backwards to me since __sb_start_write() is the "reader" in
this case.
This patch increases the number of writes/second that I can do
by 5.6%.
Does anybody see any holes with this?
Cc: Jan Kara <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Paul E. McKenney <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
b/fs/super.c | 63 ++++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 39 insertions(+), 24 deletions(-)
diff -puN fs/super.c~rcu-__sb_start_write fs/super.c
--- a/fs/super.c~rcu-__sb_start_write 2015-06-24 17:14:34.939125713 -0700
+++ b/fs/super.c 2015-06-24 17:14:34.942125847 -0700
@@ -1190,27 +1190,21 @@ static void acquire_freeze_lock(struct s
*/
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
-retry:
- if (unlikely(sb->s_writers.frozen >= level)) {
+ rcu_read_lock();
+ while (unlikely(sb->s_writers.frozen >= level)) {
+ rcu_read_unlock();
if (!wait)
return 0;
wait_event(sb->s_writers.wait_unfrozen,
sb->s_writers.frozen < level);
+ rcu_read_lock();
}
#ifdef CONFIG_LOCKDEP
acquire_freeze_lock(sb, level, !wait, _RET_IP_);
#endif
percpu_counter_inc(&sb->s_writers.counter[level-1]);
- /*
- * Make sure counter is updated before we check for frozen.
- * freeze_super() first sets frozen and then checks the counter.
- */
- smp_mb();
- if (unlikely(sb->s_writers.frozen >= level)) {
- __sb_end_write(sb, level);
- goto retry;
- }
+ rcu_read_unlock();
return 1;
}
EXPORT_SYMBOL(__sb_start_write);
@@ -1254,6 +1248,29 @@ static void sb_wait_write(struct super_b
} while (writers);
}
+static void __thaw_super(struct super_block *sb)
+{
+ sb->s_writers.frozen = SB_UNFROZEN;
+ /*
+ * RCU protects us against races where we are taking
+ * s_writers.frozen in to a less permissive state. When
+ * that happens, __sb_start_write() might not yet have
+ * seen our write and might still increment
+ * s_writers.counter.
+ *
+ * Here, however, we are transitioning to a _more_
+ * permissive state. The filesystem is frozen and no
+ * writes to s_writers.counter are being permitted.
+ *
+ * A smp_wmb() is sufficient here because we just need
+ * to ensure that new calls to __sb_start_write() are
+ * allowed, not that _concurrent_ calls have finished.
+ */
+ smp_wmb();
+ wake_up(&sb->s_writers.wait_unfrozen);
+ deactivate_locked_super(sb);
+}
+
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
@@ -1312,7 +1329,13 @@ int freeze_super(struct super_block *sb)
/* From now on, no new normal writers can start */
sb->s_writers.frozen = SB_FREEZE_WRITE;
- smp_wmb();
+ /*
+ * After we synchronize_rcu(), we have ensured that everyone
+ * who reads sb->s_writers.frozen under rcu_read_lock() can
+ * now see our update. This pretty much means that
+ * __sb_start_write() will not allow any new writers.
+ */
+ synchronize_rcu();
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
up_write(&sb->s_umount);
@@ -1322,7 +1345,7 @@ int freeze_super(struct super_block *sb)
/* Now we go and block page faults... */
down_write(&sb->s_umount);
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
- smp_wmb();
+ synchronize_rcu();
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
@@ -1331,7 +1354,7 @@ int freeze_super(struct super_block *sb)
/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
- smp_wmb();
+ synchronize_rcu();
sb_wait_write(sb, SB_FREEZE_FS);
if (sb->s_op->freeze_fs) {
@@ -1339,11 +1362,7 @@ int freeze_super(struct super_block *sb)
if (ret) {
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
- sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
- wake_up(&sb->s_writers.wait_unfrozen);
- deactivate_locked_super(sb);
- return ret;
+ __thaw_super(sb);
}
}
/*
@@ -1386,11 +1405,7 @@ int thaw_super(struct super_block *sb)
}
out:
- sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
- wake_up(&sb->s_writers.wait_unfrozen);
- deactivate_locked_super(sb);
-
+ __thaw_super(sb);
return 0;
}
EXPORT_SYMBOL(thaw_super);
_
From: Dave Hansen <[email protected]>
If I sit in a loop and do write()s to small tmpfs files,
__sb_end_write() is third-hottest kernel function due to its
smp_mb().
__sb_end_write() uses the barrier to avoid races with freeze_super()
and its calls to sb_wait_write(). But, now that freeze_super() is
calling synchronize_rcu() before each sb_wait_write() call, we can
use that to our advantage.
The synchronize_rcu() ensures that all __sb_end_write() calls
will see freeze_super()'s updates to s_writers.frozen. That, in
turn, guarantees that __sb_end_write() will try to wake up any
subsequent call by freeze_super() to sb_wait_write().
Cc: Jan Kara <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Paul E. McKenney <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
b/fs/super.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff -puN fs/super.c~selectively-do-barriers-in-__sb_end_write fs/super.c
--- a/fs/super.c~selectively-do-barriers-in-__sb_end_write 2015-06-24 17:14:35.315142611 -0700
+++ b/fs/super.c 2015-06-24 17:14:35.318142745 -0700
@@ -1146,14 +1146,23 @@ out:
*/
void __sb_end_write(struct super_block *sb, int level)
{
+ rcu_read_lock();
percpu_counter_dec(&sb->s_writers.counter[level-1]);
/*
- * Make sure s_writers are updated before we wake up waiters in
- * freeze_super().
+ * We are racing here with freeze_super()'s calls to
+ * sb_wait_write(). We want to ensure that we call
+ * wake_up() whenever one of those calls _might_ be
+ * in sb_wait_write().
+ *
+ * Since freeze_super() does a synchronize_rcu() before
+ * each of its sb_wait_write() calls, it can guarantee
+ * that it sees our update to s_writers.counter as well
+ * as that we see its update to s_writers.frozen.
*/
- smp_mb();
- if (waitqueue_active(&sb->s_writers.wait))
+ if (unlikely(sb->s_writers.frozen >= level))
wake_up(&sb->s_writers.wait);
+ rcu_read_unlock();
+
rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
}
EXPORT_SYMBOL(__sb_end_write);
_
From: Dave Hansen <[email protected]>
Use the new 'struct fsnotify_head' for the vfsmount fsnotify data,
just like we did for inodes in the last patch.
Cc: Jan Kara <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Paul E. McKenney <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
b/fs/mount.h | 6 ++----
b/fs/namespace.c | 2 +-
b/fs/notify/fanotify/fanotify_user.c | 4 ++--
b/fs/notify/fsnotify.c | 8 ++++----
b/fs/notify/vfsmount_mark.c | 14 +++++++-------
5 files changed, 16 insertions(+), 18 deletions(-)
diff -puN fs/mount.h~fsnotify_head_mnt fs/mount.h
--- a/fs/mount.h~fsnotify_head_mnt 2015-06-24 17:14:36.276185800 -0700
+++ b/fs/mount.h 2015-06-24 17:14:36.286186250 -0700
@@ -3,6 +3,7 @@
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
+#include <linux/fsnotify_head.h>
struct mnt_namespace {
atomic_t count;
@@ -55,10 +56,7 @@ struct mount {
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
-#ifdef CONFIG_FSNOTIFY
- struct hlist_head mnt_fsnotify_marks;
- __u32 mnt_fsnotify_mask;
-#endif
+ struct fsnotify_head mnt_fsnotify;
int mnt_id; /* mount identifier */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
diff -puN fs/namespace.c~fsnotify_head_mnt fs/namespace.c
--- a/fs/namespace.c~fsnotify_head_mnt 2015-06-24 17:14:36.278185890 -0700
+++ b/fs/namespace.c 2015-06-24 17:14:36.287186295 -0700
@@ -235,7 +235,7 @@ static struct mount *alloc_vfsmnt(const
INIT_LIST_HEAD(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
#ifdef CONFIG_FSNOTIFY
- INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+ INIT_HLIST_HEAD(&mnt->mnt_fsnotify.marks);
#endif
init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
}
diff -puN fs/notify/fanotify/fanotify_user.c~fsnotify_head_mnt fs/notify/fanotify/fanotify_user.c
--- a/fs/notify/fanotify/fanotify_user.c~fsnotify_head_mnt 2015-06-24 17:14:36.279185935 -0700
+++ b/fs/notify/fanotify/fanotify_user.c 2015-06-24 17:14:36.287186295 -0700
@@ -533,7 +533,7 @@ static int fanotify_remove_vfsmount_mark
mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
- if (removed & real_mount(mnt)->mnt_fsnotify_mask)
+ if (removed & real_mount(mnt)->mnt_fsnotify.mask)
fsnotify_recalc_vfsmount_mask(mnt);
return 0;
@@ -641,7 +641,7 @@ static int fanotify_add_vfsmount_mark(st
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
mutex_unlock(&group->mark_mutex);
- if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
+ if (added & ~real_mount(mnt)->mnt_fsnotify.mask)
fsnotify_recalc_vfsmount_mask(mnt);
fsnotify_put_mark(fsn_mark);
diff -puN fs/notify/fsnotify.c~fsnotify_head_mnt fs/notify/fsnotify.c
--- a/fs/notify/fsnotify.c~fsnotify_head_mnt 2015-06-24 17:14:36.281186025 -0700
+++ b/fs/notify/fsnotify.c 2015-06-24 17:14:36.288186339 -0700
@@ -211,7 +211,7 @@ int fsnotify(struct inode *to_tell, __u3
*/
if (!(mask & FS_MODIFY) &&
!(test_mask & to_tell->i_fsnotify.mask) &&
- !(mnt && test_mask & mnt->mnt_fsnotify_mask))
+ !(mnt && test_mask & mnt->mnt_fsnotify.mask))
return 0;
/*
* Optimization: srcu_read_lock() has a memory barrier which can
@@ -221,7 +221,7 @@ int fsnotify(struct inode *to_tell, __u3
* need SRCU to keep them "alive".
*/
if (!to_tell->i_fsnotify.marks.first &&
- (!mnt || !mnt->mnt_fsnotify_marks.first))
+ (!mnt || !mnt->mnt_fsnotify.marks.first))
return 0;
idx = srcu_read_lock(&fsnotify_mark_srcu);
@@ -232,8 +232,8 @@ int fsnotify(struct inode *to_tell, __u3
&fsnotify_mark_srcu);
if (mnt && ((mask & FS_MODIFY) ||
- (test_mask & mnt->mnt_fsnotify_mask))) {
- vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+ (test_mask & mnt->mnt_fsnotify.mask))) {
+ vfsmount_node = srcu_dereference(mnt->mnt_fsnotify.marks.first,
&fsnotify_mark_srcu);
inode_node = srcu_dereference(to_tell->i_fsnotify.marks.first,
&fsnotify_mark_srcu);
diff -puN fs/notify/vfsmount_mark.c~fsnotify_head_mnt fs/notify/vfsmount_mark.c
--- a/fs/notify/vfsmount_mark.c~fsnotify_head_mnt 2015-06-24 17:14:36.282186070 -0700
+++ b/fs/notify/vfsmount_mark.c 2015-06-24 17:14:36.288186339 -0700
@@ -38,7 +38,7 @@ void fsnotify_clear_marks_by_mount(struc
LIST_HEAD(free_list);
spin_lock(&mnt->mnt_root->d_lock);
- hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
+ hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify.marks, obj_list) {
list_add(&mark->free_list, &free_list);
hlist_del_init_rcu(&mark->obj_list);
fsnotify_get_mark(mark);
@@ -54,7 +54,7 @@ void fsnotify_clear_vfsmount_marks_by_gr
}
/*
- * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
+ * Recalculate the mnt->mnt_fsnotify.mask, or the mask of all FS_* event types
* any notifier is interested in hearing for this mount point
*/
void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
@@ -62,7 +62,7 @@ void fsnotify_recalc_vfsmount_mask(struc
struct mount *m = real_mount(mnt);
spin_lock(&mnt->mnt_root->d_lock);
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
+ m->mnt_fsnotify.mask = fsnotify_recalc_mask(&m->mnt_fsnotify.marks);
spin_unlock(&mnt->mnt_root->d_lock);
}
@@ -79,7 +79,7 @@ void fsnotify_destroy_vfsmount_mark(stru
hlist_del_init_rcu(&mark->obj_list);
mark->mnt = NULL;
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
+ m->mnt_fsnotify.mask = fsnotify_recalc_mask(&m->mnt_fsnotify.marks);
spin_unlock(&mnt->mnt_root->d_lock);
}
@@ -94,7 +94,7 @@ struct fsnotify_mark *fsnotify_find_vfsm
struct fsnotify_mark *mark;
spin_lock(&mnt->mnt_root->d_lock);
- mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
+ mark = fsnotify_find_mark(&m->mnt_fsnotify.marks, group);
spin_unlock(&mnt->mnt_root->d_lock);
return mark;
@@ -119,8 +119,8 @@ int fsnotify_add_vfsmount_mark(struct fs
spin_lock(&mnt->mnt_root->d_lock);
mark->mnt = mnt;
- ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups);
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
+ ret = fsnotify_add_mark_list(&m->mnt_fsnotify.marks, mark, allow_dups);
+ m->mnt_fsnotify.mask = fsnotify_recalc_mask(&m->mnt_fsnotify.marks);
spin_unlock(&mnt->mnt_root->d_lock);
return ret;
_
From: Dave Hansen <[email protected]>
Both inodes and vfsmounts have fsnotify data embedded in them.
The data is always a "mask" and a "mark". Take those two
fields and move them in to a new structure: struct fsnotify_head.
We will shortly be adding a new field to this.
This also lets us get rid of the ugly #ifdef in 'struct inode'.
In searching for i_fsnotify_mark, my regex found the fsnotify_mark
comment about g_list. I think the comment was wrong and corrected
it.
Cc: Jan Kara <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Paul E. McKenney <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
b/fs/inode.c | 4 ++--
b/fs/notify/fanotify/fanotify_user.c | 4 ++--
b/fs/notify/fsnotify.c | 12 ++++++------
b/fs/notify/inode_mark.c | 18 +++++++++---------
b/fs/notify/inotify/inotify_user.c | 2 +-
b/include/linux/fs.h | 6 ++----
b/include/linux/fsnotify_backend.h | 8 ++++----
b/include/linux/fsnotify_head.h | 17 +++++++++++++++++
b/kernel/auditsc.c | 4 ++--
9 files changed, 45 insertions(+), 30 deletions(-)
diff -puN fs/inode.c~fsnotify_head fs/inode.c
--- a/fs/inode.c~fsnotify_head 2015-06-24 17:14:35.694159644 -0700
+++ b/fs/inode.c 2015-06-24 17:14:35.711160408 -0700
@@ -181,7 +181,7 @@ int inode_init_always(struct super_block
#endif
#ifdef CONFIG_FSNOTIFY
- inode->i_fsnotify_mask = 0;
+ inode->i_fsnotify.mask = 0;
#endif
inode->i_flctx = NULL;
this_cpu_inc(nr_inodes);
@@ -363,7 +363,7 @@ void inode_init_once(struct inode *inode
address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
#ifdef CONFIG_FSNOTIFY
- INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
+ INIT_HLIST_HEAD(&inode->i_fsnotify.marks);
#endif
}
EXPORT_SYMBOL(inode_init_once);
diff -puN fs/notify/fanotify/fanotify_user.c~fsnotify_head fs/notify/fanotify/fanotify_user.c
--- a/fs/notify/fanotify/fanotify_user.c~fsnotify_head 2015-06-24 17:14:35.696159734 -0700
+++ b/fs/notify/fanotify/fanotify_user.c 2015-06-24 17:14:35.712160453 -0700
@@ -562,7 +562,7 @@ static int fanotify_remove_inode_mark(st
/* matches the fsnotify_find_inode_mark() */
fsnotify_put_mark(fsn_mark);
- if (removed & inode->i_fsnotify_mask)
+ if (removed & inode->i_fsnotify.mask)
fsnotify_recalc_inode_mask(inode);
return 0;
@@ -679,7 +679,7 @@ static int fanotify_add_inode_mark(struc
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
mutex_unlock(&group->mark_mutex);
- if (added & ~inode->i_fsnotify_mask)
+ if (added & ~inode->i_fsnotify.mask)
fsnotify_recalc_inode_mask(inode);
fsnotify_put_mark(fsn_mark);
diff -puN fs/notify/fsnotify.c~fsnotify_head fs/notify/fsnotify.c
--- a/fs/notify/fsnotify.c~fsnotify_head 2015-06-24 17:14:35.697159779 -0700
+++ b/fs/notify/fsnotify.c 2015-06-24 17:14:35.712160453 -0700
@@ -104,7 +104,7 @@ int __fsnotify_parent(struct path *path,
if (unlikely(!fsnotify_inode_watches_children(p_inode)))
__fsnotify_update_child_dentry_flags(p_inode);
- else if (p_inode->i_fsnotify_mask & mask) {
+ else if (p_inode->i_fsnotify.mask & mask) {
/* we are notifying a parent so come up with the new mask which
* specifies these are events which came from a child. */
mask |= FS_EVENT_ON_CHILD;
@@ -210,7 +210,7 @@ int fsnotify(struct inode *to_tell, __u3
* this type of event.
*/
if (!(mask & FS_MODIFY) &&
- !(test_mask & to_tell->i_fsnotify_mask) &&
+ !(test_mask & to_tell->i_fsnotify.mask) &&
!(mnt && test_mask & mnt->mnt_fsnotify_mask))
return 0;
/*
@@ -220,22 +220,22 @@ int fsnotify(struct inode *to_tell, __u3
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (!to_tell->i_fsnotify_marks.first &&
+ if (!to_tell->i_fsnotify.marks.first &&
(!mnt || !mnt->mnt_fsnotify_marks.first))
return 0;
idx = srcu_read_lock(&fsnotify_mark_srcu);
if ((mask & FS_MODIFY) ||
- (test_mask & to_tell->i_fsnotify_mask))
- inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+ (test_mask & to_tell->i_fsnotify.mask))
+ inode_node = srcu_dereference(to_tell->i_fsnotify.marks.first,
&fsnotify_mark_srcu);
if (mnt && ((mask & FS_MODIFY) ||
(test_mask & mnt->mnt_fsnotify_mask))) {
vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
&fsnotify_mark_srcu);
- inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+ inode_node = srcu_dereference(to_tell->i_fsnotify.marks.first,
&fsnotify_mark_srcu);
}
diff -puN fs/notify/inode_mark.c~fsnotify_head fs/notify/inode_mark.c
--- a/fs/notify/inode_mark.c~fsnotify_head 2015-06-24 17:14:35.699159868 -0700
+++ b/fs/notify/inode_mark.c 2015-06-24 17:14:35.712160453 -0700
@@ -31,13 +31,13 @@
#include "../internal.h"
/*
- * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
+ * Recalculate the inode->i_fsnotify.mask, or the mask of all FS_* event types
* any notifier is interested in hearing for this inode.
*/
void fsnotify_recalc_inode_mask(struct inode *inode)
{
spin_lock(&inode->i_lock);
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
+ inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
spin_unlock(&inode->i_lock);
__fsnotify_update_child_dentry_flags(inode);
@@ -56,11 +56,11 @@ void fsnotify_destroy_inode_mark(struct
mark->inode = NULL;
/*
- * this mark is now off the inode->i_fsnotify_marks list and we
+ * this mark is now off the inode->i_fsnotify.marks list and we
* hold the inode->i_lock, so this is the perfect time to update the
- * inode->i_fsnotify_mask
+ * inode->i_fsnotify.mask
*/
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
+ inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
spin_unlock(&inode->i_lock);
}
@@ -74,7 +74,7 @@ void fsnotify_clear_marks_by_inode(struc
LIST_HEAD(free_list);
spin_lock(&inode->i_lock);
- hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
+ hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify.marks, obj_list) {
list_add(&mark->free_list, &free_list);
hlist_del_init_rcu(&mark->obj_list);
fsnotify_get_mark(mark);
@@ -102,7 +102,7 @@ struct fsnotify_mark *fsnotify_find_inod
struct fsnotify_mark *mark;
spin_lock(&inode->i_lock);
- mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
+ mark = fsnotify_find_mark(&inode->i_fsnotify.marks, group);
spin_unlock(&inode->i_lock);
return mark;
@@ -153,9 +153,9 @@ int fsnotify_add_inode_mark(struct fsnot
spin_lock(&inode->i_lock);
mark->inode = inode;
- ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
+ ret = fsnotify_add_mark_list(&inode->i_fsnotify.marks, mark,
allow_dups);
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
+ inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
spin_unlock(&inode->i_lock);
return ret;
diff -puN fs/notify/inotify/inotify_user.c~fsnotify_head fs/notify/inotify/inotify_user.c
--- a/fs/notify/inotify/inotify_user.c~fsnotify_head 2015-06-24 17:14:35.701159959 -0700
+++ b/fs/notify/inotify/inotify_user.c 2015-06-24 17:14:35.713160498 -0700
@@ -547,7 +547,7 @@ static int inotify_update_existing_watch
/* more bits in old than in new? */
int dropped = (old_mask & ~new_mask);
/* more bits in this fsn_mark than the inode's mask? */
- int do_inode = (new_mask & ~inode->i_fsnotify_mask);
+ int do_inode = (new_mask & ~inode->i_fsnotify.mask);
/* update the inode with this new fsn_mark */
if (dropped || do_inode)
diff -puN include/linux/fs.h~fsnotify_head include/linux/fs.h
--- a/include/linux/fs.h~fsnotify_head 2015-06-24 17:14:35.702160003 -0700
+++ b/include/linux/fs.h 2015-06-24 17:14:35.710160363 -0700
@@ -30,6 +30,7 @@
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/blk_types.h>
+#include <linux/fsnotify_head.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -660,10 +661,7 @@ struct inode {
__u32 i_generation;
-#ifdef CONFIG_FSNOTIFY
- __u32 i_fsnotify_mask; /* all events this inode cares about */
- struct hlist_head i_fsnotify_marks;
-#endif
+ struct fsnotify_head i_fsnotify;
void *i_private; /* fs or device private pointer */
};
diff -puN include/linux/fsnotify_backend.h~fsnotify_head include/linux/fsnotify_backend.h
--- a/include/linux/fsnotify_backend.h~fsnotify_head 2015-06-24 17:14:35.704160093 -0700
+++ b/include/linux/fsnotify_backend.h 2015-06-24 17:14:35.709160318 -0700
@@ -212,7 +212,7 @@ struct fsnotify_mark {
* in kernel that found and may be using this mark. */
atomic_t refcnt; /* active things looking at this mark */
struct fsnotify_group *group; /* group this mark is for */
- struct list_head g_list; /* list of marks by group->i_fsnotify_marks
+ struct list_head g_list; /* list of marks by group->marks_list
* Also reused for queueing mark into
* destroy_list when it's waiting for
* the end of SRCU period before it can
@@ -249,11 +249,11 @@ extern u32 fsnotify_get_cookie(void);
static inline int fsnotify_inode_watches_children(struct inode *inode)
{
/* FS_EVENT_ON_CHILD is set if the inode may care */
- if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD))
+ if (!(inode->i_fsnotify.mask & FS_EVENT_ON_CHILD))
return 0;
/* this inode might care about child events, does it care about the
* specific set of events that can happen on a child? */
- return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD;
+ return inode->i_fsnotify.mask & FS_EVENTS_POSS_ON_CHILD;
}
/*
@@ -326,7 +326,7 @@ extern struct fsnotify_event *fsnotify_r
/* run all marks associated with a vfsmount and update mnt->mnt_fsnotify_mask */
extern void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt);
-/* run all marks associated with an inode and update inode->i_fsnotify_mask */
+/* run all marks associated with an inode and update inode->i_fsnotify.mask */
extern void fsnotify_recalc_inode_mask(struct inode *inode);
extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark));
/* find (and take a reference) to a mark associated with group and inode */
diff -puN /dev/null include/linux/fsnotify_head.h
--- /dev/null 2015-06-17 12:44:31.632705131 -0700
+++ b/include/linux/fsnotify_head.h 2015-06-24 17:14:35.711160408 -0700
@@ -0,0 +1,17 @@
+#ifndef __LINUX_FSNOTIFY_HEAD_H
+#define __LINUX_FSNOTIFY_HEAD_H
+
+#include <linux/types.h>
+
+/*
+ * This gets embedded in vfsmounts and inodes.
+ */
+
+struct fsnotify_head {
+#ifdef CONFIG_FSNOTIFY
+ __u32 mask; /* all events this object cares about */
+ struct hlist_head marks;
+#endif
+};
+
+#endif /* __LINUX_FSNOTIFY_HEAD_H */
diff -puN kernel/auditsc.c~fsnotify_head kernel/auditsc.c
--- a/kernel/auditsc.c~fsnotify_head 2015-06-24 17:14:35.706160183 -0700
+++ b/kernel/auditsc.c 2015-06-24 17:14:35.714160543 -0700
@@ -1587,7 +1587,7 @@ static inline void handle_one(const stru
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
- if (likely(hlist_empty(&inode->i_fsnotify_marks)))
+ if (likely(hlist_empty(&inode->i_fsnotify.marks)))
return;
context = current->audit_context;
p = context->trees;
@@ -1630,7 +1630,7 @@ retry:
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d_backing_inode(d);
- if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
+ if (inode && unlikely(!hlist_empty(&inode->i_fsnotify.marks))) {
struct audit_chunk *chunk;
chunk = audit_tree_lookup(inode);
if (chunk) {
_
From: Dave Hansen <[email protected]>
fsnotify_recalc_mask() currently takes a list of fsnotify_marks
and calculates a mask from them. Now that we store the marks
and the masks together in a fsnotify_head, just pass the whole
thing in and have fsnotify_recalc_mask() set ->mask internally.
Cc: Jan Kara <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Paul E. McKenney <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
b/fs/notify/fsnotify.h | 5 +++--
b/fs/notify/inode_mark.c | 6 +++---
b/fs/notify/mark.c | 8 +++++---
b/fs/notify/vfsmount_mark.c | 6 +++---
4 files changed, 14 insertions(+), 11 deletions(-)
diff -puN fs/notify/fsnotify.h~fsnotify_recalc_mask-redo fs/notify/fsnotify.h
--- a/fs/notify/fsnotify.h~fsnotify_recalc_mask-redo 2015-06-24 17:14:36.757207417 -0700
+++ b/fs/notify/fsnotify.h 2015-06-24 17:14:36.766207822 -0700
@@ -3,6 +3,7 @@
#include <linux/list.h>
#include <linux/fsnotify.h>
+#include <linux/fsnotify_head.h>
#include <linux/srcu.h>
#include <linux/types.h>
@@ -12,8 +13,8 @@ extern void fsnotify_flush_notify(struct
/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;
-/* Calculate mask of events for a list of marks */
-extern u32 fsnotify_recalc_mask(struct hlist_head *head);
+/* Recalculate the fsnotify_head's mask from its marks */
+extern void fsnotify_recalc_mask(struct fsnotify_head *fsn);
/* compare two groups for sorting of marks lists */
extern int fsnotify_compare_groups(struct fsnotify_group *a,
diff -puN fs/notify/inode_mark.c~fsnotify_recalc_mask-redo fs/notify/inode_mark.c
--- a/fs/notify/inode_mark.c~fsnotify_recalc_mask-redo 2015-06-24 17:14:36.759207507 -0700
+++ b/fs/notify/inode_mark.c 2015-06-24 17:14:36.766207822 -0700
@@ -37,7 +37,7 @@
void fsnotify_recalc_inode_mask(struct inode *inode)
{
spin_lock(&inode->i_lock);
- inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
+ fsnotify_recalc_mask(&inode->i_fsnotify);
spin_unlock(&inode->i_lock);
__fsnotify_update_child_dentry_flags(inode);
@@ -60,7 +60,7 @@ void fsnotify_destroy_inode_mark(struct
* hold the inode->i_lock, so this is the perfect time to update the
* inode->i_fsnotify.mask
*/
- inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
+ fsnotify_recalc_mask(&inode->i_fsnotify);
spin_unlock(&inode->i_lock);
}
@@ -155,7 +155,7 @@ int fsnotify_add_inode_mark(struct fsnot
mark->inode = inode;
ret = fsnotify_add_mark_list(&inode->i_fsnotify.marks, mark,
allow_dups);
- inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
+ fsnotify_recalc_mask(&inode->i_fsnotify);
spin_unlock(&inode->i_lock);
return ret;
diff -puN fs/notify/mark.c~fsnotify_recalc_mask-redo fs/notify/mark.c
--- a/fs/notify/mark.c~fsnotify_recalc_mask-redo 2015-06-24 17:14:36.760207552 -0700
+++ b/fs/notify/mark.c 2015-06-24 17:14:36.765207777 -0700
@@ -89,6 +89,7 @@
#include <linux/atomic.h>
#include <linux/fsnotify_backend.h>
+#include <linux/fsnotify_head.h>
#include "fsnotify.h"
struct srcu_struct fsnotify_mark_srcu;
@@ -111,14 +112,15 @@ void fsnotify_put_mark(struct fsnotify_m
}
/* Calculate mask of events for a list of marks */
-u32 fsnotify_recalc_mask(struct hlist_head *head)
+void fsnotify_recalc_mask(struct fsnotify_head *fsn)
{
u32 new_mask = 0;
struct fsnotify_mark *mark;
- hlist_for_each_entry(mark, head, obj_list)
+ hlist_for_each_entry(mark, &fsn->marks, obj_list)
new_mask |= mark->mask;
- return new_mask;
+
+ fsn->mask = new_mask;
}
/*
diff -puN fs/notify/vfsmount_mark.c~fsnotify_recalc_mask-redo fs/notify/vfsmount_mark.c
--- a/fs/notify/vfsmount_mark.c~fsnotify_recalc_mask-redo 2015-06-24 17:14:36.762207642 -0700
+++ b/fs/notify/vfsmount_mark.c 2015-06-24 17:14:36.766207822 -0700
@@ -62,7 +62,7 @@ void fsnotify_recalc_vfsmount_mask(struc
struct mount *m = real_mount(mnt);
spin_lock(&mnt->mnt_root->d_lock);
- m->mnt_fsnotify.mask = fsnotify_recalc_mask(&m->mnt_fsnotify.marks);
+ fsnotify_recalc_mask(&m->mnt_fsnotify);
spin_unlock(&mnt->mnt_root->d_lock);
}
@@ -79,7 +79,7 @@ void fsnotify_destroy_vfsmount_mark(stru
hlist_del_init_rcu(&mark->obj_list);
mark->mnt = NULL;
- m->mnt_fsnotify.mask = fsnotify_recalc_mask(&m->mnt_fsnotify.marks);
+ fsnotify_recalc_mask(&m->mnt_fsnotify);
spin_unlock(&mnt->mnt_root->d_lock);
}
@@ -120,7 +120,7 @@ int fsnotify_add_vfsmount_mark(struct fs
spin_lock(&mnt->mnt_root->d_lock);
mark->mnt = mnt;
ret = fsnotify_add_mark_list(&m->mnt_fsnotify.marks, mark, allow_dups);
- m->mnt_fsnotify.mask = fsnotify_recalc_mask(&m->mnt_fsnotify.marks);
+ fsnotify_recalc_mask(&m->mnt_fsnotify);
spin_unlock(&mnt->mnt_root->d_lock);
return ret;
_
From: Dave Hansen <[email protected]>
According to Jan Kara:
You can have ignored mask set without any of the
notification masks set and you are expected to clear the
ignored mask on the first IN_MODIFY event.
But, the only way we currently have to go and find if we need to
do this ignored-mask-clearing is to go through the mark lists
and look for them. That mark list iteration requires an
srcu_read_lock() which has a memory barrier and can be expensive.
The calculation of 'has_ignore' is pretty cheap because we store
it next to another value which we are updating and we do it
inside of a loop we were already running.
This patch will really only matter when we have a workload where
a file is being modified often _and_ there is an active fsnotify
mark on it. Otherwise the checks against *_fsnotify.marks.first
will keep us out of the expensive srcu_read_lock() call.
Cc: Jan Kara <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Paul E. McKenney <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
b/fs/notify/fsnotify.c | 44 ++++++++++++++++++++++++++++++++++------
b/fs/notify/mark.c | 8 +++++--
b/include/linux/fsnotify_head.h | 1
3 files changed, 45 insertions(+), 8 deletions(-)
diff -puN fs/notify/fsnotify.c~fsnotify-ignore-present fs/notify/fsnotify.c
--- a/fs/notify/fsnotify.c~fsnotify-ignore-present 2015-06-24 17:14:37.187226743 -0700
+++ b/fs/notify/fsnotify.c 2015-06-24 17:14:37.194227057 -0700
@@ -183,6 +183,34 @@ static int send_to_group(struct inode *t
}
/*
+ * The "logical or" of all of the marks' ->mask is kept in the
+ * i/mnt_fsnotify.mask. We can check it instead of going
+ * through all of the marks. fsnotify_recalc_mask() does the
+ * updates.
+ */
+static int some_mark_is_interested(__u32 mask, struct inode *inode, struct mount *mnt)
+{
+ if (mask & inode->i_fsnotify.mask)
+ return 1;
+ if (mnt && (mask & mnt->mnt_fsnotify.mask))
+ return 1;
+ return 0;
+}
+
+/*
+ * fsnotify_recalc_mask() recalculates "has_ignore" whenever any
+ * mark's flags change.
+ */
+static int some_mark_needs_ignore_clear(struct inode *inode, struct mount *mnt)
+{
+ if (inode->i_fsnotify.has_ignore)
+ return 1;
+ if (mnt && mnt->mnt_fsnotify.has_ignore)
+ return 1;
+ return 0;
+}
+
+/*
* This is the main call to fsnotify. The VFS calls into hook specific functions
* in linux/fsnotify.h. Those functions then in turn call here. Here will call
* out to all of the registered fsnotify_group. Those groups can then use the
@@ -205,14 +233,18 @@ int fsnotify(struct inode *to_tell, __u3
mnt = NULL;
/*
- * if this is a modify event we may need to clear the ignored masks
- * otherwise return if neither the inode nor the vfsmount care about
- * this type of event.
+ * We must clear the (user-visible) ignored mask on the first IN_MODIFY
+ * event despite the 'mask' which is passed in here. But we can safely
+ * skip that step if we know there are no marks which need this action.
+ *
+ * We can also skip looking at the list of marks if we know that none
+ * of the marks are interested in the events in our 'mask'.
*/
- if (!(mask & FS_MODIFY) &&
- !(test_mask & to_tell->i_fsnotify.mask) &&
- !(mnt && test_mask & mnt->mnt_fsnotify.mask))
+ if ((mask & FS_MODIFY) && !some_mark_needs_ignore_clear(to_tell, mnt))
+ return 0;
+ else if (!some_mark_is_interested(test_mask, to_tell, mnt))
return 0;
+
/*
* Optimization: srcu_read_lock() has a memory barrier which can
* be expensive. It protects walking the *_fsnotify_marks lists.
diff -puN fs/notify/mark.c~fsnotify-ignore-present fs/notify/mark.c
--- a/fs/notify/mark.c~fsnotify-ignore-present 2015-06-24 17:14:37.189226832 -0700
+++ b/fs/notify/mark.c 2015-06-24 17:14:37.194227057 -0700
@@ -116,10 +116,14 @@ void fsnotify_recalc_mask(struct fsnotif
{
u32 new_mask = 0;
struct fsnotify_mark *mark;
+ u32 has_ignore = 0;
- hlist_for_each_entry(mark, &fsn->marks, obj_list)
+ hlist_for_each_entry(mark, &fsn->marks, obj_list) {
+ if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+ has_ignore = 1;
new_mask |= mark->mask;
-
+ }
+ fsn->has_ignore = has_ignore;
fsn->mask = new_mask;
}
diff -puN include/linux/fsnotify_head.h~fsnotify-ignore-present include/linux/fsnotify_head.h
--- a/include/linux/fsnotify_head.h~fsnotify-ignore-present 2015-06-24 17:14:37.190226877 -0700
+++ b/include/linux/fsnotify_head.h 2015-06-24 17:14:37.193227012 -0700
@@ -11,6 +11,7 @@ struct fsnotify_head {
#ifdef CONFIG_FSNOTIFY
__u32 mask; /* all events this object cares about */
struct hlist_head marks;
+ __u32 has_ignore; /* any mark has ignore set */
#endif
};
_
On Wed, 2015-06-24 at 17:16 -0700, Dave Hansen wrote:
> From: Dave Hansen <[email protected]>
>
> I have a _tiny_ microbenchmark that sits in a loop and writes
> single bytes to a file. Writing one byte to a tmpfs file is
> around 2x slower than reading one byte from a file, which is a
> _bit_ more than I expected. This is a dumb benchmark, but I think
> it's hard to deny that write() is a hot path and we should avoid
> unnecessary overhead there.
>
> I did a 'perf record' of 30-second samples of read and write.
> The top item in a diffprofile is srcu_read_lock() from
> fsnotify(). There are active inotify fd's from systemd, but
> nothing is actually listening to the file or its part of
> the filesystem.
>
> I *think* we can avoid taking the srcu_read_lock() for the
> common case where there are no actual marks on the file.
> This means that there will both be nothing to notify for
> *and* implies that there is no need for clearing the ignore
> mask.
>
> This patch gave a 13.8% speedup in writes/second on my test,
> which is an improvement from the 10.8% that I saw with the
> last version.
>
> Signed-off-by: Dave Hansen <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Jan Kara <[email protected]>
> Cc: Al Viro <[email protected]>
> Cc: Eric Paris <[email protected]>
> Cc: John McCutchan <[email protected]>
> Cc: Robert Love <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: [email protected]
> ---
>
> b/fs/notify/fsnotify.c | 10 ++++++++++
> 1 file changed, 10 insertions(+)
>
> diff -puN fs/notify/fsnotify.c~optimize-fsnotify fs/notify/fsnotify.c
> --- a/fs/notify/fsnotify.c~optimize-fsnotify 2015-06-24
> 17:14:34.573109264 -0700
> +++ b/fs/notify/fsnotify.c 2015-06-24 17:14:34.576109399 -0700
> @@ -213,6 +213,16 @@ int fsnotify(struct inode *to_tell, __u3
> !(test_mask & to_tell->i_fsnotify_mask) &&
> !(mnt && test_mask & mnt->mnt_fsnotify_mask))
> return 0;
> + /*
> + * Optimization: srcu_read_lock() has a memory barrier which
> can
> + * be expensive. It protects walking the *_fsnotify_marks
> lists.
> + * However, if we do not walk the lists, we do not have to
> do
> + * SRCU because we have no references to any objects and do
> not
> + * need SRCU to keep them "alive".
> + */
> + if (!to_tell->i_fsnotify_marks.first &&
> + (!mnt || !mnt->mnt_fsnotify_marks.first))
> + return 0;
two useless peeps from the old peanut gallery of long lost....
1) should you actually move this check up before the IN_MODIFY check?
This seems like it would be by far the most common case, and you'd save
yourself a bunch of useless conditionals/bit operations.
2) do you want to use hlist_empty(&to_tell->i_fsnotify_marks) instead,
for readability (and they are static inline, so compiled code is the
same)
It is fine as it is. Don't know how much you want to try to bikeshed...
On 06/24/2015 05:57 PM, Eric Paris wrote:
> On Wed, 2015-06-24 at 17:16 -0700, Dave Hansen wrote:
>> + if (!to_tell->i_fsnotify_marks.first &&
>> + (!mnt || !mnt->mnt_fsnotify_marks.first))
>> + return 0;
>
> two useless peeps from the old peanut gallery of long lost....
>
> 1) should you actually move this check up before the IN_MODIFY check?
> This seems like it would be by far the most common case, and you'd save
> yourself a bunch of useless conditionals/bit operations.
Doing that actually makes fsnotify() 82 bytes smaller for me. I think
you're also right that the new check is a much more general condition
than the existing one. I'll move it up when I post again.
> 2) do you want to use hlist_empty(&to_tell->i_fsnotify_marks) instead,
> for readability (and they are static inline, so compiled code is the
> same)
Yeah I guess that makes sense. The only thing that the current code is
nice for is that it makes it obvious that the .first checks match with
the srcu_dereference()s done below.
On Wed 24-06-15 17:16:05, Dave Hansen wrote:
>
> From: Dave Hansen <[email protected]>
>
> Currently, __sb_start_write() and freeze_super() can race with
> each other. __sb_start_write() uses a smp_mb() to ensure that
> freeze_super() can see its write to sb->s_writers.counter and
> that it can see freeze_super()'s update to sb->s_writers.frozen.
> This all seems to work fine.
>
> But, this smp_mb() makes __sb_start_write() the single hottest
> function in the kernel if I sit in a loop and do tiny write()s to
> tmpfs over and over. This is on a very small 2-core system, so
> it will only get worse on larger systems.
>
> This _seems_ like an ideal case for RCU. __sb_start_write() is
> the RCU read-side and is in a very fast, performance-sensitive
> path. freeze_super() is the RCU writer and is in an extremely
> rare non-performance-sensitive path.
>
> Instead of doing an smp_wmb() in __sb_start_write(), we do
> rcu_read_lock(). This ensures that a CPU doing freeze_super()
> can not proceed past its synchronize_rcu() until the grace
> period has ended and the 's_writers.frozen = SB_FREEZE_WRITE'
> is visible to __sb_start_write().
>
> One question here: Does the work that __sb_start_write() does in
> a previous grace period becomes visible to freeze_super() after
> its call to synchronize_rcu()? It _seems_ like it should, but it
> seems backwards to me since __sb_start_write() is the "reader" in
> this case.
I believe yes. Because all accesses (be it reads or writes) must finish
before the current RCU period finishes. And synchronize_rcu() must make
sure that any code (loads / stores) after it execute only after the RCU
period has finished...
The patch looks good to me. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> This patch increases the number of writes/second that I can do
> by 5.6%.
>
> Does anybody see any holes with this?
>
> Cc: Jan Kara <[email protected]>
> Cc: Alexander Viro <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: Paul E. McKenney <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: Dave Hansen <[email protected]>
> ---
>
> b/fs/super.c | 63 ++++++++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 39 insertions(+), 24 deletions(-)
>
> diff -puN fs/super.c~rcu-__sb_start_write fs/super.c
> --- a/fs/super.c~rcu-__sb_start_write 2015-06-24 17:14:34.939125713 -0700
> +++ b/fs/super.c 2015-06-24 17:14:34.942125847 -0700
> @@ -1190,27 +1190,21 @@ static void acquire_freeze_lock(struct s
> */
> int __sb_start_write(struct super_block *sb, int level, bool wait)
> {
> -retry:
> - if (unlikely(sb->s_writers.frozen >= level)) {
> + rcu_read_lock();
> + while (unlikely(sb->s_writers.frozen >= level)) {
> + rcu_read_unlock();
> if (!wait)
> return 0;
> wait_event(sb->s_writers.wait_unfrozen,
> sb->s_writers.frozen < level);
> + rcu_read_lock();
> }
>
> #ifdef CONFIG_LOCKDEP
> acquire_freeze_lock(sb, level, !wait, _RET_IP_);
> #endif
> percpu_counter_inc(&sb->s_writers.counter[level-1]);
> - /*
> - * Make sure counter is updated before we check for frozen.
> - * freeze_super() first sets frozen and then checks the counter.
> - */
> - smp_mb();
> - if (unlikely(sb->s_writers.frozen >= level)) {
> - __sb_end_write(sb, level);
> - goto retry;
> - }
> + rcu_read_unlock();
> return 1;
> }
> EXPORT_SYMBOL(__sb_start_write);
> @@ -1254,6 +1248,29 @@ static void sb_wait_write(struct super_b
> } while (writers);
> }
>
> +static void __thaw_super(struct super_block *sb)
> +{
> + sb->s_writers.frozen = SB_UNFROZEN;
> + /*
> + * RCU protects us against races where we are taking
> + * s_writers.frozen in to a less permissive state. When
> + * that happens, __sb_start_write() might not yet have
> + * seen our write and might still increment
> + * s_writers.counter.
> + *
> + * Here, however, we are transitioning to a _more_
> + * permissive state. The filesystem is frozen and no
> + * writes to s_writers.counter are being permitted.
> + *
> + * A smp_wmb() is sufficient here because we just need
> + * to ensure that new calls to __sb_start_write() are
> + * allowed, not that _concurrent_ calls have finished.
> + */
> + smp_wmb();
> + wake_up(&sb->s_writers.wait_unfrozen);
> + deactivate_locked_super(sb);
> +}
> +
> /**
> * freeze_super - lock the filesystem and force it into a consistent state
> * @sb: the super to lock
> @@ -1312,7 +1329,13 @@ int freeze_super(struct super_block *sb)
>
> /* From now on, no new normal writers can start */
> sb->s_writers.frozen = SB_FREEZE_WRITE;
> - smp_wmb();
> + /*
> + * After we synchronize_rcu(), we have ensured that everyone
> + * who reads sb->s_writers.frozen under rcu_read_lock() can
> + * now see our update. This pretty much means that
> + * __sb_start_write() will not allow any new writers.
> + */
> + synchronize_rcu();
>
> /* Release s_umount to preserve sb_start_write -> s_umount ordering */
> up_write(&sb->s_umount);
> @@ -1322,7 +1345,7 @@ int freeze_super(struct super_block *sb)
> /* Now we go and block page faults... */
> down_write(&sb->s_umount);
> sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
> - smp_wmb();
> + synchronize_rcu();
>
> sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
>
> @@ -1331,7 +1354,7 @@ int freeze_super(struct super_block *sb)
>
> /* Now wait for internal filesystem counter */
> sb->s_writers.frozen = SB_FREEZE_FS;
> - smp_wmb();
> + synchronize_rcu();
> sb_wait_write(sb, SB_FREEZE_FS);
>
> if (sb->s_op->freeze_fs) {
> @@ -1339,11 +1362,7 @@ int freeze_super(struct super_block *sb)
> if (ret) {
> printk(KERN_ERR
> "VFS:Filesystem freeze failed\n");
> - sb->s_writers.frozen = SB_UNFROZEN;
> - smp_wmb();
> - wake_up(&sb->s_writers.wait_unfrozen);
> - deactivate_locked_super(sb);
> - return ret;
> + __thaw_super(sb);
> }
> }
> /*
> @@ -1386,11 +1405,7 @@ int thaw_super(struct super_block *sb)
> }
>
> out:
> - sb->s_writers.frozen = SB_UNFROZEN;
> - smp_wmb();
> - wake_up(&sb->s_writers.wait_unfrozen);
> - deactivate_locked_super(sb);
> -
> + __thaw_super(sb);
> return 0;
> }
> EXPORT_SYMBOL(thaw_super);
> _
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 24-06-15 17:16:06, Dave Hansen wrote:
>
> From: Dave Hansen <[email protected]>
This has nothing to do with fsnotify so just remove it from the subject
line please. Thanks!
> If I sit in a loop and do write()s to small tmpfs files,
> __sb_end_write() is third-hottest kernel function due to its
> smp_mb().
>
> __sb_end_write() uses the barrier to avoid races with freeze_super()
> and its calls to sb_wait_write(). But, now that freeze_super() is
> calling synchronize_rcu() before each sb_wait_write() call, we can
> use that to our advantage.
>
> The synchronize_rcu() ensures that all __sb_end_write() will see
> freeze_super()'s updates to s_writers.counter. That, in turn,
> guarantees that __sb_end_write() will try to wake up any subsequent
> call by freeze_super() to sb_wait_write().
What gains does this patch bring?
Otherwise the patch looks good to me. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> Cc: Jan Kara <[email protected]>
> Cc: Alexander Viro <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: Paul E. McKenney <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: Dave Hansen <[email protected]>
> ---
>
> b/fs/super.c | 17 +++++++++++++----
> 1 file changed, 13 insertions(+), 4 deletions(-)
>
> diff -puN fs/super.c~selectively-do-barriers-in-__sb_end_write fs/super.c
> --- a/fs/super.c~selectively-do-barriers-in-__sb_end_write 2015-06-24 17:14:35.315142611 -0700
> +++ b/fs/super.c 2015-06-24 17:14:35.318142745 -0700
> @@ -1146,14 +1146,23 @@ out:
> */
> void __sb_end_write(struct super_block *sb, int level)
> {
> + rcu_read_lock();
> percpu_counter_dec(&sb->s_writers.counter[level-1]);
> /*
> - * Make sure s_writers are updated before we wake up waiters in
> - * freeze_super().
> + * We are racing here with freeze_super()'s calls to
> + * sb_wait_write(). We want to ensure that we call
> + * wake_up() whenever one of those calls _might_ be
> + * in sb_wait_write().
> + *
> + * Since freeze_super() does a synchronize_rcu() before
> + * each of its sb_wait_write() calls, it can guarantee
> + * that it sees our update to s_writers.counter as well
> + * as that we see its update to s_writers.frozen.
> */
> - smp_mb();
> - if (waitqueue_active(&sb->s_writers.wait))
> + if (unlikely(sb->s_writers.frozen >= level))
> wake_up(&sb->s_writers.wait);
> + rcu_read_unlock();
> +
> rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
> }
> EXPORT_SYMBOL(__sb_end_write);
> _
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 24-06-15 17:16:07, Dave Hansen wrote:
>
> From: Dave Hansen <[email protected]>
>
> Both inodes and vfsmounts have fsnotify data embedded in them.
> The data is always a "mask" and a "mark". Take those two
> fields and move them in to a new structure: struct fsnotify_head.
>
> We will shortly be adding a new field to this.
>
> This also lets us get rid of the ugly #ifdef in 'struct inode'.
>
> In searching for i_fsnotify_mark, my regex found the fsnotify_mark
> comment about g_list. I think the comment was wrong and corrected
> it.
Umm, doesn't this grow struct inode due to padding? I'm not sure whether the
compiler is clever enough to leave the first 32-bit variable only 32-bit
aligned when it is inside a struct (at least my quick test seems to show it
isn't)...
Honza
>
> Cc: Jan Kara <[email protected]>
> Cc: Alexander Viro <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: Paul E. McKenney <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: Dave Hansen <[email protected]>
> ---
>
> b/fs/inode.c | 4 ++--
> b/fs/notify/fanotify/fanotify_user.c | 4 ++--
> b/fs/notify/fsnotify.c | 12 ++++++------
> b/fs/notify/inode_mark.c | 18 +++++++++---------
> b/fs/notify/inotify/inotify_user.c | 2 +-
> b/include/linux/fs.h | 6 ++----
> b/include/linux/fsnotify_backend.h | 8 ++++----
> b/include/linux/fsnotify_head.h | 17 +++++++++++++++++
> b/kernel/auditsc.c | 4 ++--
> 9 files changed, 45 insertions(+), 30 deletions(-)
>
> diff -puN fs/inode.c~fsnotify_head fs/inode.c
> --- a/fs/inode.c~fsnotify_head 2015-06-24 17:14:35.694159644 -0700
> +++ b/fs/inode.c 2015-06-24 17:14:35.711160408 -0700
> @@ -181,7 +181,7 @@ int inode_init_always(struct super_block
> #endif
>
> #ifdef CONFIG_FSNOTIFY
> - inode->i_fsnotify_mask = 0;
> + inode->i_fsnotify.mask = 0;
> #endif
> inode->i_flctx = NULL;
> this_cpu_inc(nr_inodes);
> @@ -363,7 +363,7 @@ void inode_init_once(struct inode *inode
> address_space_init_once(&inode->i_data);
> i_size_ordered_init(inode);
> #ifdef CONFIG_FSNOTIFY
> - INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
> + INIT_HLIST_HEAD(&inode->i_fsnotify.marks);
> #endif
> }
> EXPORT_SYMBOL(inode_init_once);
> diff -puN fs/notify/fanotify/fanotify_user.c~fsnotify_head fs/notify/fanotify/fanotify_user.c
> --- a/fs/notify/fanotify/fanotify_user.c~fsnotify_head 2015-06-24 17:14:35.696159734 -0700
> +++ b/fs/notify/fanotify/fanotify_user.c 2015-06-24 17:14:35.712160453 -0700
> @@ -562,7 +562,7 @@ static int fanotify_remove_inode_mark(st
>
> /* matches the fsnotify_find_inode_mark() */
> fsnotify_put_mark(fsn_mark);
> - if (removed & inode->i_fsnotify_mask)
> + if (removed & inode->i_fsnotify.mask)
> fsnotify_recalc_inode_mask(inode);
>
> return 0;
> @@ -679,7 +679,7 @@ static int fanotify_add_inode_mark(struc
> added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
> mutex_unlock(&group->mark_mutex);
>
> - if (added & ~inode->i_fsnotify_mask)
> + if (added & ~inode->i_fsnotify.mask)
> fsnotify_recalc_inode_mask(inode);
>
> fsnotify_put_mark(fsn_mark);
> diff -puN fs/notify/fsnotify.c~fsnotify_head fs/notify/fsnotify.c
> --- a/fs/notify/fsnotify.c~fsnotify_head 2015-06-24 17:14:35.697159779 -0700
> +++ b/fs/notify/fsnotify.c 2015-06-24 17:14:35.712160453 -0700
> @@ -104,7 +104,7 @@ int __fsnotify_parent(struct path *path,
>
> if (unlikely(!fsnotify_inode_watches_children(p_inode)))
> __fsnotify_update_child_dentry_flags(p_inode);
> - else if (p_inode->i_fsnotify_mask & mask) {
> + else if (p_inode->i_fsnotify.mask & mask) {
> /* we are notifying a parent so come up with the new mask which
> * specifies these are events which came from a child. */
> mask |= FS_EVENT_ON_CHILD;
> @@ -210,7 +210,7 @@ int fsnotify(struct inode *to_tell, __u3
> * this type of event.
> */
> if (!(mask & FS_MODIFY) &&
> - !(test_mask & to_tell->i_fsnotify_mask) &&
> + !(test_mask & to_tell->i_fsnotify.mask) &&
> !(mnt && test_mask & mnt->mnt_fsnotify_mask))
> return 0;
> /*
> @@ -220,22 +220,22 @@ int fsnotify(struct inode *to_tell, __u3
> * SRCU because we have no references to any objects and do not
> * need SRCU to keep them "alive".
> */
> - if (!to_tell->i_fsnotify_marks.first &&
> + if (!to_tell->i_fsnotify.marks.first &&
> (!mnt || !mnt->mnt_fsnotify_marks.first))
> return 0;
>
> idx = srcu_read_lock(&fsnotify_mark_srcu);
>
> if ((mask & FS_MODIFY) ||
> - (test_mask & to_tell->i_fsnotify_mask))
> - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
> + (test_mask & to_tell->i_fsnotify.mask))
> + inode_node = srcu_dereference(to_tell->i_fsnotify.marks.first,
> &fsnotify_mark_srcu);
>
> if (mnt && ((mask & FS_MODIFY) ||
> (test_mask & mnt->mnt_fsnotify_mask))) {
> vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
> &fsnotify_mark_srcu);
> - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
> + inode_node = srcu_dereference(to_tell->i_fsnotify.marks.first,
> &fsnotify_mark_srcu);
> }
>
> diff -puN fs/notify/inode_mark.c~fsnotify_head fs/notify/inode_mark.c
> --- a/fs/notify/inode_mark.c~fsnotify_head 2015-06-24 17:14:35.699159868 -0700
> +++ b/fs/notify/inode_mark.c 2015-06-24 17:14:35.712160453 -0700
> @@ -31,13 +31,13 @@
> #include "../internal.h"
>
> /*
> - * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
> + * Recalculate the inode->i_fsnotify.mask, or the mask of all FS_* event types
> * any notifier is interested in hearing for this inode.
> */
> void fsnotify_recalc_inode_mask(struct inode *inode)
> {
> spin_lock(&inode->i_lock);
> - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
> + inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
> spin_unlock(&inode->i_lock);
>
> __fsnotify_update_child_dentry_flags(inode);
> @@ -56,11 +56,11 @@ void fsnotify_destroy_inode_mark(struct
> mark->inode = NULL;
>
> /*
> - * this mark is now off the inode->i_fsnotify_marks list and we
> + * this mark is now off the inode->i_fsnotify.marks list and we
> * hold the inode->i_lock, so this is the perfect time to update the
> - * inode->i_fsnotify_mask
> + * inode->i_fsnotify.mask
> */
> - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
> + inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
> spin_unlock(&inode->i_lock);
> }
>
> @@ -74,7 +74,7 @@ void fsnotify_clear_marks_by_inode(struc
> LIST_HEAD(free_list);
>
> spin_lock(&inode->i_lock);
> - hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
> + hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify.marks, obj_list) {
> list_add(&mark->free_list, &free_list);
> hlist_del_init_rcu(&mark->obj_list);
> fsnotify_get_mark(mark);
> @@ -102,7 +102,7 @@ struct fsnotify_mark *fsnotify_find_inod
> struct fsnotify_mark *mark;
>
> spin_lock(&inode->i_lock);
> - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
> + mark = fsnotify_find_mark(&inode->i_fsnotify.marks, group);
> spin_unlock(&inode->i_lock);
>
> return mark;
> @@ -153,9 +153,9 @@ int fsnotify_add_inode_mark(struct fsnot
>
> spin_lock(&inode->i_lock);
> mark->inode = inode;
> - ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
> + ret = fsnotify_add_mark_list(&inode->i_fsnotify.marks, mark,
> allow_dups);
> - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
> + inode->i_fsnotify.mask = fsnotify_recalc_mask(&inode->i_fsnotify.marks);
> spin_unlock(&inode->i_lock);
>
> return ret;
> diff -puN fs/notify/inotify/inotify_user.c~fsnotify_head fs/notify/inotify/inotify_user.c
> --- a/fs/notify/inotify/inotify_user.c~fsnotify_head 2015-06-24 17:14:35.701159959 -0700
> +++ b/fs/notify/inotify/inotify_user.c 2015-06-24 17:14:35.713160498 -0700
> @@ -547,7 +547,7 @@ static int inotify_update_existing_watch
> /* more bits in old than in new? */
> int dropped = (old_mask & ~new_mask);
> /* more bits in this fsn_mark than the inode's mask? */
> - int do_inode = (new_mask & ~inode->i_fsnotify_mask);
> + int do_inode = (new_mask & ~inode->i_fsnotify.mask);
>
> /* update the inode with this new fsn_mark */
> if (dropped || do_inode)
> diff -puN include/linux/fs.h~fsnotify_head include/linux/fs.h
> --- a/include/linux/fs.h~fsnotify_head 2015-06-24 17:14:35.702160003 -0700
> +++ b/include/linux/fs.h 2015-06-24 17:14:35.710160363 -0700
> @@ -30,6 +30,7 @@
> #include <linux/lockdep.h>
> #include <linux/percpu-rwsem.h>
> #include <linux/blk_types.h>
> +#include <linux/fsnotify_head.h>
>
> #include <asm/byteorder.h>
> #include <uapi/linux/fs.h>
> @@ -660,10 +661,7 @@ struct inode {
>
> __u32 i_generation;
>
> -#ifdef CONFIG_FSNOTIFY
> - __u32 i_fsnotify_mask; /* all events this inode cares about */
> - struct hlist_head i_fsnotify_marks;
> -#endif
> + struct fsnotify_head i_fsnotify;
>
> void *i_private; /* fs or device private pointer */
> };
> diff -puN include/linux/fsnotify_backend.h~fsnotify_head include/linux/fsnotify_backend.h
> --- a/include/linux/fsnotify_backend.h~fsnotify_head 2015-06-24 17:14:35.704160093 -0700
> +++ b/include/linux/fsnotify_backend.h 2015-06-24 17:14:35.709160318 -0700
> @@ -212,7 +212,7 @@ struct fsnotify_mark {
> * in kernel that found and may be using this mark. */
> atomic_t refcnt; /* active things looking at this mark */
> struct fsnotify_group *group; /* group this mark is for */
> - struct list_head g_list; /* list of marks by group->i_fsnotify_marks
> + struct list_head g_list; /* list of marks by group->marks_list
> * Also reused for queueing mark into
> * destroy_list when it's waiting for
> * the end of SRCU period before it can
> @@ -249,11 +249,11 @@ extern u32 fsnotify_get_cookie(void);
> static inline int fsnotify_inode_watches_children(struct inode *inode)
> {
> /* FS_EVENT_ON_CHILD is set if the inode may care */
> - if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD))
> + if (!(inode->i_fsnotify.mask & FS_EVENT_ON_CHILD))
> return 0;
> /* this inode might care about child events, does it care about the
> * specific set of events that can happen on a child? */
> - return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD;
> + return inode->i_fsnotify.mask & FS_EVENTS_POSS_ON_CHILD;
> }
>
> /*
> @@ -326,7 +326,7 @@ extern struct fsnotify_event *fsnotify_r
>
> /* run all marks associated with a vfsmount and update mnt->mnt_fsnotify_mask */
> extern void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt);
> -/* run all marks associated with an inode and update inode->i_fsnotify_mask */
> +/* run all marks associated with an inode and update inode->i_fsnotify.mask */
> extern void fsnotify_recalc_inode_mask(struct inode *inode);
> extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark));
> /* find (and take a reference) to a mark associated with group and inode */
> diff -puN /dev/null include/linux/fsnotify_head.h
> --- /dev/null 2015-06-17 12:44:31.632705131 -0700
> +++ b/include/linux/fsnotify_head.h 2015-06-24 17:14:35.711160408 -0700
> @@ -0,0 +1,17 @@
> +#ifndef __LINUX_FSNOTIFY_HEAD_H
> +#define __LINUX_FSNOTIFY_HEAD_H
> +
> +#include <linux/types.h>
> +
> +/*
> + * This gets embedded in vfsmounts and inodes.
> + */
> +
> +struct fsnotify_head {
> +#ifdef CONFIG_FSNOTIFY
> + __u32 mask; /* all events this object cares about */
> + struct hlist_head marks;
> +#endif
> +};
> +
> +#endif /* __LINUX_FSNOTIFY_HEAD_H */
> diff -puN kernel/auditsc.c~fsnotify_head kernel/auditsc.c
> --- a/kernel/auditsc.c~fsnotify_head 2015-06-24 17:14:35.706160183 -0700
> +++ b/kernel/auditsc.c 2015-06-24 17:14:35.714160543 -0700
> @@ -1587,7 +1587,7 @@ static inline void handle_one(const stru
> struct audit_tree_refs *p;
> struct audit_chunk *chunk;
> int count;
> - if (likely(hlist_empty(&inode->i_fsnotify_marks)))
> + if (likely(hlist_empty(&inode->i_fsnotify.marks)))
> return;
> context = current->audit_context;
> p = context->trees;
> @@ -1630,7 +1630,7 @@ retry:
> seq = read_seqbegin(&rename_lock);
> for(;;) {
> struct inode *inode = d_backing_inode(d);
> - if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
> + if (inode && unlikely(!hlist_empty(&inode->i_fsnotify.marks))) {
> struct audit_chunk *chunk;
> chunk = audit_tree_lookup(inode);
> if (chunk) {
> _
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 24-06-15 17:16:08, Dave Hansen wrote:
>
> From: Dave Hansen <[email protected]>
>
> According to Jan Kara:
>
> You can have ignored mask set without any of the
> notification masks set and you are expected to clear the
> ignored mask on the first IN_MODIFY event.
>
> But, the only way we currently have to go and find if we need to
> do this ignored-mask-clearing is to go through the mark lists
> and look for them. That mark list iteration requires an
> srcu_read_lock() which has a memory barrier and can be expensive.
>
> The calculation of 'has_ignore' is pretty cheap because we store
> it next to another value which we are updating and we do it
> inside of a loop we were already running.
>
> This patch will really only matter when we have a workload where
> a file is being modified often _and_ there is an active fsnotify
> mark on it. Otherwise the checks against *_fsnotify.marks.first
> will keep us out of the expensive srcu_read_lock() call.
So this is a nice optimization but definitely not worth growing struct
inode. We could use one bit in the mask itself for this though.
Honza
>
> Cc: Jan Kara <[email protected]>
> Cc: Alexander Viro <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: Paul E. McKenney <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: Dave Hansen <[email protected]>
> ---
>
> b/fs/notify/fsnotify.c | 44 ++++++++++++++++++++++++++++++++++------
> b/fs/notify/mark.c | 8 +++++--
> b/include/linux/fsnotify_head.h | 1
> 3 files changed, 45 insertions(+), 8 deletions(-)
>
> diff -puN fs/notify/fsnotify.c~fsnotify-ignore-present fs/notify/fsnotify.c
> --- a/fs/notify/fsnotify.c~fsnotify-ignore-present 2015-06-24 17:14:37.187226743 -0700
> +++ b/fs/notify/fsnotify.c 2015-06-24 17:14:37.194227057 -0700
> @@ -183,6 +183,34 @@ static int send_to_group(struct inode *t
> }
>
> /*
> + * The "logical or" of all of the marks' ->mask is kept in the
> + * i/mnt_fsnotify.mask. We can check it instead of going
> + * through all of the marks. fsnotify_recalc_mask() does the
> + * updates.
> + */
> +static int some_mark_is_interested(__u32 mask, struct inode *inode, struct mount *mnt)
> +{
> + if (mask & inode->i_fsnotify.mask)
> + return 1;
> + if (mnt && (mask & mnt->mnt_fsnotify.mask))
> + return 1;
> + return 0;
> +}
> +
> +/*
> + * fsnotify_recalc_mask() recalculates "has_ignore" whenever any
> + * mark's flags change.
> + */
> +static int some_mark_needs_ignore_clear(struct inode *inode, struct mount *mnt)
> +{
> + if (inode->i_fsnotify.has_ignore)
> + return 1;
> + if (mnt && mnt->mnt_fsnotify.has_ignore)
> + return 1;
> + return 0;
> +}
> +
> +/*
> * This is the main call to fsnotify. The VFS calls into hook specific functions
> * in linux/fsnotify.h. Those functions then in turn call here. Here will call
> * out to all of the registered fsnotify_group. Those groups can then use the
> @@ -205,14 +233,18 @@ int fsnotify(struct inode *to_tell, __u3
> mnt = NULL;
>
> /*
> - * if this is a modify event we may need to clear the ignored masks
> - * otherwise return if neither the inode nor the vfsmount care about
> - * this type of event.
> + * We must clear the (user-visible) ignored mask on the first IN_MODIFY
> + * event despite the 'mask' which is passed in here. But we can safely
> + * skip that step if we know there are no marks which need this action.
> + *
> + * We can also skip looking at the list of marks if we know that none
> + * of the marks are interested in the events in our 'mask'.
> */
> - if (!(mask & FS_MODIFY) &&
> - !(test_mask & to_tell->i_fsnotify.mask) &&
> - !(mnt && test_mask & mnt->mnt_fsnotify.mask))
> + if ((mask & FS_MODIFY) && !some_mark_needs_ignore_clear(to_tell, mnt))
> + return 0;
> + else if (!some_mark_is_interested(test_mask, to_tell, mnt))
> return 0;
> +
> /*
> * Optimization: srcu_read_lock() has a memory barrier which can
> * be expensive. It protects walking the *_fsnotify_marks lists.
> diff -puN fs/notify/mark.c~fsnotify-ignore-present fs/notify/mark.c
> --- a/fs/notify/mark.c~fsnotify-ignore-present 2015-06-24 17:14:37.189226832 -0700
> +++ b/fs/notify/mark.c 2015-06-24 17:14:37.194227057 -0700
> @@ -116,10 +116,14 @@ void fsnotify_recalc_mask(struct fsnotif
> {
> u32 new_mask = 0;
> struct fsnotify_mark *mark;
> + u32 has_ignore = 0;
>
> - hlist_for_each_entry(mark, &fsn->marks, obj_list)
> + hlist_for_each_entry(mark, &fsn->marks, obj_list) {
> + if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
> + has_ignore = 1;
> new_mask |= mark->mask;
> -
> + }
> + fsn->has_ignore = has_ignore;
> fsn->mask = new_mask;
> }
>
> diff -puN include/linux/fsnotify_head.h~fsnotify-ignore-present include/linux/fsnotify_head.h
> --- a/include/linux/fsnotify_head.h~fsnotify-ignore-present 2015-06-24 17:14:37.190226877 -0700
> +++ b/include/linux/fsnotify_head.h 2015-06-24 17:14:37.193227012 -0700
> @@ -11,6 +11,7 @@ struct fsnotify_head {
> #ifdef CONFIG_FSNOTIFY
> __u32 mask; /* all events this object cares about */
> struct hlist_head marks;
> +	__u32 has_ignore;	/* any mark has ignore set */
> #endif
> };
>
> _
--
Jan Kara <[email protected]>
SUSE Labs, CR