Hi Al,
The following patches are the inode_lock breakup series originally
derived from Nick Piggin's vfs-scale tree. I've kind of been sitting
on them until the dcache_lock breakup and rcu path-walk has had some
time to be shaken out. The patch set is pretty much unchanged from
the last round of review last year - all I've done to bring it
up to date is forward port it and run it through some testing on XFS
and ext4.
I know it's late in the .39 merge window, but I hope you'll consider
it if the patches are still acceptable(*). Otherwise I'm happy to take
the time to get it right for .40.
Cheers,
Dave.
(*) The series can also be found here:
git://git.kernel.org/pub/scm/linux/kernel/git/dgc/xfsdev.git inode-scale
Dave Chinner (8):
fs: protect inode->i_state with inode->i_lock
fs: factor inode disposal
fs: Lock the inode LRU list separately
fs: remove inode_lock from iput_final and prune_icache
fs: move i_sb_list out from under inode_lock
fs: move i_wb_list out from under inode_lock
fs: rename inode_lock to inode_hash_lock
fs: pull inode->i_lock up out of writeback_single_inode
Documentation/filesystems/Locking | 2 +-
Documentation/filesystems/porting | 16 +-
Documentation/filesystems/vfs.txt | 2 +-
fs/block_dev.c | 6 +-
fs/buffer.c | 2 +-
fs/drop_caches.c | 18 +-
fs/fs-writeback.c | 141 ++++++++-----
fs/inode.c | 416 +++++++++++++++++++++----------------
fs/internal.h | 7 +
fs/logfs/inode.c | 2 +-
fs/notify/inode_mark.c | 42 +++--
fs/notify/mark.c | 1 -
fs/notify/vfsmount_mark.c | 1 -
fs/ntfs/inode.c | 4 +-
fs/quota/dquot.c | 41 ++--
include/linux/fs.h | 2 +-
include/linux/quotaops.h | 2 +-
include/linux/writeback.h | 2 +-
mm/backing-dev.c | 8 +-
mm/filemap.c | 10 +-
mm/rmap.c | 5 +-
21 files changed, 434 insertions(+), 296 deletions(-)
From: Dave Chinner <[email protected]>
Protect the per-sb inode list with a new global lock
inode_sb_list_lock and use it to protect the list manipulations and
traversals. This lock replaces the inode_lock as the inodes on the
list can be validity checked while holding the inode->i_lock and
hence the inode_lock is no longer needed to protect the list.
Signed-off-by: Dave Chinner <[email protected]>
---
fs/drop_caches.c | 9 +++++----
fs/fs-writeback.c | 21 +++++++++++----------
fs/inode.c | 43 +++++++++++++++++++++++--------------------
fs/internal.h | 2 ++
fs/notify/inode_mark.c | 20 ++++++++++----------
fs/quota/dquot.c | 28 ++++++++++++++++------------
6 files changed, 67 insertions(+), 56 deletions(-)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 62dd8ee..86cd2f1 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,6 +8,7 @@
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
+#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
int sysctl_drop_caches;
@@ -16,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
struct inode *inode, *toput_inode = NULL;
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -26,13 +27,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
toput_inode = inode;
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
iput(toput_inode);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index efd1ebe..5de56a2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1123,7 +1123,7 @@ static void wait_sb_inodes(struct super_block *sb)
*/
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
/*
* Data integrity sync. Must wait for all pages under writeback,
@@ -1143,14 +1143,15 @@ static void wait_sb_inodes(struct super_block *sb)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
+
/*
- * We hold a reference to 'inode' so it couldn't have
- * been removed from s_inodes list while we dropped the
- * inode_lock. We cannot iput the inode now as we can
- * be holding the last reference and we cannot iput it
- * under inode_lock. So we keep the reference and iput
- * it later.
+ * We hold a reference to 'inode' so it couldn't have been
+ * removed from s_inodes list while we dropped the
+ * inode_sb_list_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it under
+ * inode_sb_list_lock. So we keep the reference and iput it
+ * later.
*/
iput(old_inode);
old_inode = inode;
@@ -1159,9 +1160,9 @@ static void wait_sb_inodes(struct super_block *sb)
cond_resched();
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
iput(old_inode);
}
diff --git a/fs/inode.c b/fs/inode.c
index 5ecd880..5483d38 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -33,10 +33,15 @@
* inode->i_state, inode->i_hash, __iget()
* inode_lru_lock protects:
* inode_lru, inode->i_lru
+ * inode_sb_list_lock protects:
+ * sb->s_inodes, inode->i_sb_list
*
* Lock ordering:
* inode_lock
* inode->i_lock
+ *
+ * inode_sb_list_lock
+ * inode->i_lock
* inode_lru_lock
*/
@@ -98,6 +103,8 @@ static struct hlist_head *inode_hashtable __read_mostly;
*/
DEFINE_SPINLOCK(inode_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+
/*
* iprune_sem provides exclusion between the icache shrinking and the
* umount path.
@@ -377,26 +384,23 @@ static void inode_lru_list_del(struct inode *inode)
spin_unlock(&inode_lru_lock);
}
-static inline void __inode_sb_list_add(struct inode *inode)
-{
- list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
-}
-
/**
* inode_sb_list_add - add inode to the superblock list of inodes
* @inode: inode to add
*/
void inode_sb_list_add(struct inode *inode)
{
- spin_lock(&inode_lock);
- __inode_sb_list_add(inode);
- spin_unlock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
+ list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+ spin_unlock(&inode_sb_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
-static inline void __inode_sb_list_del(struct inode *inode)
+static inline void inode_sb_list_del(struct inode *inode)
{
+ spin_lock(&inode_sb_list_lock);
list_del_init(&inode->i_sb_list);
+ spin_unlock(&inode_sb_list_lock);
}
static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -480,9 +484,10 @@ static void evict(struct inode *inode)
spin_lock(&inode_lock);
list_del_init(&inode->i_wb_list);
- __inode_sb_list_del(inode);
spin_unlock(&inode_lock);
+ inode_sb_list_del(inode);
+
if (op->evict_inode) {
op->evict_inode(inode);
} else {
@@ -538,7 +543,7 @@ void evict_inodes(struct super_block *sb)
struct inode *inode, *next;
LIST_HEAD(dispose);
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
@@ -554,7 +559,7 @@ void evict_inodes(struct super_block *sb)
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
dispose_list(&dispose);
@@ -583,7 +588,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
struct inode *inode, *next;
LIST_HEAD(dispose);
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
@@ -606,7 +611,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
dispose_list(&dispose);
@@ -866,16 +871,14 @@ struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
- spin_lock_prefetch(&inode_lock);
+ spin_lock_prefetch(&inode_sb_list_lock);
inode = alloc_inode(sb);
if (inode) {
- spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
- __inode_sb_list_add(inode);
- spin_unlock(&inode_lock);
+ inode_sb_list_add(inode);
}
return inode;
}
@@ -944,7 +947,7 @@ static struct inode *get_new_inode(struct super_block *sb,
inode->i_state = I_NEW;
hlist_add_head(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- __inode_sb_list_add(inode);
+ inode_sb_list_add(inode);
spin_unlock(&inode_lock);
/* Return the locked inode with I_NEW set, the
@@ -993,7 +996,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
inode->i_state = I_NEW;
hlist_add_head(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- __inode_sb_list_add(inode);
+ inode_sb_list_add(inode);
spin_unlock(&inode_lock);
/* Return the locked inode with I_NEW set, the
diff --git a/fs/internal.h b/fs/internal.h
index 8318059..7013ae0 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -125,6 +125,8 @@ extern long do_handle_open(int mountdirfd,
/*
* inode.c
*/
+extern spinlock_t inode_sb_list_lock;
+
extern int get_nr_dirty_inodes(void);
extern void evict_inodes(struct super_block *);
extern int invalidate_inodes(struct super_block *, bool);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4dd53fb..fb3b3c5 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -29,6 +29,8 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
+#include "../internal.h"
+
/*
* Recalculate the mask of events relevant to a given inode locked.
*/
@@ -237,15 +239,14 @@ out:
* fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
* @list: list of inodes being unmounted (sb->s_inodes)
*
- * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
- * We temporarily drop inode_lock, however, and CAN block.
+ * Called during unmount with no locks held, so needs to be safe against
+ * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
*/
void fsnotify_unmount_inodes(struct list_head *list)
{
struct inode *inode, *next_i, *need_iput = NULL;
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
struct inode *need_iput_tmp;
@@ -293,12 +294,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
}
/*
- * We can safely drop inode_lock here because we hold
+ * We can safely drop inode_sb_list_lock here because we hold
* references on both inode and next_i. Also no new inodes
- * will be added since the umount has begun. Finally,
- * iprune_mutex keeps shrink_icache_memory() away.
+ * will be added since the umount has begun.
*/
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
if (need_iput_tmp)
iput(need_iput_tmp);
@@ -310,7 +310,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
iput(inode);
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a1470fd..fcc8ae7 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -76,7 +76,7 @@
#include <linux/buffer_head.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
-#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
+#include "../internal.h" /* ugh */
#include <asm/uaccess.h>
@@ -900,7 +900,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
int reserved = 0;
#endif
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -915,19 +915,23 @@ static void add_dquot_ref(struct super_block *sb, int type)
#endif
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
iput(old_inode);
__dquot_initialize(inode, type);
- /* We hold a reference to 'inode' so it couldn't have been
- * removed from s_inodes list while we dropped the inode_lock.
- * We cannot iput the inode now as we can be holding the last
- * reference and we cannot iput it under inode_lock. So we
- * keep the reference and iput it later. */
+
+ /*
+ * We hold a reference to 'inode' so it couldn't have been
+ * removed from s_inodes list while we dropped the
+ * inode_sb_list_lock. We cannot iput the inode now as we can be
+ * holding the last reference and we cannot iput it under
+ * inode_sb_list_lock. So we keep the reference and iput it
+ * later.
+ */
old_inode = inode;
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
iput(old_inode);
#ifdef CONFIG_QUOTA_DEBUG
@@ -1008,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
struct inode *inode;
int reserved = 0;
- spin_lock(&inode_lock);
+ spin_lock(&inode_sb_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
/*
* We have to scan also I_NEW inodes because they can already
@@ -1022,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
remove_inode_dquot_ref(inode, type, tofree_head);
}
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_sb_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
if (reserved) {
printk(KERN_WARNING "VFS (%s): Writes happened after quota"
--
1.7.2.3
From: Dave Chinner <[email protected]>
Now that inode state changes are protected by the inode->i_lock and
the inode LRU manipulations by the inode_lru_lock, we can remove the
inode_lock from prune_icache and the initial part of iput_final().
Instead of using the inode_lock to protect the inode during
iput_final, use the inode->i_lock. This protects the inode
against new references being taken while we change the inode state
to I_FREEING, as well as preventing prune_icache from grabbing the
inode while we are manipulating it. Hence we no longer need the
inode_lock in iput_final prior to setting I_FREEING on the inode.
For prune_icache, we no longer need the inode_lock to protect the
LRU list, and the inodes themselves are protected against freeing
races by the inode->i_lock. Hence we can lift the inode_lock from
prune_icache as well.
Signed-off-by: Dave Chinner <[email protected]>
---
Documentation/filesystems/Locking | 2 +-
Documentation/filesystems/porting | 16 +++++++++++-----
Documentation/filesystems/vfs.txt | 2 +-
fs/inode.c | 17 +++--------------
fs/logfs/inode.c | 2 +-
5 files changed, 17 insertions(+), 22 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 2e994ef..61b31ac 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -128,7 +128,7 @@ alloc_inode:
destroy_inode:
dirty_inode: (must not sleep)
write_inode:
-drop_inode: !!!inode_lock!!!
+drop_inode: !!!inode->i_lock!!!
evict_inode:
put_super: write
write_super: read
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 0c986c9..6e29954 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -298,11 +298,14 @@ be used instead. It gets called whenever the inode is evicted, whether it has
remaining links or not. Caller does *not* evict the pagecache or inode-associated
metadata buffers; getting rid of those is responsibility of method, as it had
been for ->delete_inode().
- ->drop_inode() returns int now; it's called on final iput() with inode_lock
-held and it returns true if filesystems wants the inode to be dropped. As before,
-generic_drop_inode() is still the default and it's been updated appropriately.
-generic_delete_inode() is also alive and it consists simply of return 1. Note that
-all actual eviction work is done by caller after ->drop_inode() returns.
+
+ ->drop_inode() returns int now; it's called on final iput() with
+inode->i_lock held and it returns true if the filesystem wants the inode to be
+dropped. As before, generic_drop_inode() is still the default and it's been
+updated appropriately. generic_delete_inode() is also alive and it consists
+simply of return 1. Note that all actual eviction work is done by caller after
+->drop_inode() returns.
+
clear_inode() is gone; use end_writeback() instead. As before, it must
be called exactly once on each call of ->evict_inode() (as it used to be for
each call of ->delete_inode()). Unlike before, if you are using inode-associated
@@ -397,6 +400,9 @@ a file off.
--
[mandatory]
+
+--
+[mandatory]
->get_sb() is gone. Switch to use of ->mount(). Typically it's just
a matter of switching from calling get_sb_... to mount_... and changing the
function type. If you were doing it manually, just switch from setting ->mnt_root
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 306f0ae..80815ed 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -254,7 +254,7 @@ or bottom half).
should be synchronous or not, not all filesystems check this flag.
drop_inode: called when the last access to the inode is dropped,
- with the inode_lock spinlock held.
+ with the inode->i_lock spinlock held.
This method should be either NULL (normal UNIX filesystem
semantics) or "generic_delete_inode" (for filesystems that do not
diff --git a/fs/inode.c b/fs/inode.c
index f6e6e37..5ecd880 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -649,7 +649,6 @@ static void prune_icache(int nr_to_scan)
unsigned long reap = 0;
down_read(&iprune_sem);
- spin_lock(&inode_lock);
spin_lock(&inode_lru_lock);
for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
struct inode *inode;
@@ -675,8 +674,8 @@ static void prune_icache(int nr_to_scan)
*/
if (atomic_read(&inode->i_count) ||
(inode->i_state & ~I_REFERENCED)) {
- spin_unlock(&inode->i_lock);
list_del_init(&inode->i_lru);
+ spin_unlock(&inode->i_lock);
inodes_stat.nr_unused--;
continue;
}
@@ -684,20 +683,18 @@ static void prune_icache(int nr_to_scan)
/* recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
- spin_unlock(&inode->i_lock);
list_move(&inode->i_lru, &inode_lru);
+ spin_unlock(&inode->i_lock);
continue;
}
if (inode_has_buffers(inode) || inode->i_data.nrpages) {
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_lru_lock);
- spin_unlock(&inode_lock);
if (remove_inode_buffers(inode))
reap += invalidate_mapping_pages(&inode->i_data,
0, -1);
iput(inode);
- spin_lock(&inode_lock);
spin_lock(&inode_lru_lock);
if (inode != list_entry(inode_lru.next,
@@ -723,7 +720,6 @@ static void prune_icache(int nr_to_scan)
else
__count_vm_events(PGINODESTEAL, reap);
spin_unlock(&inode_lru_lock);
- spin_unlock(&inode_lock);
dispose_list(&freeable);
up_read(&iprune_sem);
@@ -1081,7 +1077,6 @@ EXPORT_SYMBOL(iunique);
struct inode *igrab(struct inode *inode)
{
- spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
__iget(inode);
@@ -1095,7 +1090,6 @@ struct inode *igrab(struct inode *inode)
*/
inode = NULL;
}
- spin_unlock(&inode_lock);
return inode;
}
EXPORT_SYMBOL(igrab);
@@ -1438,7 +1432,6 @@ static void iput_final(struct inode *inode)
const struct super_operations *op = inode->i_sb->s_op;
int drop;
- spin_lock(&inode->i_lock);
WARN_ON(inode->i_state & I_NEW);
if (op && op->drop_inode)
@@ -1451,16 +1444,13 @@ static void iput_final(struct inode *inode)
if (!(inode->i_state & (I_DIRTY|I_SYNC)))
inode_lru_list_add(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
return;
}
if (!drop) {
inode->i_state |= I_WILL_FREE;
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
write_inode_now(inode, 1);
- spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
WARN_ON(inode->i_state & I_NEW);
inode->i_state &= ~I_WILL_FREE;
@@ -1469,7 +1459,6 @@ static void iput_final(struct inode *inode)
inode->i_state |= I_FREEING;
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
evict(inode);
}
@@ -1488,7 +1477,7 @@ void iput(struct inode *inode)
if (inode) {
BUG_ON(inode->i_state & I_CLEAR);
- if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
iput_final(inode);
}
}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 03b8c24..edfea7a 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
-/* called with inode_lock held */
+/* called with inode->i_lock held */
static int logfs_drop_inode(struct inode *inode)
{
struct logfs_super *super = logfs_super(inode->i_sb);
--
1.7.2.3
From: Dave Chinner <[email protected]>
All that remains of the inode_lock is protecting the inode hash list
manipulation and traversals. Rename the inode_lock to
inode_hash_lock to reflect its actual function.
Signed-off-by: Dave Chinner <[email protected]>
---
fs/inode.c | 111 +++++++++++++++++++++++++--------------------
fs/notify/inode_mark.c | 1 -
fs/notify/mark.c | 1 -
fs/notify/vfsmount_mark.c | 1 -
fs/ntfs/inode.c | 4 +-
include/linux/writeback.h | 1 -
6 files changed, 63 insertions(+), 56 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 5a7f8ef..730ddd6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -38,10 +38,10 @@
* sb->s_inodes, inode->i_sb_list
* inode_wb_list_lock protects:
* bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ * inode_hash_lock protects:
+ * inode_hashtable, inode->i_hash
*
* Lock ordering:
- * inode_lock
- * inode->i_lock
*
* inode_sb_list_lock
* inode->i_lock
@@ -49,6 +49,13 @@
*
* inode_wb_list_lock
* inode->i_lock
+ *
+ * inode_hash_lock
+ * inode_sb_list_lock
+ * inode->i_lock
+ *
+ * iunique_lock
+ * inode_hash_lock
*/
/*
@@ -84,6 +91,8 @@
static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
+static struct hlist_head *inode_hashtable __read_mostly;
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
/*
* Each inode can be on two separate lists. One is
@@ -99,15 +108,6 @@ static unsigned int i_hash_shift __read_mostly;
static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);
-static struct hlist_head *inode_hashtable __read_mostly;
-
-/*
- * A simple spinlock to protect the list manipulations.
- *
- * NOTE! You also have to own the lock if you change
- * the i_state of an inode while it is in use..
- */
-DEFINE_SPINLOCK(inode_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
@@ -432,11 +432,11 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
spin_lock(&inode->i_lock);
hlist_add_head(&inode->i_hash, b);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);
@@ -448,11 +448,11 @@ EXPORT_SYMBOL(__insert_inode_hash);
*/
void remove_inode_hash(struct inode *inode)
{
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
spin_lock(&inode->i_lock);
hlist_del_init(&inode->i_hash);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(remove_inode_hash);
@@ -777,11 +777,15 @@ static struct inode *find_inode(struct super_block *sb,
repeat:
hlist_for_each_entry(inode, node, head, i_hash) {
- if (inode->i_sb != sb)
+ spin_lock(&inode->i_lock);
+ if (inode->i_sb != sb) {
+ spin_unlock(&inode->i_lock);
continue;
- if (!test(inode, data))
+ }
+ if (!test(inode, data)) {
+ spin_unlock(&inode->i_lock);
continue;
- spin_lock(&inode->i_lock);
+ }
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
@@ -805,11 +809,15 @@ static struct inode *find_inode_fast(struct super_block *sb,
repeat:
hlist_for_each_entry(inode, node, head, i_hash) {
- if (inode->i_ino != ino)
+ spin_lock(&inode->i_lock);
+ if (inode->i_ino != ino) {
+ spin_unlock(&inode->i_lock);
continue;
- if (inode->i_sb != sb)
+ }
+ if (inode->i_sb != sb) {
+ spin_unlock(&inode->i_lock);
continue;
- spin_lock(&inode->i_lock);
+ }
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
@@ -923,7 +931,7 @@ void unlock_new_inode(struct inode *inode)
EXPORT_SYMBOL(unlock_new_inode);
/*
- * This is called without the inode lock held.. Be careful.
+ * This is called without the inode hash lock held.. Be careful.
*
* We no longer cache the sb_flags in i_flags - see fs.h
* -- [email protected]
@@ -940,7 +948,7 @@ static struct inode *get_new_inode(struct super_block *sb,
if (inode) {
struct inode *old;
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
old = find_inode(sb, head, test, data);
if (!old) {
@@ -952,7 +960,7 @@ static struct inode *get_new_inode(struct super_block *sb,
hlist_add_head(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
inode_sb_list_add(inode);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
/* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
@@ -965,7 +973,7 @@ static struct inode *get_new_inode(struct super_block *sb,
* us. Use the old inode instead of the one we just
* allocated.
*/
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
destroy_inode(inode);
inode = old;
wait_on_inode(inode);
@@ -973,7 +981,7 @@ static struct inode *get_new_inode(struct super_block *sb,
return inode;
set_failed:
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
destroy_inode(inode);
return NULL;
}
@@ -991,7 +999,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
if (inode) {
struct inode *old;
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
old = find_inode_fast(sb, head, ino);
if (!old) {
@@ -1001,7 +1009,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
hlist_add_head(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
inode_sb_list_add(inode);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
/* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
@@ -1014,7 +1022,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
* us. Use the old inode instead of the one we just
* allocated.
*/
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
destroy_inode(inode);
inode = old;
wait_on_inode(inode);
@@ -1035,10 +1043,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
struct hlist_node *node;
struct inode *inode;
+ spin_lock(&inode_hash_lock);
hlist_for_each_entry(inode, node, b, i_hash) {
- if (inode->i_ino == ino && inode->i_sb == sb)
+ if (inode->i_ino == ino && inode->i_sb == sb) {
+ spin_unlock(&inode_hash_lock);
return 0;
+ }
}
+ spin_unlock(&inode_hash_lock);
return 1;
}
@@ -1068,7 +1080,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
static unsigned int counter;
ino_t res;
- spin_lock(&inode_lock);
spin_lock(&iunique_lock);
do {
if (counter <= max_reserved)
@@ -1076,7 +1087,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
res = counter++;
} while (!test_inode_iunique(sb, res));
spin_unlock(&iunique_lock);
- spin_unlock(&inode_lock);
return res;
}
@@ -1118,7 +1128,7 @@ EXPORT_SYMBOL(igrab);
*
* Otherwise NULL is returned.
*
- * Note, @test is called with the inode_lock held, so can't sleep.
+ * Note, @test is called with the inode_hash_lock held, so can't sleep.
*/
static struct inode *ifind(struct super_block *sb,
struct hlist_head *head, int (*test)(struct inode *, void *),
@@ -1126,15 +1136,15 @@ static struct inode *ifind(struct super_block *sb,
{
struct inode *inode;
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
inode = find_inode(sb, head, test, data);
if (inode) {
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
if (likely(wait))
wait_on_inode(inode);
return inode;
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
return NULL;
}
@@ -1158,14 +1168,14 @@ static struct inode *ifind_fast(struct super_block *sb,
{
struct inode *inode;
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
if (inode) {
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
wait_on_inode(inode);
return inode;
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
return NULL;
}
@@ -1188,7 +1198,7 @@ static struct inode *ifind_fast(struct super_block *sb,
*
* Otherwise NULL is returned.
*
- * Note, @test is called with the inode_lock held, so can't sleep.
+ * Note, @test is called with the inode_hash_lock held, so can't sleep.
*/
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
@@ -1216,7 +1226,7 @@ EXPORT_SYMBOL(ilookup5_nowait);
*
* Otherwise NULL is returned.
*
- * Note, @test is called with the inode_lock held, so can't sleep.
+ * Note, @test is called with the inode_hash_lock held, so can't sleep.
*/
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
@@ -1267,7 +1277,8 @@ EXPORT_SYMBOL(ilookup);
* inode and this is returned locked, hashed, and with the I_NEW flag set. The
* file system gets to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ * Note both @test and @set are called with the inode_hash_lock held, so can't
+ * sleep.
*/
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -1327,7 +1338,7 @@ int insert_inode_locked(struct inode *inode)
while (1) {
struct hlist_node *node;
struct inode *old = NULL;
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
hlist_for_each_entry(old, node, head, i_hash) {
if (old->i_ino != ino)
continue;
@@ -1345,12 +1356,12 @@ int insert_inode_locked(struct inode *inode)
inode->i_state |= I_NEW;
hlist_add_head(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
return 0;
}
__iget(old);
spin_unlock(&old->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
wait_on_inode(old);
if (unlikely(!inode_unhashed(old))) {
iput(old);
@@ -1371,7 +1382,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
struct hlist_node *node;
struct inode *old = NULL;
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
hlist_for_each_entry(old, node, head, i_hash) {
if (old->i_sb != sb)
continue;
@@ -1389,12 +1400,12 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
inode->i_state |= I_NEW;
hlist_add_head(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
return 0;
}
__iget(old);
spin_unlock(&old->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
wait_on_inode(old);
if (unlikely(!inode_unhashed(old))) {
iput(old);
@@ -1673,10 +1684,10 @@ static void __wait_on_freeing_inode(struct inode *inode)
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_hash_lock);
schedule();
finish_wait(wq, &wait.wait);
- spin_lock(&inode_lock);
+ spin_lock(&inode_hash_lock);
}
static __initdata unsigned long ihash_entries;
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index fb3b3c5..07ea8d3 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -22,7 +22,6 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
#include <asm/atomic.h>
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e..50c0085 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,6 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
-#include <linux/writeback.h> /* for inode_lock */
#include <asm/atomic.h>
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff..e86577d 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
#include <asm/atomic.h>
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed8..0b56c6b 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
*
* Return 1 if the attributes match and 0 if not.
*
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
* allowed to sleep.
*/
int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
*
* Return 0 on success and -errno on error.
*
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
* allowed to sleep. (Hence the GFP_ATOMIC allocation.)
*/
static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3f5fee7..17e7ccc 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -9,7 +9,6 @@
struct backing_dev_info;
-extern spinlock_t inode_lock;
extern spinlock_t inode_wb_list_lock;
/*
--
1.7.2.3
From: Dave Chinner <[email protected]>
Protect inode state transitions and validity checks with the
inode->i_lock. This enables us to make inode state transitions
independently of the inode_lock and is the first step to peeling
away the inode_lock from the code.
This requires that __iget() is done atomically with i_state checks
during list traversals so that we don't race with another thread
marking the inode I_FREEING between the state check and grabbing the
reference.
Also remove the unlock_new_inode() memory barrier optimisation
required to avoid taking the inode_lock when clearing I_NEW.
Simplify the code by taking the inode->i_lock around the state change
and wakeup. Because the wakeup is no longer tricky,
remove the wake_up_inode() function and open code the wakeup where
necessary.
Signed-off-by: Dave Chinner <[email protected]>
---
fs/block_dev.c | 2 +
fs/buffer.c | 2 +-
fs/drop_caches.c | 9 ++-
fs/fs-writeback.c | 44 ++++++++++---
fs/inode.c | 150 ++++++++++++++++++++++++++++++++--------------
fs/notify/inode_mark.c | 21 +++++--
fs/quota/dquot.c | 13 ++--
include/linux/fs.h | 2 +-
include/linux/quotaops.h | 2 +-
mm/filemap.c | 2 +
mm/rmap.c | 1 +
11 files changed, 174 insertions(+), 74 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8892870..bc39b18 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -56,9 +56,11 @@ static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
spin_lock(&inode_lock);
+ spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY)
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76..da666f3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1144,7 +1144,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * mapping->tree_lock and mapping->host->i_lock.
*/
void mark_buffer_dirty(struct buffer_head *bh)
{
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2195c21..62dd8ee 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,11 +18,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
spin_lock(&inode_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
- continue;
- if (inode->i_mapping->nrpages == 0)
+ spin_lock(&inode->i_lock);
+ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ (inode->i_mapping->nrpages == 0)) {
+ spin_unlock(&inode->i_lock);
continue;
+ }
__iget(inode);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c6e49..efd1ebe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -306,10 +306,12 @@ static void inode_wait_for_writeback(struct inode *inode)
wait_queue_head_t *wqh;
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
- while (inode->i_state & I_SYNC) {
+ while (inode->i_state & I_SYNC) {
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
spin_lock(&inode_lock);
+ spin_lock(&inode->i_lock);
}
}
@@ -333,6 +335,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
unsigned dirty;
int ret;
+ spin_lock(&inode->i_lock);
if (!atomic_read(&inode->i_count))
WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
else
@@ -348,6 +351,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* completed a full scan of b_io.
*/
if (wbc->sync_mode != WB_SYNC_ALL) {
+ spin_unlock(&inode->i_lock);
requeue_io(inode);
return 0;
}
@@ -363,6 +367,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/* Set I_SYNC, reset I_DIRTY_PAGES */
inode->i_state |= I_SYNC;
inode->i_state &= ~I_DIRTY_PAGES;
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
ret = do_writepages(mapping, wbc);
@@ -384,8 +389,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* write_inode()
*/
spin_lock(&inode_lock);
+ spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
@@ -395,6 +402,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
}
spin_lock(&inode_lock);
+ spin_lock(&inode->i_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & I_FREEING)) {
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -436,6 +444,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
}
}
inode_sync_complete(inode);
+ spin_unlock(&inode->i_lock);
return ret;
}
@@ -506,7 +515,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
* kind does not need peridic writeout yet, and for the latter
* kind writeout is handled by the freer.
*/
+ spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ spin_unlock(&inode->i_lock);
requeue_io(inode);
continue;
}
@@ -515,10 +526,14 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
*/
- if (inode_dirtied_after(inode, wbc->wb_start))
+ if (inode_dirtied_after(inode, wbc->wb_start)) {
+ spin_unlock(&inode->i_lock);
return 1;
+ }
__iget(inode);
+ spin_unlock(&inode->i_lock);
+
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
if (wbc->pages_skipped != pages_skipped) {
@@ -724,7 +739,9 @@ static long wb_writeback(struct bdi_writeback *wb,
if (!list_empty(&wb->b_more_io)) {
inode = wb_inode(wb->b_more_io.prev);
trace_wbc_writeback_wait(&wbc, wb->bdi);
+ spin_lock(&inode->i_lock);
inode_wait_for_writeback(inode);
+ spin_unlock(&inode->i_lock);
}
spin_unlock(&inode_lock);
}
@@ -1017,6 +1034,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
block_dump___mark_inode_dirty(inode);
spin_lock(&inode_lock);
+ spin_lock(&inode->i_lock);
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
@@ -1028,7 +1046,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* superblock list, based upon its state.
*/
if (inode->i_state & I_SYNC)
- goto out;
+ goto out_unlock_inode;
/*
* Only add valid (hashed) inodes to the superblock's
@@ -1036,11 +1054,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (!S_ISBLK(inode->i_mode)) {
if (inode_unhashed(inode))
- goto out;
+ goto out_unlock_inode;
}
if (inode->i_state & I_FREEING)
- goto out;
+ goto out_unlock_inode;
+ spin_unlock(&inode->i_lock);
/*
* If the inode was already on b_dirty/b_io/b_more_io, don't
* reposition it (that would break b_dirty time-ordering).
@@ -1065,7 +1084,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
}
+ goto out;
}
+out_unlock_inode:
+ spin_unlock(&inode->i_lock);
out:
spin_unlock(&inode_lock);
@@ -1111,14 +1133,16 @@ static void wait_sb_inodes(struct super_block *sb)
* we still have to wait for that writeout.
*/
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- struct address_space *mapping;
+ struct address_space *mapping = inode->i_mapping;
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
- continue;
- mapping = inode->i_mapping;
- if (mapping->nrpages == 0)
+ spin_lock(&inode->i_lock);
+ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ (mapping->nrpages == 0)) {
+ spin_unlock(&inode->i_lock);
continue;
+ }
__iget(inode);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
/*
* We hold a reference to 'inode' so it couldn't have
diff --git a/fs/inode.c b/fs/inode.c
index 16fefd3..bd5a237 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -27,6 +27,17 @@
#include <linux/ima.h>
/*
+ * inode locking rules.
+ *
+ * inode->i_lock protects:
+ * inode->i_state, inode->i_hash, __iget()
+ *
+ * Lock ordering:
+ * inode_lock
+ * inode->i_lock
+ */
+
+/*
* This is needed for the following functions:
* - inode_has_buffers
* - invalidate_bdev
@@ -136,15 +147,6 @@ int proc_nr_inodes(ctl_table *table, int write,
}
#endif
-static void wake_up_inode(struct inode *inode)
-{
- /*
- * Prevent speculative execution through spin_unlock(&inode_lock);
- */
- smp_mb();
- wake_up_bit(&inode->i_state, __I_NEW);
-}
-
/**
* inode_init_always - perform inode structure intialisation
* @sb: superblock inode belongs to
@@ -335,7 +337,7 @@ static void init_once(void *foo)
}
/*
- * inode_lock must be held
+ * inode->i_lock must be held
*/
void __iget(struct inode *inode)
{
@@ -412,7 +414,9 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
spin_lock(&inode_lock);
+ spin_lock(&inode->i_lock);
hlist_add_head(&inode->i_hash, b);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);
@@ -437,7 +441,9 @@ static void __remove_inode_hash(struct inode *inode)
void remove_inode_hash(struct inode *inode)
{
spin_lock(&inode_lock);
+ spin_lock(&inode->i_lock);
hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
}
EXPORT_SYMBOL(remove_inode_hash);
@@ -494,7 +500,9 @@ static void dispose_list(struct list_head *head)
__inode_sb_list_del(inode);
spin_unlock(&inode_lock);
- wake_up_inode(inode);
+ spin_lock(&inode->i_lock);
+ wake_up_bit(&inode->i_state, __I_NEW);
+ spin_unlock(&inode->i_lock);
destroy_inode(inode);
}
}
@@ -517,10 +525,17 @@ void evict_inodes(struct super_block *sb)
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ spin_unlock(&inode->i_lock);
continue;
+ }
inode->i_state |= I_FREEING;
+ if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+ inodes_stat.nr_unused--;
+ spin_unlock(&inode->i_lock);
/*
* Move the inode off the IO lists and LRU once I_FREEING is
@@ -528,8 +543,6 @@ void evict_inodes(struct super_block *sb)
*/
list_move(&inode->i_lru, &dispose);
list_del_init(&inode->i_wb_list);
- if (!(inode->i_state & (I_DIRTY | I_SYNC)))
- inodes_stat.nr_unused--;
}
spin_unlock(&inode_lock);
@@ -562,18 +575,26 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
spin_lock(&inode_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ spin_unlock(&inode->i_lock);
continue;
+ }
if (inode->i_state & I_DIRTY && !kill_dirty) {
+ spin_unlock(&inode->i_lock);
busy = 1;
continue;
}
if (atomic_read(&inode->i_count)) {
+ spin_unlock(&inode->i_lock);
busy = 1;
continue;
}
inode->i_state |= I_FREEING;
+ if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+ inodes_stat.nr_unused--;
+ spin_unlock(&inode->i_lock);
/*
* Move the inode off the IO lists and LRU once I_FREEING is
@@ -581,8 +602,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
*/
list_move(&inode->i_lru, &dispose);
list_del_init(&inode->i_wb_list);
- if (!(inode->i_state & (I_DIRTY | I_SYNC)))
- inodes_stat.nr_unused--;
}
spin_unlock(&inode_lock);
@@ -640,8 +659,10 @@ static void prune_icache(int nr_to_scan)
* Referenced or dirty inodes are still in use. Give them
* another pass through the LRU as we canot reclaim them now.
*/
+ spin_lock(&inode->i_lock);
if (atomic_read(&inode->i_count) ||
(inode->i_state & ~I_REFERENCED)) {
+ spin_unlock(&inode->i_lock);
list_del_init(&inode->i_lru);
inodes_stat.nr_unused--;
continue;
@@ -649,12 +670,14 @@ static void prune_icache(int nr_to_scan)
/* recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
- list_move(&inode->i_lru, &inode_lru);
inode->i_state &= ~I_REFERENCED;
+ spin_unlock(&inode->i_lock);
+ list_move(&inode->i_lru, &inode_lru);
continue;
}
if (inode_has_buffers(inode) || inode->i_data.nrpages) {
__iget(inode);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
if (remove_inode_buffers(inode))
reap += invalidate_mapping_pages(&inode->i_data,
@@ -665,11 +688,15 @@ static void prune_icache(int nr_to_scan)
if (inode != list_entry(inode_lru.next,
struct inode, i_lru))
continue; /* wrong inode or list_empty */
- if (!can_unuse(inode))
+ spin_lock(&inode->i_lock);
+ if (!can_unuse(inode)) {
+ spin_unlock(&inode->i_lock);
continue;
+ }
}
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
+ spin_unlock(&inode->i_lock);
/*
* Move the inode off the IO lists and LRU once I_FREEING is
@@ -736,11 +763,13 @@ repeat:
continue;
if (!test(inode, data))
continue;
+ spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
}
__iget(inode);
+ spin_unlock(&inode->i_lock);
return inode;
}
return NULL;
@@ -762,11 +791,13 @@ repeat:
continue;
if (inode->i_sb != sb)
continue;
+ spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
}
__iget(inode);
+ spin_unlock(&inode->i_lock);
return inode;
}
return NULL;
@@ -831,14 +862,23 @@ struct inode *new_inode(struct super_block *sb)
inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode_lock);
- __inode_sb_list_add(inode);
+ spin_lock(&inode->i_lock);
inode->i_state = 0;
+ spin_unlock(&inode->i_lock);
+ __inode_sb_list_add(inode);
spin_unlock(&inode_lock);
}
return inode;
}
EXPORT_SYMBOL(new_inode);
+/**
+ * unlock_new_inode - clear the I_NEW state and wake up any waiters
+ * @inode: new inode to unlock
+ *
+ * Called when the inode is fully initialised to clear the new state of the
+ * inode and wake up anyone waiting for the inode to finish initialisation.
+ */
void unlock_new_inode(struct inode *inode)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -858,19 +898,11 @@ void unlock_new_inode(struct inode *inode)
}
}
#endif
- /*
- * This is special! We do not need the spinlock when clearing I_NEW,
- * because we're guaranteed that nobody else tries to do anything about
- * the state of the inode when it is locked, as we just created it (so
- * there can be no old holders that haven't tested I_NEW).
- * However we must emit the memory barrier so that other CPUs reliably
- * see the clearing of I_NEW after the other inode initialisation has
- * completed.
- */
- smp_mb();
+ spin_lock(&inode->i_lock);
WARN_ON(!(inode->i_state & I_NEW));
inode->i_state &= ~I_NEW;
- wake_up_inode(inode);
+ wake_up_bit(&inode->i_state, __I_NEW);
+ spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);
@@ -899,9 +931,11 @@ static struct inode *get_new_inode(struct super_block *sb,
if (set(inode, data))
goto set_failed;
+ spin_lock(&inode->i_lock);
+ inode->i_state = I_NEW;
hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode->i_lock);
__inode_sb_list_add(inode);
- inode->i_state = I_NEW;
spin_unlock(&inode_lock);
/* Return the locked inode with I_NEW set, the
@@ -946,9 +980,11 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
old = find_inode_fast(sb, head, ino);
if (!old) {
inode->i_ino = ino;
+ spin_lock(&inode->i_lock);
+ inode->i_state = I_NEW;
hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode->i_lock);
__inode_sb_list_add(inode);
- inode->i_state = I_NEW;
spin_unlock(&inode_lock);
/* Return the locked inode with I_NEW set, the
@@ -1033,15 +1069,19 @@ EXPORT_SYMBOL(iunique);
struct inode *igrab(struct inode *inode)
{
spin_lock(&inode_lock);
- if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
__iget(inode);
- else
+ spin_unlock(&inode->i_lock);
+ } else {
+ spin_unlock(&inode->i_lock);
/*
* Handle the case where s_op->clear_inode is not been
* called yet, and somebody is calling igrab
* while the inode is getting freed.
*/
inode = NULL;
+ }
spin_unlock(&inode_lock);
return inode;
}
@@ -1270,7 +1310,6 @@ int insert_inode_locked(struct inode *inode)
ino_t ino = inode->i_ino;
struct hlist_head *head = inode_hashtable + hash(sb, ino);
- inode->i_state |= I_NEW;
while (1) {
struct hlist_node *node;
struct inode *old = NULL;
@@ -1280,16 +1319,23 @@ int insert_inode_locked(struct inode *inode)
continue;
if (old->i_sb != sb)
continue;
- if (old->i_state & (I_FREEING|I_WILL_FREE))
+ spin_lock(&old->i_lock);
+ if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+ spin_unlock(&old->i_lock);
continue;
+ }
break;
}
if (likely(!node)) {
+ spin_lock(&inode->i_lock);
+ inode->i_state |= I_NEW;
hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
return 0;
}
__iget(old);
+ spin_unlock(&old->i_lock);
spin_unlock(&inode_lock);
wait_on_inode(old);
if (unlikely(!inode_unhashed(old))) {
@@ -1307,8 +1353,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
struct super_block *sb = inode->i_sb;
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
- inode->i_state |= I_NEW;
-
while (1) {
struct hlist_node *node;
struct inode *old = NULL;
@@ -1319,16 +1363,23 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
continue;
if (!test(old, data))
continue;
- if (old->i_state & (I_FREEING|I_WILL_FREE))
+ spin_lock(&old->i_lock);
+ if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+ spin_unlock(&old->i_lock);
continue;
+ }
break;
}
if (likely(!node)) {
+ spin_lock(&inode->i_lock);
+ inode->i_state |= I_NEW;
hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
return 0;
}
__iget(old);
+ spin_unlock(&old->i_lock);
spin_unlock(&inode_lock);
wait_on_inode(old);
if (unlikely(!inode_unhashed(old))) {
@@ -1374,6 +1425,9 @@ static void iput_final(struct inode *inode)
const struct super_operations *op = inode->i_sb->s_op;
int drop;
+ spin_lock(&inode->i_lock);
+ WARN_ON(inode->i_state & I_NEW);
+
if (op && op->drop_inode)
drop = op->drop_inode(inode);
else
@@ -1385,21 +1439,23 @@ static void iput_final(struct inode *inode)
if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
inode_lru_list_add(inode);
}
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
return;
}
- WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_WILL_FREE;
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
write_inode_now(inode, 1);
spin_lock(&inode_lock);
+ spin_lock(&inode->i_lock);
WARN_ON(inode->i_state & I_NEW);
inode->i_state &= ~I_WILL_FREE;
__remove_inode_hash(inode);
}
- WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
+ spin_unlock(&inode->i_lock);
/*
* Move the inode off the IO lists and LRU once I_FREEING is
@@ -1412,8 +1468,10 @@ static void iput_final(struct inode *inode)
spin_unlock(&inode_lock);
evict(inode);
remove_inode_hash(inode);
- wake_up_inode(inode);
+ spin_lock(&inode->i_lock);
+ wake_up_bit(&inode->i_state, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+ spin_unlock(&inode->i_lock);
destroy_inode(inode);
}
@@ -1610,9 +1668,8 @@ EXPORT_SYMBOL(inode_wait);
* to recheck inode state.
*
* It doesn't matter if I_NEW is not set initially, a call to
- * wake_up_inode() after removing from the hash list will DTRT.
- *
- * This is called with inode_lock held.
+ * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
+ * will DTRT.
*/
static void __wait_on_freeing_inode(struct inode *inode)
{
@@ -1620,6 +1677,7 @@ static void __wait_on_freeing_inode(struct inode *inode)
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
schedule();
finish_wait(wq, &wait.wait);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4c29fcf..4dd53fb 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
* I_WILL_FREE, or I_NEW which is fine because by that point
* the inode cannot have any associated watches.
*/
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+ spin_unlock(&inode->i_lock);
continue;
+ }
/*
* If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
* evict all inodes with zero i_count from icache which is
* unnecessarily violent and may in fact be illegal to do.
*/
- if (!atomic_read(&inode->i_count))
+ if (!atomic_read(&inode->i_count)) {
+ spin_unlock(&inode->i_lock);
continue;
+ }
need_iput_tmp = need_iput;
need_iput = NULL;
@@ -274,13 +279,17 @@ void fsnotify_unmount_inodes(struct list_head *list)
__iget(inode);
else
need_iput_tmp = NULL;
+ spin_unlock(&inode->i_lock);
/* In case the dropping of a reference would nuke next_i. */
if ((&next_i->i_sb_list != list) &&
- atomic_read(&next_i->i_count) &&
- !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
- __iget(next_i);
- need_iput = next_i;
+ atomic_read(&next_i->i_count)) {
+ spin_lock(&next_i->i_lock);
+ if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+ __iget(next_i);
+ need_iput = next_i;
+ }
+ spin_unlock(&next_i->i_lock);
}
/*
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a2a622e..a1470fd 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -902,18 +902,19 @@ static void add_dquot_ref(struct super_block *sb, int type)
spin_lock(&inode_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+ spin_lock(&inode->i_lock);
+ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ !atomic_read(&inode->i_writecount) ||
+ !dqinit_needed(inode, type)) {
+ spin_unlock(&inode->i_lock);
continue;
+ }
#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
reserved = 1;
#endif
- if (!atomic_read(&inode->i_writecount))
- continue;
- if (!dqinit_needed(inode, type))
- continue;
-
__iget(inode);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
iput(old_inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7061a85..0a96bb0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1642,7 +1642,7 @@ struct super_operations {
};
/*
- * Inode state bits. Protected by inode_lock.
+ * Inode state bits. Protected by inode->i_lock
*
* Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
* I_DIRTY_DATASYNC and I_DIRTY_PAGES.
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index eb354f6..26f9e36 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -277,7 +277,7 @@ static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
/*
* Mark inode fully dirty. Since we are allocating blocks, inode
* would become fully dirty soon anyway and it reportedly
- * reduces inode_lock contention.
+ * reduces lock contention.
*/
mark_inode_dirty(inode);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d3..79b021d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -99,7 +99,9 @@
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* ->inode_lock (page_remove_rmap->set_page_dirty)
+ * ->inode->i_lock (page_remove_rmap->set_page_dirty)
* ->inode_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
* (code doesn't rely on that order, so you could switch it around)
diff --git a/mm/rmap.c b/mm/rmap.c
index 941bf82..81a95c3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,6 +32,7 @@
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
* inode_lock (in set_page_dirty's __mark_inode_dirty)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
--
1.7.2.3
From: Dave Chinner <[email protected]>
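The first thing we do in writeback_single_inode() is take the i_lock and
the last thing we do is drop it. One caller, writeback_sb_inodes(),
already holds the i_lock just before the call, so pull the i_lock out of
writeback_single_inode() to reduce the round trips on this lock during
inode writeback. The remaining callers, write_inode_now() and
sync_inode(), now take the i_lock around the call themselves.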
Signed-off-by: Dave Chinner <[email protected]>
---
fs/fs-writeback.c | 18 +++++++++++-------
1 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ed80065..b5ed541 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -332,9 +332,9 @@ static void inode_wait_for_writeback(struct inode *inode)
}
/*
- * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either
- * the caller has an active reference on the inode or the inode has I_WILL_FREE
- * set.
+ * Write out an inode's dirty pages. Called under inode_wb_list_lock and
+ * inode->i_lock. Either the caller has an active reference on the inode or
+ * the inode has I_WILL_FREE set.
*
* If `wait' is set, wait on the writeout.
*
@@ -349,7 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
unsigned dirty;
int ret;
- spin_lock(&inode->i_lock);
+ assert_spin_locked(&inode_wb_list_lock);
+ assert_spin_locked(&inode->i_lock);
+
if (!atomic_read(&inode->i_count))
WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
else
@@ -365,7 +367,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* completed a full scan of b_io.
*/
if (wbc->sync_mode != WB_SYNC_ALL) {
- spin_unlock(&inode->i_lock);
requeue_io(inode);
return 0;
}
@@ -456,7 +457,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
}
}
inode_sync_complete(inode);
- spin_unlock(&inode->i_lock);
return ret;
}
@@ -544,7 +544,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
}
__iget(inode);
- spin_unlock(&inode->i_lock);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
@@ -555,6 +554,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
*/
redirty_tail(inode);
}
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
iput(inode);
cond_resched();
@@ -1309,7 +1309,9 @@ int write_inode_now(struct inode *inode, int sync)
might_sleep();
spin_lock(&inode_wb_list_lock);
+ spin_lock(&inode->i_lock);
ret = writeback_single_inode(inode, &wbc);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
if (sync)
inode_sync_wait(inode);
@@ -1333,7 +1335,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
int ret;
spin_lock(&inode_wb_list_lock);
+ spin_lock(&inode->i_lock);
ret = writeback_single_inode(inode, wbc);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
return ret;
}
--
1.7.2.3
From: Dave Chinner <[email protected]>
Protect the inode writeback list with a new global lock
inode_wb_list_lock and use it to protect the list manipulations and
traversals. This lock replaces the inode_lock as the inodes on the
list can be validity checked while holding the inode->i_lock and
hence the inode_lock is no longer needed to protect the list.
Signed-off-by: Dave Chinner <[email protected]>
---
fs/block_dev.c | 4 +-
fs/fs-writeback.c | 76 ++++++++++++++++++++++++++-------------------
fs/inode.c | 12 +++++--
fs/internal.h | 5 +++
include/linux/writeback.h | 1 +
mm/backing-dev.c | 8 ++--
mm/filemap.c | 8 ++--
mm/rmap.c | 4 +-
8 files changed, 70 insertions(+), 48 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bc39b18..2bbc0e6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,13 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY)
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
}
static sector_t max_block(struct block_device *bdev)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5de56a2..ed80065 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
}
/*
+ * Remove the inode from the writeback list it is on.
+ */
+void inode_wb_list_del(struct inode *inode)
+{
+ spin_lock(&inode_wb_list_lock);
+ list_del_init(&inode->i_wb_list);
+ spin_unlock(&inode_wb_list_lock);
+}
+
+
+/*
* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
* furthest end of its superblock's dirty-inode list.
*
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+ assert_spin_locked(&inode_wb_list_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+ assert_spin_locked(&inode_wb_list_lock);
list_move(&inode->i_wb_list, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
{
/*
- * Prevent speculative execution through spin_unlock(&inode_lock);
+ * Prevent speculative execution through
+ * spin_unlock(&inode_wb_list_lock);
*/
+
smp_mb();
wake_up_bit(&inode->i_state, __I_SYNC);
}
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
*/
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
+ assert_spin_locked(&inode_wb_list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
@@ -308,25 +324,23 @@ static void inode_wait_for_writeback(struct inode *inode)
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
while (inode->i_state & I_SYNC) {
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
spin_lock(&inode->i_lock);
}
}
/*
- * Write out an inode's dirty pages. Called under inode_lock. Either the
- * caller has ref on the inode (either via __iget or via syscall against an fd)
- * or the inode has I_WILL_FREE set (via generic_forget_inode)
+ * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either
+ * the caller has an active reference on the inode or the inode has I_WILL_FREE
+ * set.
*
* If `wait' is set, wait on the writeout.
*
* The whole writeout design is quite complex and fragile. We want to avoid
* starvation of particular inodes when others are being redirtied, prevent
* livelocks, etc.
- *
- * Called under inode_lock.
*/
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -368,7 +382,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_state |= I_SYNC;
inode->i_state &= ~I_DIRTY_PAGES;
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
ret = do_writepages(mapping, wbc);
@@ -388,12 +402,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* due to delalloc, clear dirty metadata flags right before
* write_inode()
*/
- spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
int err = write_inode(inode, wbc);
@@ -401,7 +413,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
ret = err;
}
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
spin_lock(&inode->i_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & I_FREEING)) {
@@ -543,10 +555,10 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
*/
redirty_tail(inode);
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
iput(inode);
cond_resched();
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
if (wbc->nr_to_write <= 0) {
wbc->more_io = 1;
return 1;
@@ -565,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
if (!wbc->wb_start)
wbc->wb_start = jiffies; /* livelock avoidance */
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
if (!wbc->for_kupdate || list_empty(&wb->b_io))
queue_io(wb, wbc->older_than_this);
@@ -583,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
if (ret)
break;
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
/* Leave any unwritten inodes on b_io */
}
@@ -592,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
{
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
if (!wbc->for_kupdate || list_empty(&wb->b_io))
queue_io(wb, wbc->older_than_this);
writeback_sb_inodes(sb, wb, wbc, true);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
}
/*
@@ -735,7 +747,7 @@ static long wb_writeback(struct bdi_writeback *wb,
* become available for writeback. Otherwise
* we'll just busyloop.
*/
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
if (!list_empty(&wb->b_more_io)) {
inode = wb_inode(wb->b_more_io.prev);
trace_wbc_writeback_wait(&wbc, wb->bdi);
@@ -743,7 +755,7 @@ static long wb_writeback(struct bdi_writeback *wb,
inode_wait_for_writeback(inode);
spin_unlock(&inode->i_lock);
}
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
}
return wrote;
@@ -1009,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
struct backing_dev_info *bdi = NULL;
- bool wakeup_bdi = false;
/*
* Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1033,7 +1044,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (unlikely(block_dump))
block_dump___mark_inode_dirty(inode);
- spin_lock(&inode_lock);
spin_lock(&inode->i_lock);
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
@@ -1059,12 +1069,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (inode->i_state & I_FREEING)
goto out_unlock_inode;
- spin_unlock(&inode->i_lock);
/*
* If the inode was already on b_dirty/b_io/b_more_io, don't
* reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
+ bool wakeup_bdi = false;
bdi = inode_to_bdi(inode);
if (bdi_cap_writeback_dirty(bdi)) {
@@ -1081,18 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wakeup_bdi = true;
}
+ spin_unlock(&inode->i_lock);
+ spin_lock(&inode_wb_list_lock);
inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+ spin_unlock(&inode_wb_list_lock);
+
+ if (wakeup_bdi)
+ bdi_wakeup_thread_delayed(bdi);
+ return;
}
- goto out;
}
out_unlock_inode:
spin_unlock(&inode->i_lock);
-out:
- spin_unlock(&inode_lock);
- if (wakeup_bdi)
- bdi_wakeup_thread_delayed(bdi);
}
EXPORT_SYMBOL(__mark_inode_dirty);
@@ -1296,9 +1308,9 @@ int write_inode_now(struct inode *inode, int sync)
wbc.nr_to_write = 0;
might_sleep();
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
ret = writeback_single_inode(inode, &wbc);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
if (sync)
inode_sync_wait(inode);
return ret;
@@ -1320,9 +1332,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
int ret;
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
ret = writeback_single_inode(inode, wbc);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
return ret;
}
EXPORT_SYMBOL(sync_inode);
diff --git a/fs/inode.c b/fs/inode.c
index 5483d38..5a7f8ef 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,7 @@
#include <linux/async.h>
#include <linux/posix_acl.h>
#include <linux/ima.h>
+#include "internal.h"
/*
* inode locking rules.
@@ -35,6 +36,8 @@
* inode_lru, inode->i_lru
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
+ * inode_wb_list_lock protects:
+ * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
*
* Lock ordering:
* inode_lock
@@ -43,6 +46,9 @@
* inode_sb_list_lock
* inode->i_lock
* inode_lru_lock
+ *
+ * inode_wb_list_lock
+ * inode->i_lock
*/
/*
@@ -104,6 +110,7 @@ static struct hlist_head *inode_hashtable __read_mostly;
DEFINE_SPINLOCK(inode_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
/*
* iprune_sem provides exclusion between the icache shrinking and the
@@ -482,10 +489,7 @@ static void evict(struct inode *inode)
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(!list_empty(&inode->i_lru));
- spin_lock(&inode_lock);
- list_del_init(&inode->i_wb_list);
- spin_unlock(&inode_lock);
-
+ inode_wb_list_del(inode);
inode_sb_list_del(inode);
if (op->evict_inode) {
diff --git a/fs/internal.h b/fs/internal.h
index 7013ae0..b29c46e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -127,6 +127,11 @@ extern long do_handle_open(int mountdirfd,
*/
extern spinlock_t inode_sb_list_lock;
+/*
+ * fs-writeback.c
+ */
+extern void inode_wb_list_del(struct inode *inode);
+
extern int get_nr_dirty_inodes(void);
extern void evict_inodes(struct super_block *);
extern int invalidate_inodes(struct super_block *, bool);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 0ead399..3f5fee7 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -10,6 +10,7 @@
struct backing_dev_info;
extern spinlock_t inode_lock;
+extern spinlock_t inode_wb_list_lock;
/*
* fs/fs-writeback.c
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d..4b3e9f1 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -73,14 +73,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct inode *inode;
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -682,11 +682,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
}
bdi_unregister(bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index 79b021d..c88d17f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
* ->i_mutex
* ->i_alloc_sem (various)
*
- * ->inode_lock
- * ->sb_lock (fs/fs-writeback.c)
+ * inode_wb_list_lock
+ * sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
* ->i_mmap_lock
@@ -98,9 +98,9 @@
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (page_remove_rmap->set_page_dirty)
+ * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (zap_pte_range->set_page_dirty)
+ * inode_wb_list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
diff --git a/mm/rmap.c b/mm/rmap.c
index 81a95c3..af9e0d7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,12 +31,12 @@
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
- * inode_lock (in set_page_dirty's __mark_inode_dirty)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
- * within inode_lock in __sync_single_inode)
+ * within inode_wb_list_lock in __sync_single_inode)
*
* (code doesn't rely on that order so it could be switched around)
* ->tasklist_lock
--
1.7.2.3
From: Dave Chinner <[email protected]>
Introduce the inode_lru_lock to protect the inode_lru list. This
lock is nested inside the inode->i_lock to allow the inode to be
added to the LRU list in iput_final without needing to deal with
lock inversions. This keeps iput_final() clean and neat.
Further, where marking the inode I_FREEING and removing it from the
LRU, move the LRU list manipulation within the inode->i_lock to keep
the list manipulation consistent with iput_final. This also means
that most of the open coded LRU list removal + unused inode
accounting can now use the inode_lru_list_del() wrappers which
cleans the code up further.
However, this locking change means that the LRU traversal in
prune_icache() inverts this lock ordering and needs to use trylock
semantics on the inode->i_lock to avoid deadlocking. In these cases,
if we fail to lock the inode we move it to the back of the LRU to
prevent spinning on it.
Signed-off-by: Dave Chinner <[email protected]>
---
fs/inode.c | 39 ++++++++++++++++++++++++++++++---------
1 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 442b55b..f6e6e37 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -31,10 +31,13 @@
*
* inode->i_lock protects:
* inode->i_state, inode->i_hash, __iget()
+ * inode_lru_lock protects:
+ * inode_lru, inode->i_lru
*
* Lock ordering:
* inode_lock
* inode->i_lock
+ * inode_lru_lock
*/
/*
@@ -84,6 +87,7 @@ static unsigned int i_hash_shift __read_mostly;
*/
static LIST_HEAD(inode_lru);
+static DEFINE_SPINLOCK(inode_lru_lock);
static struct hlist_head *inode_hashtable __read_mostly;
/*
@@ -355,18 +359,22 @@ EXPORT_SYMBOL(ihold);
static void inode_lru_list_add(struct inode *inode)
{
+ spin_lock(&inode_lru_lock);
if (list_empty(&inode->i_lru)) {
list_add(&inode->i_lru, &inode_lru);
inodes_stat.nr_unused++;
}
+ spin_unlock(&inode_lru_lock);
}
static void inode_lru_list_del(struct inode *inode)
{
+ spin_lock(&inode_lru_lock);
if (!list_empty(&inode->i_lru)) {
list_del_init(&inode->i_lru);
inodes_stat.nr_unused--;
}
+ spin_unlock(&inode_lru_lock);
}
static inline void __inode_sb_list_add(struct inode *inode)
@@ -542,10 +550,9 @@ void evict_inodes(struct super_block *sb)
}
inode->i_state |= I_FREEING;
- if (!(inode->i_state & (I_DIRTY | I_SYNC)))
- inodes_stat.nr_unused--;
+ inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
- list_move(&inode->i_lru, &dispose);
+ list_add(&inode->i_lru, &dispose);
}
spin_unlock(&inode_lock);
@@ -595,10 +602,9 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
}
inode->i_state |= I_FREEING;
- if (!(inode->i_state & (I_DIRTY | I_SYNC)))
- inodes_stat.nr_unused--;
+ inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
- list_move(&inode->i_lru, &dispose);
+ list_add(&inode->i_lru, &dispose);
}
spin_unlock(&inode_lock);
@@ -622,7 +628,7 @@ static int can_unuse(struct inode *inode)
/*
* Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside inode_lock by dispose_list().
+ * temporary list and then are freed outside inode_lru_lock by dispose_list().
*
* Any inodes which are pinned purely because of attached pagecache have their
* pagecache removed. If the inode has metadata buffers attached to
@@ -644,6 +650,7 @@ static void prune_icache(int nr_to_scan)
down_read(&iprune_sem);
spin_lock(&inode_lock);
+ spin_lock(&inode_lru_lock);
for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
struct inode *inode;
@@ -653,10 +660,19 @@ static void prune_icache(int nr_to_scan)
inode = list_entry(inode_lru.prev, struct inode, i_lru);
/*
+ * we are inverting the inode_lru_lock/inode->i_lock here,
+ * so use a trylock. If we fail to get the lock, just move the
+ * inode to the back of the list so we don't spin on it.
+ */
+ if (!spin_trylock(&inode->i_lock)) {
+ list_move(&inode->i_lru, &inode_lru);
+ continue;
+ }
+
+ /*
* Referenced or dirty inodes are still in use. Give them
* another pass through the LRU as we canot reclaim them now.
*/
- spin_lock(&inode->i_lock);
if (atomic_read(&inode->i_count) ||
(inode->i_state & ~I_REFERENCED)) {
spin_unlock(&inode->i_lock);
@@ -675,17 +691,21 @@ static void prune_icache(int nr_to_scan)
if (inode_has_buffers(inode) || inode->i_data.nrpages) {
__iget(inode);
spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_lru_lock);
spin_unlock(&inode_lock);
if (remove_inode_buffers(inode))
reap += invalidate_mapping_pages(&inode->i_data,
0, -1);
iput(inode);
spin_lock(&inode_lock);
+ spin_lock(&inode_lru_lock);
if (inode != list_entry(inode_lru.next,
struct inode, i_lru))
continue; /* wrong inode or list_empty */
- spin_lock(&inode->i_lock);
+ /* avoid lock inversions with trylock */
+ if (!spin_trylock(&inode->i_lock))
+ continue;
if (!can_unuse(inode)) {
spin_unlock(&inode->i_lock);
continue;
@@ -702,6 +722,7 @@ static void prune_icache(int nr_to_scan)
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
__count_vm_events(PGINODESTEAL, reap);
+ spin_unlock(&inode_lru_lock);
spin_unlock(&inode_lock);
dispose_list(&freeable);
--
1.7.2.3
From: Dave Chinner <[email protected]>
We have a couple of places that dispose of inodes. Factor the
disposal into evict() to isolate this code and make it simpler to
peel away the inode_lock from the code.
While doing this, change the logic flow in iput_final() to separate
the different cases that need to be handled to make the transitions
the inode goes through more obvious.
Signed-off-by: Dave Chinner <[email protected]>
---
fs/inode.c | 104 +++++++++++++++++++++++------------------------------------
1 files changed, 41 insertions(+), 63 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index bd5a237..442b55b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -422,17 +422,6 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
EXPORT_SYMBOL(__insert_inode_hash);
/**
- * __remove_inode_hash - remove an inode from the hash
- * @inode: inode to unhash
- *
- * Remove an inode from the superblock.
- */
-static void __remove_inode_hash(struct inode *inode)
-{
- hlist_del_init(&inode->i_hash);
-}
-
-/**
* remove_inode_hash - remove an inode from the hash
* @inode: inode to unhash
*
@@ -461,10 +450,31 @@ void end_writeback(struct inode *inode)
}
EXPORT_SYMBOL(end_writeback);
+/*
+ * Free the inode passed in, removing it from the lists it is still connected
+ * to. We remove any pages still attached to the inode and wait for any IO that
+ * is still in progress before finally destroying the inode.
+ *
+ * An inode must already be marked I_FREEING so that we avoid the inode being
+ * moved back onto lists if we race with other code that manipulates the lists
+ * (e.g. writeback_single_inode). The caller is responsible for setting this.
+ *
+ * An inode must already be removed from the LRU list before being evicted from
+ * the cache. This should occur atomically with setting the I_FREEING state
+ * flag, so no inodes here should ever be on the LRU when being evicted.
+ */
static void evict(struct inode *inode)
{
const struct super_operations *op = inode->i_sb->s_op;
+ BUG_ON(!(inode->i_state & I_FREEING));
+ BUG_ON(!list_empty(&inode->i_lru));
+
+ spin_lock(&inode_lock);
+ list_del_init(&inode->i_wb_list);
+ __inode_sb_list_del(inode);
+ spin_unlock(&inode_lock);
+
if (op->evict_inode) {
op->evict_inode(inode);
} else {
@@ -476,6 +486,15 @@ static void evict(struct inode *inode)
bd_forget(inode);
if (S_ISCHR(inode->i_mode) && inode->i_cdev)
cd_forget(inode);
+
+ remove_inode_hash(inode);
+
+ spin_lock(&inode->i_lock);
+ wake_up_bit(&inode->i_state, __I_NEW);
+ BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+ spin_unlock(&inode->i_lock);
+
+ destroy_inode(inode);
}
/*
@@ -494,16 +513,6 @@ static void dispose_list(struct list_head *head)
list_del_init(&inode->i_lru);
evict(inode);
-
- spin_lock(&inode_lock);
- __remove_inode_hash(inode);
- __inode_sb_list_del(inode);
- spin_unlock(&inode_lock);
-
- spin_lock(&inode->i_lock);
- wake_up_bit(&inode->i_state, __I_NEW);
- spin_unlock(&inode->i_lock);
- destroy_inode(inode);
}
}
@@ -536,13 +545,7 @@ void evict_inodes(struct super_block *sb)
if (!(inode->i_state & (I_DIRTY | I_SYNC)))
inodes_stat.nr_unused--;
spin_unlock(&inode->i_lock);
-
- /*
- * Move the inode off the IO lists and LRU once I_FREEING is
- * set so that it won't get moved back on there if it is dirty.
- */
list_move(&inode->i_lru, &dispose);
- list_del_init(&inode->i_wb_list);
}
spin_unlock(&inode_lock);
@@ -595,13 +598,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
if (!(inode->i_state & (I_DIRTY | I_SYNC)))
inodes_stat.nr_unused--;
spin_unlock(&inode->i_lock);
-
- /*
- * Move the inode off the IO lists and LRU once I_FREEING is
- * set so that it won't get moved back on there if it is dirty.
- */
list_move(&inode->i_lru, &dispose);
- list_del_init(&inode->i_wb_list);
}
spin_unlock(&inode_lock);
@@ -698,12 +695,7 @@ static void prune_icache(int nr_to_scan)
inode->i_state |= I_FREEING;
spin_unlock(&inode->i_lock);
- /*
- * Move the inode off the IO lists and LRU once I_FREEING is
- * set so that it won't get moved back on there if it is dirty.
- */
list_move(&inode->i_lru, &freeable);
- list_del_init(&inode->i_wb_list);
inodes_stat.nr_unused--;
}
if (current_is_kswapd())
@@ -1433,16 +1425,16 @@ static void iput_final(struct inode *inode)
else
drop = generic_drop_inode(inode);
+ if (!drop && (sb->s_flags & MS_ACTIVE)) {
+ inode->i_state |= I_REFERENCED;
+ if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+ inode_lru_list_add(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_lock);
+ return;
+ }
+
if (!drop) {
- if (sb->s_flags & MS_ACTIVE) {
- inode->i_state |= I_REFERENCED;
- if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
- inode_lru_list_add(inode);
- }
- spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lock);
- return;
- }
inode->i_state |= I_WILL_FREE;
spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
@@ -1451,28 +1443,14 @@ static void iput_final(struct inode *inode)
spin_lock(&inode->i_lock);
WARN_ON(inode->i_state & I_NEW);
inode->i_state &= ~I_WILL_FREE;
- __remove_inode_hash(inode);
}
inode->i_state |= I_FREEING;
- spin_unlock(&inode->i_lock);
-
- /*
- * Move the inode off the IO lists and LRU once I_FREEING is
- * set so that it won't get moved back on there if it is dirty.
- */
inode_lru_list_del(inode);
- list_del_init(&inode->i_wb_list);
-
- __inode_sb_list_del(inode);
+ spin_unlock(&inode->i_lock);
spin_unlock(&inode_lock);
+
evict(inode);
- remove_inode_hash(inode);
- spin_lock(&inode->i_lock);
- wake_up_bit(&inode->i_state, __I_NEW);
- BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
- spin_unlock(&inode->i_lock);
- destroy_inode(inode);
}
/**
--
1.7.2.3
On Tue, Mar 22, 2011 at 12:23 PM, Dave Chinner <[email protected]> wrote:
> Hi Al,
>
> The following patches are the inode_lock breakup series originally
> derived from Nick Piggin's vfs-scale tree. I've kind of been sitting
> on them until the dcache_lock breakup and rcu path-walk has had some
> time to be shaken out. The patch set is pretty much unchanged from
> the last round of review last last year - all I've done to bring it
> up to date is forward port it and run it through some testing on XFS
> and ext4.
>
> I know it's late in the .39 merge window, but I hope you'll consider
> it if the patches are still acceptable(*). Otherwise I'm happy to take
> the time to get it right for .40.
>
> Cheers,
>
> Dave.
>
> (*) The series can also be found here:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/dgc/xfsdev.git inode-scale
>
> Dave Chinner (8):
> fs: protect inode->i_state with inode->i_lock
> fs: factor inode disposal
> fs: Lock the inode LRU list separately
> fs: remove inode_lock from iput_final and prune_icache
> fs: move i_sb_list out from under inode_lock
> fs: move i_wb_list out from under inode_lock
> fs: rename inode_lock to inode_hash_lock
> fs: pull inode->i_lock up out of writeback_single_inode
>
[...]
Hi,
I have tested this patch-series on top of linux-next (next-20110322)
by running xfstests-dev (built from git).
My sdb2 partition (on an external 1GBytes USB-2.0 hdd) was formatted
and mounted as ext4-fs .
The check-log is attached (not sure how to interpret the errors and failures).
Regards,
- Sedat -
P.S.: Note to myself (Host: Debian/sid i386)
[ BUILD ]
$ apt-get install uuid-dev xfslibs-dev libacl1-dev libdm0-dev libgdbm-dev
[ TESTS ]
$ apt-get install xfsdump
DEVNODE="sdb2"
FSTYPE="ext4"
$ mkfs.$FSTYPE /dev/$DEVNODE
$ mkdir -p /mnt/$DEVNODE
$ mount -t $FSTYPE /dev/$DEVNODE /mnt/$DEVNODE
$ TEST_DIR=/mnt/$DEVNODE TEST_DEV=/dev/$DEVNODE ./check 2>&1 | tee
../check_${DEVNODE}-${FSTYPE}.txt
-SD // 22-Mar-2011
On Tue, Mar 22, 2011 at 07:17:04PM +0100, Sedat Dilek wrote:
> On Tue, Mar 22, 2011 at 12:23 PM, Dave Chinner <[email protected]> wrote:
> > Hi Al,
> >
> > The following patches are the inode_lock breakup series originally
> > derived from Nick Piggin's vfs-scale tree. I've kind of been sitting
> > on them until the dcache_lock breakup and rcu path-walk has had some
> > time to be shaken out. The patch set is pretty much unchanged from
> > the last round of review last last year - all I've done to bring it
> > up to date is forward port it and run it through some testing on XFS
> > and ext4.
> >
> > I know it's late in the .39 merge window, but I hope you'll consider
> > it if the patches are still acceptable(*). Otherwise I'm happy to take
> > the time to get it right for .40.
> >
> > Cheers,
> >
> > Dave.
> >
> > (*) The series can also be found here:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/dgc/xfsdev.git inode-scale
> >
> > Dave Chinner (8):
> > fs: protect inode->i_state with inode->i_lock
> > fs: factor inode disposal
> > fs: Lock the inode LRU list separately
> > fs: remove inode_lock from iput_final and prune_icache
> > fs: move i_sb_list out from under inode_lock
> > fs: move i_wb_list out from under inode_lock
> > fs: rename inode_lock to inode_hash_lock
> > fs: pull inode->i_lock up out of writeback_single_inode
> >
> [...]
>
> Hi,
>
> I have tested this patch-series on top of linux-next (next-20110322)
> by running xfstests-dev (built from git).
>
> My sdb2 partition (on an external 1GBytes USB-2.0 hdd) was formatted
> and mounted as ext4-fs .
If you really want to use xfstests to produce some system stress,
you'd do better to use an XFS filesystem ;)
> The check-log is attached (not sure how to interpret the errors and failures).
Nothing indicates an unknown failure...
> 001 5s ... 4s
> 002 1s ... 1s
> 003 [not run] not suitable for this filesystem type: ext4
> 004 [not run] not suitable for this filesystem type: ext4
> 005 - output mismatch (see 005.out.bad)
> --- 005.out 2011-03-22 17:47:03.861226933 +0100
> +++ 005.out.bad 2011-03-22 18:47:58.847277538 +0100
> @@ -1,7 +1,7 @@
> QA output created by 005
> *** touch deep symlinks
>
> -ELOOP returned. Good.
> +No ELOOP? Unexpected!
>
> *** touch recusive symlinks
This is a result of Al fixing the max nested loop depth very early
on in .39, so the test needs to run to deeper nesting depths to
produce ELOOP. So it's a test problem, not a bug.
> 197 [not run] not suitable for this filesystem type: ext4
> 198 [failed, exit status 127] - output mismatch (see 198.out.bad)
> --- 198.out 2011-03-22 17:47:03.917226229 +0100
> +++ 198.out.bad 2011-03-22 19:04:12.591035920 +0100
> @@ -1,2 +1,3 @@
> QA output created by 198
> Silence is golden.
> +./198: line 54: /home/sd/src/xfstests-dev/xfstests-dev/src/aio-dio-regress/aiodio_sparse2: No such file or directory
You need to install libaio and friends so that the binary is
built. We probably need to add a "requires_aio" test option to
detect this situation and not_run the test gracefully.
> 238 [not run] not suitable for this filesystem type: ext4
> 239 [not run] src/aio-dio-regress/aio-dio-hole-filling-race not built
Like this one does....
> 240 [failed, exit status 127] - output mismatch (see 240.out.bad)
> --- 240.out 2011-03-22 17:47:03.925226129 +0100
> +++ 240.out.bad 2011-03-22 19:04:59.866441589 +0100
> @@ -1,2 +1,3 @@
> QA output created by 240
> Silence is golden.
> +./240: line 72: /home/sd/src/xfstests-dev/xfstests-dev/src/aio-dio-regress/aiodio_sparse2: No such file or directory
Same again.
> 241 [not run] dbench not found
> 242 [not run] not suitable for this filesystem type: ext4
> 243 3s ... 3s
> 244 [not run] not suitable for this filesystem type: ext4
> 245 0s ... 0s
> 246 0s ... 0s
> 247 77s ... 78s
> 248 0s ... 0s
> 249 0s ... 1s
> 250 [not run] not suitable for this filesystem type: ext4
> 251 [not run] this test requires a valid $SCRATCH_DEV
> Ran: 001 002 005 006 007 010 011 013 014 070 074 075 088 089 126 127 131 133 184 198 213 214 215 221 225 228 236 237 240 243 245 246 247 248 249
> Not run: 003 004 008 009 012 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 071 072 073 076 077 078 079 080 081 082 083 084 085 086 087 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 128 129 130 132 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 185 186 187 188 189 190 191 192 193 194 195 196 197 199 200 201 202 203 204 205 206 207 208 209 210 211 212 216 217 218 219 220 222 223 224 226 227 229 230 231 232 233 234 235 238 239 241 242 244 250 251
> Failures: 005 198 240
> Failed 3 of 35 tests
A typical XFS run gives:
Ran: 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 019 020 021 026 027 028 029 030 031 032 033 034 041 042 045 046 047 048 049 050 051 052 053 054 056 061 062 063 064 065 066 067 068 069 070 072 073 074 075 076 077 078 079 083 084 085 086 087 088 089 091 092 096 100 103 104 105 108 109 110 112 113 116 117 118 119 120 121 123 124 125 126 127 128 129 130 131 132 133 134 135 137 138 139 140 141 164 165 166 167 169 170 174 178 179 180 181 182 183 184 186 187 188 189 190 192 193 194 195 196 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
Not run: 035 040 044 057 058 090 093 094 095 097 098 099 122 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 168 175 176 177 185 191 197
Failures: 189 229 250
Failed 3 of 180 tests
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Mar 23, 2011 at 7:29 AM, Dave Chinner <[email protected]> wrote:
> On Tue, Mar 22, 2011 at 07:17:04PM +0100, Sedat Dilek wrote:
>> On Tue, Mar 22, 2011 at 12:23 PM, Dave Chinner <[email protected]> wrote:
>> > Hi Al,
>> >
>> > The following patches are the inode_lock breakup series originally
>> > derived from Nick Piggin's vfs-scale tree. I've kind of been sitting
>> > on them until the dcache_lock breakup and rcu path-walk has had some
>> > time to be shaken out. The patch set is pretty much unchanged from
>> > the last round of review last last year - all I've done to bring it
>> > up to date is forward port it and run it through some testing on XFS
>> > and ext4.
>> >
>> > I know it's late in the .39 merge window, but I hope you'll consider
>> > it if the patches are still acceptable(*). Otherwise I'm happy to take
>> > the time to get it right for .40.
>> >
>> > Cheers,
>> >
>> > Dave.
>> >
>> > (*) The series can also be found here:
>> >
>> > git://git.kernel.org/pub/scm/linux/kernel/git/dgc/xfsdev.git inode-scale
>> >
>> > Dave Chinner (8):
>> > fs: protect inode->i_state with inode->i_lock
>> > fs: factor inode disposal
>> > fs: Lock the inode LRU list separately
>> > fs: remove inode_lock from iput_final and prune_icache
>> > fs: move i_sb_list out from under inode_lock
>> > fs: move i_wb_list out from under inode_lock
>> > fs: rename inode_lock to inode_hash_lock
>> > fs: pull inode->i_lock up out of writeback_single_inode
>> >
>> [...]
>>
>> Hi,
>>
>> I have tested this patch-series on top of linux-next (next-20110322)
>> by running xfstests-dev (built from git).
>>
>> My sdb2 partition (on an external 1GBytes USB-2.0 hdd) was formatted
>> and mounted as ext4-fs .
>
> If you really want to use xfstests to produce some system stress,
> you'd do better to use an XFS filesystem ;)
>
First, I was puzzled as to why I could not install xfstests-dev to an
individual $PREFIX and wasted(?) some time looking into the
configure* files - dbtest.c failed to build because $libgdbm was not
set correctly (a normal build via make is OK).
Then I saw the hardcoded /var/lib/ as INSTALL_PATH... and didn't
want to dig deeper.
I skimmed the README, but that file needs a few more hints.
I can do some XFS testing if you wish.
The numbers in the result say nothing to me as I can't compare them
with other systems.
It's like putting your hand in a glass of water and wildly guessing
the temperature.
You can only say cold or warm if you have a second glass with a different temperature.
So, it might be worth adding a file (name: benchmark.txt?) with some results?
>> The check-log is attached (not sure how to interpret the errors and failures).
>
> Nothing indicates an unknown failure...
>
>> 001 5s ... 4s
>> 002 1s ... 1s
>> 003 [not run] not suitable for this filesystem type: ext4
>> 004 [not run] not suitable for this filesystem type: ext4
>> 005 - output mismatch (see 005.out.bad)
>> --- 005.out 2011-03-22 17:47:03.861226933 +0100
>> +++ 005.out.bad 2011-03-22 18:47:58.847277538 +0100
>> @@ -1,7 +1,7 @@
>> QA output created by 005
>> *** touch deep symlinks
>>
>> -ELOOP returned. Good.
>> +No ELOOP? Unexpected!
>>
>> *** touch recusive symlinks
>
> This is a result of Al fixing the max nested loop depth very early
> on in .39, so the test needs to run to deeper nesting depths to
> produce ELOOP. So it's a test problem, not a bug.
>
>> 197 [not run] not suitable for this filesystem type: ext4
>> 198 [failed, exit status 127] - output mismatch (see 198.out.bad)
>> --- 198.out 2011-03-22 17:47:03.917226229 +0100
>> +++ 198.out.bad 2011-03-22 19:04:12.591035920 +0100
>> @@ -1,2 +1,3 @@
>> QA output created by 198
>> Silence is golden.
>> +./198: line 54: /home/sd/src/xfstests-dev/xfstests-dev/src/aio-dio-regress/aiodio_sparse2: No such file or directory
>
> You need to install libaio and friends so that the binary is
> built. We probably need to add a "requires_aio" test option to
> detect this situation and not_run the test gracefully.
>
Yupp, the build-/configure-system could be enhanced in that case.
(I add "Notes to myself" see my P.S. to remember what I did and what I missed.)
>> 238 [not run] not suitable for this filesystem type: ext4
>> 239 [not run] src/aio-dio-regress/aio-dio-hole-filling-race not built
>
> Like this one does....
>
>> 240 [failed, exit status 127] - output mismatch (see 240.out.bad)
>> --- 240.out 2011-03-22 17:47:03.925226129 +0100
>> +++ 240.out.bad 2011-03-22 19:04:59.866441589 +0100
>> @@ -1,2 +1,3 @@
>> QA output created by 240
>> Silence is golden.
>> +./240: line 72: /home/sd/src/xfstests-dev/xfstests-dev/src/aio-dio-regress/aiodio_sparse2: No such file or directory
>
> Same again.
>
>> 241 [not run] dbench not found
>> 242 [not run] not suitable for this filesystem type: ext4
>> 243 3s ... 3s
>> 244 [not run] not suitable for this filesystem type: ext4
>> 245 0s ... 0s
>> 246 0s ... 0s
>> 247 77s ... 78s
>> 248 0s ... 0s
>> 249 0s ... 1s
>> 250 [not run] not suitable for this filesystem type: ext4
>> 251 [not run] this test requires a valid $SCRATCH_DEV
>> Ran: 001 002 005 006 007 010 011 013 014 070 074 075 088 089 126 127 131 133 184 198 213 214 215 221 225 228 236 237 240 243 245 246 247 248 249
>> Not run: 003 004 008 009 012 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 071 072 073 076 077 078 079 080 081 082 083 084 085 086 087 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 128 129 130 132 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 185 186 187 188 189 190 191 192 193 194 195 196 197 199 200 201 202 203 204 205 206 207 208 209 210 211 212 216 217 218 219 220 222 223 224 226 227 229 230 231 232 233 234 235 238 239 241 242 244 250 251
>> Failures: 005 198 240
>> Failed 3 of 35 tests
>
> A typical XFS run gives:
>
> Ran: 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 019 020 021 026 027 028 029 030 031 032 033 034 041 042 045 046 047 048 049 050 051 052 053 054 056 061 062 063 064 065 066 067 068 069 070 072 073 074 075 076 077 078 079 083 084 085 086 087 088 089 091 092 096 100 103 104 105 108 109 110 112 113 116 117 118 119 120 121 123 124 125 126 127 128 129 130 131 132 133 134 135 137 138 139 140 141 164 165 166 167 169 170 174 178 179 180 181 182 183 184 186 187 188 189 190 192 193 194 195 196 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
> Not run: 035 040 044 057 058 090 093 094 095 097 098 099 122 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 168 175 176 177 185 191 197
> Failures: 189 229 250
> Failed 3 of 180 tests
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
>
Thanks for your explanations!
BTW, are changes/updates to xfstests-dev announced somewhere (like ML)?
- Sedat -
Merge get_new_inode/get_new_inode_fast into iget5_locked/iget_locked
as those were the only callers. Remove the internal ifind/ifind_fast
helpers - ifind_fast only had a single caller, and ifind had two
callers wanting it to do different things. Also clean up the comments
in this area to focus on information important to a developer trying
to use it, instead of overloading them with implementation details.
Signed-off-by: Christoph Hellwig <[email protected]>
Index: xfs/fs/inode.c
===================================================================
--- xfs.orig/fs/inode.c 2011-03-23 12:52:49.898877203 +0100
+++ xfs/fs/inode.c 2011-03-23 12:52:57.066879923 +0100
@@ -930,20 +930,42 @@ void unlock_new_inode(struct inode *inod
}
EXPORT_SYMBOL(unlock_new_inode);
-/*
- * This is called without the inode hash lock held.. Be careful.
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
*
- * We no longer cache the sb_flags in i_flags - see fs.h
- * -- [email protected]
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if present return it with an increased reference count. This is
+ * a generalized version of iget_locked() for file systems where the inode
+ * number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is not in cache, allocate a new inode and return it locked,
+ * hashed, and with the I_NEW flag set. The file system gets to fill it in
+ * before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_hash_lock held, so can't
+ * sleep.
*/
-static struct inode *get_new_inode(struct super_block *sb,
- struct hlist_head *head,
- int (*test)(struct inode *, void *),
- int (*set)(struct inode *, void *),
- void *data)
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
+ spin_lock(&inode_hash_lock);
+ inode = find_inode(sb, head, test, data);
+ spin_unlock(&inode_hash_lock);
+
+ if (inode) {
+ wait_on_inode(inode);
+ return inode;
+ }
+
inode = alloc_inode(sb);
if (inode) {
struct inode *old;
@@ -985,16 +1007,34 @@ set_failed:
destroy_inode(inode);
return NULL;
}
+EXPORT_SYMBOL(iget5_locked);
-/*
- * get_new_inode_fast is the fast path version of get_new_inode, see the
- * comment at iget_locked for details.
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @ino: inode number to get
+ *
+ * Search for the inode specified by @ino in the inode cache and if present
+ * return it with an increased reference count. This is for file systems
+ * where the inode number is sufficient for unique identification of an inode.
+ *
+ * If the inode is not in cache, allocate a new inode and return it locked,
+ * hashed, and with the I_NEW flag set. The file system gets to fill it in
+ * before unlocking it via unlock_new_inode().
*/
-static struct inode *get_new_inode_fast(struct super_block *sb,
- struct hlist_head *head, unsigned long ino)
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+ spin_lock(&inode_hash_lock);
+ inode = find_inode_fast(sb, head, ino);
+ spin_unlock(&inode_hash_lock);
+ if (inode) {
+ wait_on_inode(inode);
+ return inode;
+ }
+
inode = alloc_inode(sb);
if (inode) {
struct inode *old;
@@ -1029,6 +1069,7 @@ static struct inode *get_new_inode_fast(
}
return inode;
}
+EXPORT_SYMBOL(iget_locked);
/*
* search the inode cache for a matching inode number.
@@ -1112,100 +1153,32 @@ struct inode *igrab(struct inode *inode)
EXPORT_SYMBOL(igrab);
/**
- * ifind - internal function, you want ilookup5() or iget5().
+ * ilookup5_nowait - search for an inode in the inode cache
* @sb: super block of file system to search
- * @head: the head of the list to search
+ * @hashval: hash value (usually inode number) to search for
* @test: callback used for comparisons between inodes
* @data: opaque data pointer to pass to @test
- * @wait: if true wait for the inode to be unlocked, if false do not
- *
- * ifind() searches for the inode specified by @data in the inode
- * cache. This is a generalized version of ifind_fast() for file systems where
- * the inode number is not sufficient for unique identification of an inode.
*
+ * Search for the inode specified by @hashval and @data in the inode cache.
* If the inode is in the cache, the inode is returned with an incremented
* reference count.
*
- * Otherwise NULL is returned.
+ * Note: I_NEW is not waited upon so you have to be very careful what you do
+ * with the returned inode. You probably should be using ilookup5() instead.
*
- * Note, @test is called with the inode_hash_lock held, so can't sleep.
+ * Note: @test is called with the inode_hash_lock held, so can't sleep.
*/
-static struct inode *ifind(struct super_block *sb,
- struct hlist_head *head, int (*test)(struct inode *, void *),
- void *data, const int wait)
+struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
spin_lock(&inode_hash_lock);
inode = find_inode(sb, head, test, data);
- if (inode) {
- spin_unlock(&inode_hash_lock);
- if (likely(wait))
- wait_on_inode(inode);
- return inode;
- }
- spin_unlock(&inode_hash_lock);
- return NULL;
-}
-
-/**
- * ifind_fast - internal function, you want ilookup() or iget().
- * @sb: super block of file system to search
- * @head: head of the list to search
- * @ino: inode number to search for
- *
- * ifind_fast() searches for the inode @ino in the inode cache. This is for
- * file systems where the inode number is sufficient for unique identification
- * of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count.
- *
- * Otherwise NULL is returned.
- */
-static struct inode *ifind_fast(struct super_block *sb,
- struct hlist_head *head, unsigned long ino)
-{
- struct inode *inode;
-
- spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
- if (inode) {
- spin_unlock(&inode_hash_lock);
- wait_on_inode(inode);
- return inode;
- }
spin_unlock(&inode_hash_lock);
- return NULL;
-}
-/**
- * ilookup5_nowait - search for an inode in the inode cache
- * @sb: super block of file system to search
- * @hashval: hash value (usually inode number) to search for
- * @test: callback used for comparisons between inodes
- * @data: opaque data pointer to pass to @test
- *
- * ilookup5() uses ifind() to search for the inode specified by @hashval and
- * @data in the inode cache. This is a generalized version of ilookup() for
- * file systems where the inode number is not sufficient for unique
- * identification of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count. Note, the inode lock is not waited upon so you have to be
- * very careful what you do with the returned inode. You probably should be
- * using ilookup5() instead.
- *
- * Otherwise NULL is returned.
- *
- * Note, @test is called with the inode_hash_lock held, so can't sleep.
- */
-struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
- int (*test)(struct inode *, void *), void *data)
-{
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
-
- return ifind(sb, head, test, data, 0);
+ return inode;
}
EXPORT_SYMBOL(ilookup5_nowait);
@@ -1216,24 +1189,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
* @test: callback used for comparisons between inodes
* @data: opaque data pointer to pass to @test
*
- * ilookup5() uses ifind() to search for the inode specified by @hashval and
- * @data in the inode cache. This is a generalized version of ilookup() for
- * file systems where the inode number is not sufficient for unique
- * identification of an inode.
- *
- * If the inode is in the cache, the inode lock is waited upon and the inode is
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if the inode is in the cache, return the inode with an incremented
+ * reference count. Waits on I_NEW before returning the inode.
* returned with an incremented reference count.
*
- * Otherwise NULL is returned.
+ * This is a generalized version of ilookup() for file systems where the
+ * inode number is not sufficient for unique identification of an inode.
*
- * Note, @test is called with the inode_hash_lock held, so can't sleep.
+ * Note: @test is called with the inode_hash_lock held, so can't sleep.
*/
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
- return ifind(sb, head, test, data, 1);
+ if (inode)
+ wait_on_inode(inode);
+ return inode;
}
EXPORT_SYMBOL(ilookup5);
@@ -1242,92 +1215,23 @@ EXPORT_SYMBOL(ilookup5);
* @sb: super block of file system to search
* @ino: inode number to search for
*
- * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
- * This is for file systems where the inode number is sufficient for unique
- * identification of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count.
- *
- * Otherwise NULL is returned.
+ * Search for the inode @ino in the inode cache, and if the inode is in the
+ * cache, the inode is returned with an incremented reference count.
*/
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
-
- return ifind_fast(sb, head, ino);
-}
-EXPORT_SYMBOL(ilookup);
-
-/**
- * iget5_locked - obtain an inode from a mounted file system
- * @sb: super block of file system
- * @hashval: hash value (usually inode number) to get
- * @test: callback used for comparisons between inodes
- * @set: callback used to initialize a new struct inode
- * @data: opaque data pointer to pass to @test and @set
- *
- * iget5_locked() uses ifind() to search for the inode specified by @hashval
- * and @data in the inode cache and if present it is returned with an increased
- * reference count. This is a generalized version of iget_locked() for file
- * systems where the inode number is not sufficient for unique identification
- * of an inode.
- *
- * If the inode is not in cache, get_new_inode() is called to allocate a new
- * inode and this is returned locked, hashed, and with the I_NEW flag set. The
- * file system gets to fill it in before unlocking it via unlock_new_inode().
- *
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
- */
-struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
- int (*test)(struct inode *, void *),
- int (*set)(struct inode *, void *), void *data)
-{
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
- inode = ifind(sb, head, test, data, 1);
- if (inode)
- return inode;
- /*
- * get_new_inode() will do the right thing, re-trying the search
- * in case it had to block at any point.
- */
- return get_new_inode(sb, head, test, set, data);
-}
-EXPORT_SYMBOL(iget5_locked);
-
-/**
- * iget_locked - obtain an inode from a mounted file system
- * @sb: super block of file system
- * @ino: inode number to get
- *
- * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
- * the inode cache and if present it is returned with an increased reference
- * count. This is for file systems where the inode number is sufficient for
- * unique identification of an inode.
- *
- * If the inode is not in cache, get_new_inode_fast() is called to allocate a
- * new inode and this is returned locked, hashed, and with the I_NEW flag set.
- * The file system gets to fill it in before unlocking it via
- * unlock_new_inode().
- */
-struct inode *iget_locked(struct super_block *sb, unsigned long ino)
-{
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
- struct inode *inode;
+ spin_lock(&inode_hash_lock);
+ inode = find_inode_fast(sb, head, ino);
+ spin_unlock(&inode_hash_lock);
- inode = ifind_fast(sb, head, ino);
if (inode)
- return inode;
- /*
- * get_new_inode_fast() will do the right thing, re-trying the search
- * in case it had to block at any point.
- */
- return get_new_inode_fast(sb, head, ino);
+ wait_on_inode(inode);
+ return inode;
}
-EXPORT_SYMBOL(iget_locked);
+EXPORT_SYMBOL(ilookup);
int insert_inode_locked(struct inode *inode)
{
On Wed, Mar 23, 2011 at 08:53:19AM +0100, Sedat Dilek wrote:
> On Wed, Mar 23, 2011 at 7:29 AM, Dave Chinner <[email protected]> wrote:
> > On Tue, Mar 22, 2011 at 07:17:04PM +0100, Sedat Dilek wrote:
> >> On Tue, Mar 22, 2011 at 12:23 PM, Dave Chinner <[email protected]> wrote:
> >> > Hi Al,
> >> >
> >> > The following patches are the inode_lock breakup series originally
> >> > derived from Nick Piggin's vfs-scale tree. I've kind of been sitting
> >> > on them until the dcache_lock breakup and rcu path-walk has had some
> >> > time to be shaken out. The patch set is pretty much unchanged from
> >> > the last round of review last year - all I've done to bring it
> >> > up to date is forward port it and run it through some testing on XFS
> >> > and ext4.
> >> >
> >> > I know it's late in the .39 merge window, but I hope you'll consider
> >> > it if the patches are still acceptable(*). Otherwise I'm happy to take
> >> > the time to get it right for .40.
> >> >
> >> > Cheers,
> >> >
> >> > Dave.
> >> >
> >> > (*) The series can also be found here:
> >> >
> >> > git://git.kernel.org/pub/scm/linux/kernel/git/dgc/xfsdev.git inode-scale
> >> >
> >> > Dave Chinner (8):
> >> > fs: protect inode->i_state with inode->i_lock
> >> > fs: factor inode disposal
> >> > fs: Lock the inode LRU list separately
> >> > fs: remove inode_lock from iput_final and prune_icache
> >> > fs: move i_sb_list out from under inode_lock
> >> > fs: move i_wb_list out from under inode_lock
> >> > fs: rename inode_lock to inode_hash_lock
> >> > fs: pull inode->i_lock up out of writeback_single_inode
> >> >
> >> [...]
> >>
> >> Hi,
> >>
> >> I have tested this patch-series on top of linux-next (next-20110322)
> >> by running xfstests-dev (built from git).
> >>
> >> My sdb2 partition (on an external 1GBytes USB-2.0 hdd) was formatted
> >> and mounted as ext4-fs .
> >
> > If you really want to use xfstests to produce some system stress,
> > you'd do better to use an XFS filesystem ;)
> >
>
> First, I was riddling why I could not install xfstests-dev to an
> individual $PREFIX and waste(?) some time into looking into the
> configure* files - dbtest.c failed building by not setting correct
> $libgdbm (normal build via make is OK).
> Then I saw the hardcoded /var/lib/ as INSTALL_PATH... and didn't
> wanted dig deeper.
> I overflew the README, but that file needs a bit of more hints.
xfstests has never really been packaged as a standalone package, so
the build/configure system is a bit grotesque in places. It kind of
assumes that you've already installed everything you need manually
on your test box.
If someone wants to fix that all up so it can be packaged as .rpm
and .deb packages with all the correct dependencies, then patches
are welcome...
> I can do some XFS testing if you wish.
No big deal, I was just pointing out that ext4 coverage of the test
suite is limited compared to the XFS coverage...
> The numbers in the result say nothing to me as I can't compare them
> with other systems.
> It's like putting the hand in a glass of water and wild speculating
> the temperature.
> You can say cold or warm if you have a 2nd glass with different temperature.
>
> So, it might be worth to add a file (name: benchmark.txt?) with some results?
I don't see any real point - the output is pass/fail for each test
with an indication of the failure. If you want to dig deeper into
the failures, it points at where to look.
> >> The check-log is attached (not sure how to interpret the errors and failures).
> >
> > Nothing indicates an unknown failure...
> >
> >> 001 5s ... 4s
> >> 002 1s ... 1s
> >> 003 [not run] not suitable for this filesystem type: ext4
> >> 004 [not run] not suitable for this filesystem type: ext4
> >> 005 - output mismatch (see 005.out.bad)
> >> --- 005.out 2011-03-22 17:47:03.861226933 +0100
> >> +++ 005.out.bad 2011-03-22 18:47:58.847277538 +0100
> >> @@ -1,7 +1,7 @@
> >> QA output created by 005
> >> *** touch deep symlinks
> >>
> >> -ELOOP returned. Good.
> >> +No ELOOP? Unexpected!
> >>
> >> *** touch recusive symlinks
> >
> > This is a result of Al fixing the max nested loop depth very early
> > on in .39, so the test needs to run to deeper nesting depths to
> > produce ELOOP. So it's a test problem, not a bug.
> >
> >> 197 [not run] not suitable for this filesystem type: ext4
> >> 198 [failed, exit status 127] - output mismatch (see 198.out.bad)
> >> --- 198.out 2011-03-22 17:47:03.917226229 +0100
> >> +++ 198.out.bad 2011-03-22 19:04:12.591035920 +0100
> >> @@ -1,2 +1,3 @@
> >> QA output created by 198
> >> Silence is golden.
> >> +./198: line 54: /home/sd/src/xfstests-dev/xfstests-dev/src/aio-dio-regress/aiodio_sparse2: No such file or directory
> >
> > You need to install libaio and friends so that the binary is
> > built. We probably need to add a "requires_aio" test option to
> > detect this situation and not_run the test gracefully.
> >
>
> Yupp, the build-/configure-system could be enhanced in that case.
> (I add "Notes to myself" see my P.S. to remember what I did and what I missed.)
Well, it's not a configure system issue - it's that the test itself
doesn't correctly detect what was built.....
> BTW, are changes/updates to xfstests-dev announced somewhere (like ML)?
The XFS mailing list ([email protected]) is used for development and
release announcements. And there's usually information in
Christoph's monthly XFS summary that he posts to -fsdevel and LKML.
Cheers,
Dave.
--
Dave Chinner
[email protected]