2009-09-04 06:59:10

by Nick Piggin

Subject: [patch 11/33] fs: dcache scale subdirs

Protect d_subdirs and d_child with d_lock, except in filesystems that aren't
using dcache_lock for these anyway (eg. using i_mutex).

XXX: probably don't need parent lock in inotify (because child lock
should stabilize parent). Also, possibly some filesystems don't need so
much locking (eg. of child dentry when modifying d_child, so long as
parent is locked)... but be on the safe side. Hmm, maybe we should just
say d_child list is protected by d_parent->d_lock. d_parent could remain
protected with d_lock.
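
Condensed from the d_alloc() hunk below, the convention this moves toward
looks like this when attaching a child (sketch only; dcache_lock is still
taken around it at this point in the series):

	spin_lock(&parent->d_lock);
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	dentry->d_parent = dget_dlock(parent);
	list_add(&dentry->d_u.d_child, &parent->d_subdirs);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&parent->d_lock);

i.e. the parent's d_lock first, the child's d_lock nested one lockdep
subclass deeper.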

---
drivers/usb/core/inode.c | 6 +
fs/autofs4/expire.c | 81 ++++++++++++++-------
fs/autofs4/inode.c | 5 +
fs/autofs4/root.c | 9 ++
fs/coda/cache.c | 2
fs/dcache.c | 159 ++++++++++++++++++++++++++++++++++---------
fs/libfs.c | 40 ++++++----
fs/ncpfs/dir.c | 3
fs/ncpfs/ncplib_kernel.h | 4 +
fs/notify/fsnotify.c | 4 -
fs/notify/inotify/inotify.c | 4 -
fs/smbfs/cache.c | 4 +
include/linux/dcache.h | 1
kernel/cgroup.c | 19 ++++-
net/sunrpc/rpc_pipe.c | 2
security/selinux/selinuxfs.c | 12 ++-
16 files changed, 274 insertions(+), 81 deletions(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c
+++ linux-2.6/fs/dcache.c
@@ -44,6 +44,8 @@
* - d_flags
* - d_name
* - d_lru
+ * - d_unhashed
+ * - d_subdirs and children's d_child
*
* Ordering:
* dcache_lock
@@ -205,7 +207,8 @@ static void dentry_lru_del_init(struct d
*
* If this is the root of the dentry tree, return NULL.
*
- * dcache_lock and d_lock must be held by caller, are dropped by d_kill.
+ * dcache_lock and d_lock and d_parent->d_lock must be held by caller, and
+ * are dropped by d_kill.
*/
static struct dentry *d_kill(struct dentry *dentry)
__releases(dentry->d_lock)
@@ -214,12 +217,14 @@ static struct dentry *d_kill(struct dent
struct dentry *parent;

list_del(&dentry->d_u.d_child);
- /*drops the locks, at that point nobody can reach this dentry */
- dentry_iput(dentry);
+ if (dentry->d_parent && dentry != dentry->d_parent)
+ spin_unlock(&dentry->d_parent->d_lock);
if (IS_ROOT(dentry))
parent = NULL;
else
parent = dentry->d_parent;
+ /*drops the locks, at that point nobody can reach this dentry */
+ dentry_iput(dentry);
d_free(dentry);
return parent;
}
@@ -255,6 +260,7 @@ static struct dentry *d_kill(struct dent

void dput(struct dentry *dentry)
{
+ struct dentry *parent = NULL;
if (!dentry)
return;

@@ -273,6 +279,15 @@ repeat:
spin_unlock(&dentry->d_lock);
goto repeat;
}
+ parent = dentry->d_parent;
+ if (parent) {
+ BUG_ON(parent == dentry);
+ if (!spin_trylock(&parent->d_lock)) {
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dcache_lock);
+ goto repeat;
+ }
+ }
}
dentry->d_count--;
if (dentry->d_count) {
@@ -296,6 +311,8 @@ repeat:
dentry_lru_add(dentry);
}
spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
return;

@@ -521,10 +538,22 @@ static void prune_one_dentry(struct dent
* because dcache_lock needs to be taken anyway.
*/
while (dentry) {
+ struct dentry *parent = NULL;
+
spin_lock(&dcache_lock);
+again:
spin_lock(&dentry->d_lock);
+ if (dentry->d_parent && dentry != dentry->d_parent) {
+ if (!spin_trylock(&dentry->d_parent->d_lock)) {
+ spin_unlock(&dentry->d_lock);
+ goto again;
+ }
+ parent = dentry->d_parent;
+ }
dentry->d_count--;
if (dentry->d_count) {
+ if (parent)
+ spin_unlock(&parent->d_lock);
spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
return;
@@ -602,20 +631,28 @@ again:
dentry = list_entry(tmp.prev, struct dentry, d_lru);

if (!spin_trylock(&dentry->d_lock)) {
+again1:
spin_unlock(&dcache_lru_lock);
goto again;
}
- __dentry_lru_del_init(dentry);
/*
* We found an inuse dentry which was not removed from
* the LRU because of laziness during lookup. Do not free
* it - just keep it off the LRU list.
*/
if (dentry->d_count) {
+ __dentry_lru_del_init(dentry);
spin_unlock(&dentry->d_lock);
continue;
}
-
+ if (dentry->d_parent) {
+ BUG_ON(dentry == dentry->d_parent);
+ if (!spin_trylock(&dentry->d_parent->d_lock)) {
+ spin_unlock(&dentry->d_lock);
+ goto again1;
+ }
+ }
+ __dentry_lru_del_init(dentry);
spin_unlock(&dcache_lru_lock);
prune_one_dentry(dentry);
/* dcache_lock and dentry->d_lock dropped */
@@ -752,14 +789,15 @@ static void shrink_dcache_for_umount_sub
/* this is a branch with children - detach all of them
* from the system in one go */
spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
list_for_each_entry(loop, &dentry->d_subdirs,
d_u.d_child) {
- spin_lock(&loop->d_lock);
+ spin_lock_nested(&loop->d_lock, DENTRY_D_LOCK_NESTED);
dentry_lru_del_init(loop);
__d_drop(loop);
spin_unlock(&loop->d_lock);
- cond_resched_lock(&dcache_lock);
}
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);

/* move to the first child */
@@ -787,16 +825,17 @@ static void shrink_dcache_for_umount_sub
BUG();
}

- if (IS_ROOT(dentry))
+ if (IS_ROOT(dentry)) {
parent = NULL;
- else {
+ list_del(&dentry->d_u.d_child);
+ } else {
parent = dentry->d_parent;
spin_lock(&parent->d_lock);
parent->d_count--;
+ list_del(&dentry->d_u.d_child);
spin_unlock(&parent->d_lock);
}

- list_del(&dentry->d_u.d_child);
detached++;

inode = dentry->d_inode;
@@ -881,6 +920,7 @@ int have_submounts(struct dentry *parent
spin_lock(&dcache_lock);
if (d_mountpoint(parent))
goto positive;
+ spin_lock(&this_parent->d_lock);
repeat:
next = this_parent->d_subdirs.next;
resume:
@@ -888,22 +928,34 @@ resume:
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next;
+
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
/* Have we found a mount point ? */
- if (d_mountpoint(dentry))
+ if (d_mountpoint(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&this_parent->d_lock);
goto positive;
+ }
if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&this_parent->d_lock);
+ spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
this_parent = dentry;
+ spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
}
+ spin_unlock(&dentry->d_lock);
}
/*
* All done at this level ... ascend and resume the search.
*/
if (this_parent != parent) {
next = this_parent->d_u.d_child.next;
+ spin_unlock(&this_parent->d_lock);
this_parent = this_parent->d_parent;
+ spin_lock(&this_parent->d_lock);
goto resume;
}
+ spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
return 0; /* No mount points found in tree */
positive:
@@ -932,6 +984,7 @@ static int select_parent(struct dentry *
int found = 0;

spin_lock(&dcache_lock);
+ spin_lock(&this_parent->d_lock);
repeat:
next = this_parent->d_subdirs.next;
resume:
@@ -939,8 +992,9 @@ resume:
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next;
+ BUG_ON(this_parent == dentry);

- spin_lock(&dentry->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
dentry_lru_del_init(dentry);
/*
* move only zero ref count dentries to the end
@@ -950,33 +1004,45 @@ resume:
dentry_lru_add_tail(dentry);
found++;
}
- spin_unlock(&dentry->d_lock);

/*
* We can return to the caller if we have found some (this
* ensures forward progress). We'll be coming back to find
* the rest.
*/
- if (found && need_resched())
+ if (found && need_resched()) {
+ spin_unlock(&dentry->d_lock);
goto out;
+ }

/*
* Descend a level if the d_subdirs list is non-empty.
*/
if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&this_parent->d_lock);
+ spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
this_parent = dentry;
+ spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
}
+
+ spin_unlock(&dentry->d_lock);
}
/*
* All done at this level ... ascend and resume the search.
*/
if (this_parent != parent) {
+ struct dentry *tmp;
next = this_parent->d_u.d_child.next;
- this_parent = this_parent->d_parent;
+ tmp = this_parent->d_parent;
+ spin_unlock(&this_parent->d_lock);
+ BUG_ON(tmp == this_parent);
+ this_parent = tmp;
+ spin_lock(&this_parent->d_lock);
goto resume;
}
out:
+ spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
return found;
}
@@ -1072,19 +1138,20 @@ struct dentry *d_alloc(struct dentry * p
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
INIT_LIST_HEAD(&dentry->d_alias);
-
- if (parent) {
- dentry->d_parent = dget(parent);
- dentry->d_sb = parent->d_sb;
- } else {
- INIT_LIST_HEAD(&dentry->d_u.d_child);
- }
+ INIT_LIST_HEAD(&dentry->d_u.d_child);

if (parent) {
spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ dentry->d_parent = dget_dlock(parent);
+ dentry->d_sb = parent->d_sb;
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
}
+
atomic_inc(&dentry_stat.nr_dentry);

return dentry;
@@ -1763,15 +1830,27 @@ static void d_move_locked(struct dentry
printk(KERN_WARNING "VFS: moving negative dcache entry\n");

write_seqlock(&rename_lock);
- /*
- * XXXX: do we really need to take target->d_lock?
- */
+
+ if (target->d_parent != dentry->d_parent) {
+ if (target->d_parent < dentry->d_parent) {
+ spin_lock(&target->d_parent->d_lock);
+ spin_lock_nested(&dentry->d_parent->d_lock,
+ DENTRY_D_LOCK_NESTED);
+ } else {
+ spin_lock(&dentry->d_parent->d_lock);
+ spin_lock_nested(&target->d_parent->d_lock,
+ DENTRY_D_LOCK_NESTED);
+ }
+ } else {
+ spin_lock(&target->d_parent->d_lock);
+ }
+
if (target < dentry) {
- spin_lock(&target->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ spin_lock_nested(&target->d_lock, 2);
+ spin_lock_nested(&dentry->d_lock, 3);
} else {
- spin_lock(&dentry->d_lock);
- spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
+ spin_lock_nested(&dentry->d_lock, 2);
+ spin_lock_nested(&target->d_lock, 3);
}

/* Move the dentry to the target hash queue, if on different bucket */
@@ -1804,7 +1883,10 @@ static void d_move_locked(struct dentry
}

list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+ if (target->d_parent != dentry->d_parent)
+ spin_unlock(&dentry->d_parent->d_lock);
spin_unlock(&target->d_lock);
+ spin_unlock(&target->d_parent->d_lock);
fsnotify_d_move(dentry);
spin_unlock(&dentry->d_lock);
write_sequnlock(&rename_lock);
@@ -1903,6 +1985,12 @@ static void __d_materialise_dentry(struc
dparent = dentry->d_parent;
aparent = anon->d_parent;

+ /* XXX: hack */
+ spin_lock(&aparent->d_lock);
+ spin_lock(&dparent->d_lock);
+ spin_lock(&dentry->d_lock);
+ spin_lock(&anon->d_lock);
+
dentry->d_parent = (aparent == anon) ? dentry : aparent;
list_del(&dentry->d_u.d_child);
if (!IS_ROOT(dentry))
@@ -1917,6 +2005,11 @@ static void __d_materialise_dentry(struc
else
INIT_LIST_HEAD(&anon->d_u.d_child);

+ spin_unlock(&anon->d_lock);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dparent->d_lock);
+ spin_unlock(&aparent->d_lock);
+
anon->d_flags &= ~DCACHE_DISCONNECTED;
}

@@ -2315,6 +2408,7 @@ void d_genocide(struct dentry *root)
struct list_head *next;

spin_lock(&dcache_lock);
+ spin_lock(&this_parent->d_lock);
repeat:
next = this_parent->d_subdirs.next;
resume:
@@ -2328,8 +2422,10 @@ resume:
continue;
}
if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
+ spin_unlock(&this_parent->d_lock);
+ spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
this_parent = dentry;
+ spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
}
dentry->d_count--;
@@ -2337,12 +2433,13 @@ resume:
}
if (this_parent != root) {
next = this_parent->d_u.d_child.next;
- spin_lock(&this_parent->d_lock);
this_parent->d_count--;
spin_unlock(&this_parent->d_lock);
this_parent = this_parent->d_parent;
+ spin_lock(&this_parent->d_lock);
goto resume;
}
+ spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
}

Index: linux-2.6/fs/libfs.c
===================================================================
--- linux-2.6.orig/fs/libfs.c
+++ linux-2.6/fs/libfs.c
@@ -84,7 +84,8 @@ int dcache_dir_close(struct inode *inode

loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
{
- mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
+ struct dentry *dentry = file->f_path.dentry;
+ mutex_lock(&dentry->d_inode->i_mutex);
switch (origin) {
case 1:
offset += file->f_pos;
@@ -92,7 +93,7 @@ loff_t dcache_dir_lseek(struct file *fil
if (offset >= 0)
break;
default:
- mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&dentry->d_inode->i_mutex);
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -102,23 +103,27 @@ loff_t dcache_dir_lseek(struct file *fil
struct dentry *cursor = file->private_data;
loff_t n = file->f_pos - 2;

- spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
+ spin_lock_nested(&cursor->d_lock, DENTRY_D_LOCK_NESTED);
list_del(&cursor->d_u.d_child);
- p = file->f_path.dentry->d_subdirs.next;
- while (n && p != &file->f_path.dentry->d_subdirs) {
+ spin_unlock(&cursor->d_lock);
+ p = dentry->d_subdirs.next;
+ while (n && p != &dentry->d_subdirs) {
struct dentry *next;
next = list_entry(p, struct dentry, d_u.d_child);
- spin_lock(&next->d_lock);
+ spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(next))
n--;
spin_unlock(&next->d_lock);
p = p->next;
}
+ spin_lock_nested(&cursor->d_lock, DENTRY_D_LOCK_NESTED);
list_add_tail(&cursor->d_u.d_child, p);
- spin_unlock(&dcache_lock);
+ spin_unlock(&cursor->d_lock);
+ spin_unlock(&dentry->d_lock);
}
}
- mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&dentry->d_inode->i_mutex);
return offset;
}

@@ -158,9 +163,12 @@ int dcache_readdir(struct file * filp, v
i++;
/* fallthrough */
default:
- spin_lock(&dcache_lock);
- if (filp->f_pos == 2)
+ spin_lock(&dentry->d_lock);
+ if (filp->f_pos == 2) {
+ spin_lock_nested(&cursor->d_lock, DENTRY_D_LOCK_NESTED);
list_move(q, &dentry->d_subdirs);
+ spin_unlock(&cursor->d_lock);
+ }

for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
struct dentry *next;
@@ -172,19 +180,21 @@ int dcache_readdir(struct file * filp, v
}

spin_unlock(&next->d_lock);
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
if (filldir(dirent, next->d_name.name,
next->d_name.len, filp->f_pos,
next->d_inode->i_ino,
dt_type(next->d_inode)) < 0)
return 0;
- spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
+ spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
/* next is still alive */
list_move(q, p);
+ spin_unlock(&next->d_lock);
p = q;
filp->f_pos++;
}
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
}
return 0;
}
@@ -281,7 +291,7 @@ int simple_empty(struct dentry *dentry)
struct dentry *child;
int ret = 0;

- spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(child)) {
@@ -292,7 +302,7 @@ int simple_empty(struct dentry *dentry)
}
ret = 1;
out:
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
return ret;
}

Index: linux-2.6/fs/notify/inotify/inotify.c
===================================================================
--- linux-2.6.orig/fs/notify/inotify/inotify.c
+++ linux-2.6/fs/notify/inotify/inotify.c
@@ -189,17 +189,19 @@ static void set_dentry_child_flags(struc
list_for_each_entry(alias, &inode->i_dentry, d_alias) {
struct dentry *child;

+ spin_lock(&alias->d_lock);
list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
if (!child->d_inode)
continue;

- spin_lock(&child->d_lock);
+ spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (watched)
child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
else
child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
spin_unlock(&child->d_lock);
}
+ spin_unlock(&alias->d_lock);
}
spin_unlock(&dcache_lock);
}
Index: linux-2.6/include/linux/dcache.h
===================================================================
--- linux-2.6.orig/include/linux/dcache.h
+++ linux-2.6/include/linux/dcache.h
@@ -340,6 +340,7 @@ static inline struct dentry *dget_dlock(
}
return dentry;
}
+
static inline struct dentry *dget(struct dentry *dentry)
{
if (dentry) {
Index: linux-2.6/drivers/usb/core/inode.c
===================================================================
--- linux-2.6.orig/drivers/usb/core/inode.c
+++ linux-2.6/drivers/usb/core/inode.c
@@ -349,16 +349,18 @@ static int usbfs_empty (struct dentry *d
struct list_head *list;

spin_lock(&dcache_lock);
-
+ spin_lock(&dentry->d_lock);
list_for_each(list, &dentry->d_subdirs) {
struct dentry *de = list_entry(list, struct dentry, d_u.d_child);
if (usbfs_positive(de)) {
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
return 0;
}
}
-
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
+
return 1;
}

Index: linux-2.6/fs/autofs4/expire.c
===================================================================
--- linux-2.6.orig/fs/autofs4/expire.c
+++ linux-2.6/fs/autofs4/expire.c
@@ -93,22 +93,63 @@ done:
/*
* Calculate next entry in top down tree traversal.
* From next_mnt in namespace.c - elegant.
+ *
+ * How is this supposed to work if we drop dcache_lock between calls anyway?
+ * How does it cope with renames?
+ * And also callers dput the returned dentry before taking dcache_lock again
+ * so what prevents it from being freed??
*/
-static struct dentry *next_dentry(struct dentry *p, struct dentry *root)
+static struct dentry *get_next_positive_dentry(struct dentry *p,
+ struct dentry *root)
{
- struct list_head *next = p->d_subdirs.next;
+ struct list_head *next;
+ struct dentry *ret;

+ spin_lock(&dcache_lock);
+ spin_lock(&p->d_lock);
+again:
+ next = p->d_subdirs.next;
if (next == &p->d_subdirs) {
while (1) {
- if (p == root)
+ struct dentry *parent;
+
+ if (p == root) {
+ spin_unlock(&p->d_lock);
return NULL;
+ }
+
+ parent = p->d_parent;
+ if (!spin_trylock(&parent->d_lock)) {
+ dget_dlock(p);
+ spin_unlock(&p->d_lock);
+ parent = dget_parent(p);
+ spin_unlock(&dcache_lock);
+ dput(p);
+ spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
+ } else
+ spin_unlock(&p->d_lock);
next = p->d_u.d_child.next;
- if (next != &p->d_parent->d_subdirs)
+ p = parent;
+ if (next != &parent->d_subdirs)
break;
- p = p->d_parent;
}
}
- return list_entry(next, struct dentry, d_u.d_child);
+ ret = list_entry(next, struct dentry, d_u.d_child);
+
+ spin_lock(&ret->d_lock);
+ /* Negative dentry - give up */
+ if (!simple_positive(ret)) {
+ spin_unlock(&ret->d_lock);
+ p = ret;
+ goto again;
+ }
+ dget_dlock(ret);
+ spin_unlock(&ret->d_lock);
+
+ spin_unlock(&dcache_lock);
+
+ return ret;
}

/*
@@ -158,18 +199,11 @@ static int autofs4_tree_busy(struct vfsm
if (!simple_positive(top))
return 1;

- spin_lock(&dcache_lock);
- for (p = top; p; p = next_dentry(p, top)) {
- /* Negative dentry - give up */
- if (!simple_positive(p))
- continue;
+ for (p = top; p; p = get_next_positive_dentry(p, top)) {

DPRINTK("dentry %p %.*s",
p, (int) p->d_name.len, p->d_name.name);

- p = dget(p);
- spin_unlock(&dcache_lock);
-
/*
* Is someone visiting anywhere in the subtree ?
* If there's no mount we need to check the usage
@@ -205,9 +239,7 @@ static int autofs4_tree_busy(struct vfsm
}
}
dput(p);
- spin_lock(&dcache_lock);
}
- spin_unlock(&dcache_lock);

/* Timeout of a tree mount is ultimately determined by its top dentry */
if (!autofs4_can_expire(top, timeout, do_now))
@@ -226,18 +258,11 @@ static struct dentry *autofs4_check_leav
DPRINTK("parent %p %.*s",
parent, (int)parent->d_name.len, parent->d_name.name);

- spin_lock(&dcache_lock);
- for (p = parent; p; p = next_dentry(p, parent)) {
- /* Negative dentry - give up */
- if (!simple_positive(p))
- continue;
+ for (p = parent; p; p = get_next_positive_dentry(p, parent)) {

DPRINTK("dentry %p %.*s",
p, (int) p->d_name.len, p->d_name.name);

- p = dget(p);
- spin_unlock(&dcache_lock);
-
if (d_mountpoint(p)) {
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, p))
@@ -249,9 +274,7 @@ static struct dentry *autofs4_check_leav
}
cont:
dput(p);
- spin_lock(&dcache_lock);
}
- spin_unlock(&dcache_lock);
return NULL;
}

@@ -316,6 +339,7 @@ struct dentry *autofs4_expire_indirect(s
timeout = sbi->exp_timeout;

spin_lock(&dcache_lock);
+ spin_lock(&root->d_lock);
next = root->d_subdirs.next;

/* On exit from the loop expire is set to a dgot dentry
@@ -330,6 +354,7 @@ struct dentry *autofs4_expire_indirect(s
}

dentry = dget(dentry);
+ spin_unlock(&root->d_lock);
spin_unlock(&dcache_lock);

spin_lock(&sbi->fs_lock);
@@ -396,8 +421,10 @@ next:
spin_unlock(&sbi->fs_lock);
dput(dentry);
spin_lock(&dcache_lock);
+ spin_lock(&root->d_lock);
next = next->next;
}
+ spin_unlock(&root->d_lock);
spin_unlock(&dcache_lock);
return NULL;

@@ -409,7 +436,9 @@ found:
init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
spin_lock(&dcache_lock);
+ spin_lock(&expired->d_parent->d_lock);
list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
+ spin_unlock(&expired->d_parent->d_lock);
spin_unlock(&dcache_lock);
return expired;
}
Index: linux-2.6/fs/autofs4/inode.c
===================================================================
--- linux-2.6.orig/fs/autofs4/inode.c
+++ linux-2.6/fs/autofs4/inode.c
@@ -111,6 +111,7 @@ static void autofs4_force_release(struct

spin_lock(&dcache_lock);
repeat:
+ spin_lock(&this_parent->d_lock);
next = this_parent->d_subdirs.next;
resume:
while (next != &this_parent->d_subdirs) {
@@ -128,6 +129,7 @@ resume:
}

next = next->next;
+ spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);

DPRINTK("dentry %p %.*s",
@@ -141,15 +143,18 @@ resume:
struct dentry *dentry = this_parent;

next = this_parent->d_u.d_child.next;
+ spin_unlock(&this_parent->d_lock);
this_parent = this_parent->d_parent;
spin_unlock(&dcache_lock);
DPRINTK("parent dentry %p %.*s",
dentry, (int)dentry->d_name.len, dentry->d_name.name);
dput(dentry);
spin_lock(&dcache_lock);
+ spin_lock(&this_parent->d_lock);
goto resume;
}
spin_unlock(&dcache_lock);
+ spin_unlock(&this_parent->d_lock);
}

void autofs4_kill_sb(struct super_block *sb)
Index: linux-2.6/fs/autofs4/root.c
===================================================================
--- linux-2.6.orig/fs/autofs4/root.c
+++ linux-2.6/fs/autofs4/root.c
@@ -93,10 +93,13 @@ static int autofs4_dir_open(struct inode
* it.
*/
spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
return -ENOENT;
}
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);

out:
@@ -212,8 +215,10 @@ static void *autofs4_follow_link(struct
* mount it again.
*/
spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
(!d_mountpoint(dentry) && __simple_empty(dentry))) {
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);

status = try_to_fill_dentry(dentry, 0);
@@ -222,6 +227,7 @@ static void *autofs4_follow_link(struct

goto follow;
}
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
follow:
/*
@@ -730,7 +736,9 @@ static int autofs4_dir_rmdir(struct inod
return -EACCES;

spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
return -ENOTEMPTY;
}
@@ -738,7 +746,6 @@ static int autofs4_dir_rmdir(struct inod
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
spin_unlock(&sbi->lookup_lock);
- spin_lock(&dentry->d_lock);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
Index: linux-2.6/fs/coda/cache.c
===================================================================
--- linux-2.6.orig/fs/coda/cache.c
+++ linux-2.6/fs/coda/cache.c
@@ -87,6 +87,7 @@ static void coda_flag_children(struct de
struct dentry *de;

spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
list_for_each(child, &parent->d_subdirs)
{
de = list_entry(child, struct dentry, d_u.d_child);
@@ -95,6 +96,7 @@ static void coda_flag_children(struct de
continue;
coda_flag_inode(de->d_inode, flag);
}
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
return;
}
Index: linux-2.6/fs/ncpfs/dir.c
===================================================================
--- linux-2.6.orig/fs/ncpfs/dir.c
+++ linux-2.6/fs/ncpfs/dir.c
@@ -365,6 +365,7 @@ ncp_dget_fpos(struct dentry *dentry, str

/* If a pointer is invalid, we search the dentry. */
spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
next = parent->d_subdirs.next;
while (next != &parent->d_subdirs) {
dent = list_entry(next, struct dentry, d_u.d_child);
@@ -373,11 +374,13 @@ ncp_dget_fpos(struct dentry *dentry, str
dget_locked(dent);
else
dent = NULL;
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
goto out;
}
next = next->next;
}
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
return NULL;

Index: linux-2.6/fs/ncpfs/ncplib_kernel.h
===================================================================
--- linux-2.6.orig/fs/ncpfs/ncplib_kernel.h
+++ linux-2.6/fs/ncpfs/ncplib_kernel.h
@@ -193,6 +193,7 @@ ncp_renew_dentries(struct dentry *parent
struct dentry *dentry;

spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
next = parent->d_subdirs.next;
while (next != &parent->d_subdirs) {
dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -204,6 +205,7 @@ ncp_renew_dentries(struct dentry *parent

next = next->next;
}
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
}

@@ -215,6 +217,7 @@ ncp_invalidate_dircache_entries(struct d
struct dentry *dentry;

spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
next = parent->d_subdirs.next;
while (next != &parent->d_subdirs) {
dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -222,6 +225,7 @@ ncp_invalidate_dircache_entries(struct d
ncp_age_dentry(server, dentry);
next = next->next;
}
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
}

Index: linux-2.6/fs/smbfs/cache.c
===================================================================
--- linux-2.6.orig/fs/smbfs/cache.c
+++ linux-2.6/fs/smbfs/cache.c
@@ -63,6 +63,7 @@ smb_invalidate_dircache_entries(struct d
struct dentry *dentry;

spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
next = parent->d_subdirs.next;
while (next != &parent->d_subdirs) {
dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -70,6 +71,7 @@ smb_invalidate_dircache_entries(struct d
smb_age_dentry(server, dentry);
next = next->next;
}
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
}

@@ -97,6 +99,7 @@ smb_dget_fpos(struct dentry *dentry, str

/* If a pointer is invalid, we search the dentry. */
spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
next = parent->d_subdirs.next;
while (next != &parent->d_subdirs) {
dent = list_entry(next, struct dentry, d_u.d_child);
@@ -111,6 +114,7 @@ smb_dget_fpos(struct dentry *dentry, str
}
dent = NULL;
out_unlock:
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
return dent;
}
Index: linux-2.6/kernel/cgroup.c
===================================================================
--- linux-2.6.orig/kernel/cgroup.c
+++ linux-2.6/kernel/cgroup.c
@@ -696,23 +696,31 @@ static void cgroup_clear_directory(struc

BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
node = dentry->d_subdirs.next;
while (node != &dentry->d_subdirs) {
struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(node);
if (d->d_inode) {
/* This should never be called on a cgroup
* directory with child cgroups */
BUG_ON(d->d_inode->i_mode & S_IFDIR);
- d = dget_locked(d);
+ dget_locked_dlock(d);
+ spin_unlock(&d->d_lock);
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
d_delete(d);
simple_unlink(dentry->d_inode, d);
dput(d);
spin_lock(&dcache_lock);
- }
+ spin_lock(&dentry->d_lock);
+ } else
+ spin_unlock(&d->d_lock);
node = dentry->d_subdirs.next;
}
+ spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
}

@@ -721,10 +729,17 @@ static void cgroup_clear_directory(struc
*/
static void cgroup_d_remove_dir(struct dentry *dentry)
{
+ struct dentry *parent;
+
cgroup_clear_directory(dentry);

spin_lock(&dcache_lock);
+ parent = dentry->d_parent;
+ spin_lock(&parent->d_lock);
+ spin_lock(&dentry->d_lock);
list_del_init(&dentry->d_u.d_child);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
remove_dir(dentry);
}
Index: linux-2.6/net/sunrpc/rpc_pipe.c
===================================================================
--- linux-2.6.orig/net/sunrpc/rpc_pipe.c
+++ linux-2.6/net/sunrpc/rpc_pipe.c
@@ -548,6 +548,7 @@ static void rpc_depopulate(struct dentry
mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD);
repeat:
spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
list_for_each_safe(pos, next, &parent->d_subdirs) {
dentry = list_entry(pos, struct dentry, d_u.d_child);
if (!dentry->d_inode ||
@@ -565,6 +566,7 @@ repeat:
} else
spin_unlock(&dentry->d_lock);
}
+ spin_unlock(&parent->d_lock);
spin_unlock(&dcache_lock);
if (n) {
do {
Index: linux-2.6/security/selinux/selinuxfs.c
===================================================================
--- linux-2.6.orig/security/selinux/selinuxfs.c
+++ linux-2.6/security/selinux/selinuxfs.c
@@ -944,22 +944,30 @@ static void sel_remove_entries(struct de
struct list_head *node;

spin_lock(&dcache_lock);
+ spin_lock(&de->d_lock);
node = de->d_subdirs.next;
while (node != &de->d_subdirs) {
struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(node);

if (d->d_inode) {
- d = dget_locked(d);
+ dget_locked_dlock(d);
+ spin_unlock(&de->d_lock);
+ spin_unlock(&d->d_lock);
spin_unlock(&dcache_lock);
d_delete(d);
simple_unlink(de->d_inode, d);
dput(d);
spin_lock(&dcache_lock);
- }
+ spin_lock(&de->d_lock);
+ } else
+ spin_unlock(&d->d_lock);
node = de->d_subdirs.next;
}

+ spin_unlock(&de->d_lock);
spin_unlock(&dcache_lock);
}

Index: linux-2.6/fs/notify/fsnotify.c
===================================================================
--- linux-2.6.orig/fs/notify/fsnotify.c
+++ linux-2.6/fs/notify/fsnotify.c
@@ -61,17 +61,19 @@ void __fsnotify_update_child_dentry_flag
/* run all of the children of the original inode and fix their
* d_flags to indicate parental interest (their parent is the
* original inode) */
+ spin_lock(&alias->d_lock);
list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
if (!child->d_inode)
continue;

- spin_lock(&child->d_lock);
+ spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (watched)
child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
else
child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
spin_unlock(&child->d_lock);
}
+ spin_unlock(&alias->d_lock);
}
spin_unlock(&dcache_lock);
}
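
A note on the recurring ordering problem above: several paths (dput(),
prune_one_dentry()) already hold a child's d_lock when they discover they
also need the parent's. Since the convention is parent-before-child, they
cannot simply take it; the pattern used throughout the patch is
trylock-and-retry, schematically:

again:
	spin_lock(&dentry->d_lock);
	parent = dentry->d_parent;
	if (parent && parent != dentry) {
		if (!spin_trylock(&parent->d_lock)) {
			/* wrong lock order: back off fully and retry */
			spin_unlock(&dentry->d_lock);
			goto again;
		}
	}
	/* ... both locks now held in the right order ... */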


2010-06-17 15:14:10

by Peter Zijlstra

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Fri, 2009-09-04 at 16:51 +1000, [email protected] wrote:
> @@ -932,6 +984,7 @@ static int select_parent(struct dentry *
> int found = 0;
>
> spin_lock(&dcache_lock);
> + spin_lock(&this_parent->d_lock);
> repeat:
> next = this_parent->d_subdirs.next;
> resume:
> @@ -939,8 +992,9 @@ resume:
> struct list_head *tmp = next;
> struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
> next = tmp->next;
> + BUG_ON(this_parent == dentry);
>
> - spin_lock(&dentry->d_lock);
> + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

Right, so this isn't going to work well, this dentry recursion is
basically unbounded afaict, so the 2nd subdir will also be locked using
DENTRY_D_LOCK_NESTED, resulting in the 1st and 2nd subdir both having
the same (sub)class and lockdep doesn't like that much.

Do we really need to keep the whole path locked? One of the comments
seems to suggest we could actually drop some locks and re-acquire.

> dentry_lru_del_init(dentry);
> /*
> * move only zero ref count dentries to the end
> @@ -950,33 +1004,45 @@ resume:
> dentry_lru_add_tail(dentry);
> found++;
> }
> - spin_unlock(&dentry->d_lock);
>
> /*
> * We can return to the caller if we have found some (this
> * ensures forward progress). We'll be coming back to find
> * the rest.
> */
> - if (found && need_resched())
> + if (found && need_resched()) {
> + spin_unlock(&dentry->d_lock);
> goto out;
> + }
>
> /*
> * Descend a level if the d_subdirs list is non-empty.
> */
> if (!list_empty(&dentry->d_subdirs)) {
> + spin_unlock(&this_parent->d_lock);
> + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
> this_parent = dentry;
> + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
> goto repeat;
> }
> +
> + spin_unlock(&dentry->d_lock);
> }

2010-06-17 16:53:36

by Nick Piggin

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Thu, Jun 17, 2010 at 05:13:35PM +0200, Peter Zijlstra wrote:
> On Fri, 2009-09-04 at 16:51 +1000, [email protected] wrote:
> > @@ -932,6 +984,7 @@ static int select_parent(struct dentry *
> > int found = 0;
> >
> > spin_lock(&dcache_lock);
> > + spin_lock(&this_parent->d_lock);
> > repeat:
> > next = this_parent->d_subdirs.next;
> > resume:
> > @@ -939,8 +992,9 @@ resume:
> > struct list_head *tmp = next;
> > struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
> > next = tmp->next;
> > + BUG_ON(this_parent == dentry);
> >
> > - spin_lock(&dentry->d_lock);
> > + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
>
> Right, so this isn't going to work well, this dentry recursion is
> basically unbounded afaict, so the 2nd subdir will also be locked using
> DENTRY_D_LOCK_NESTED, resulting in the 1st and 2nd subdir both having
> the same (sub)class and lockdep doesn't like that much.

No, it's a bit of a tricky loop, but it is not unbounded. It takes the
parent, then the child, then it may continue again with the child as
the new parent but in that case it drops the parent lock and tricks
lockdep into not barfing.
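
Condensed from the select_parent() hunk in the patch, the descend step is:

	spin_lock(&this_parent->d_lock);
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	...
	spin_unlock(&this_parent->d_lock);
	spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
	this_parent = dentry;
	spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);

so at most two d_locks are held at once, and the child's lock is re-annotated
back to subclass 0 before it becomes the new parent.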


> Do we really need to keep the whole path locked? One of the comments
> seems to suggest we could actually drop some locks and re-acquire.

As far as I can tell, RCU should be able to cover it without taking more
than 2 locks at a time. John saw some issues in the -rt tree (I haven't
reproduced yet) so he's locking the full chains there but I hope that
won't be needed.

>
> > dentry_lru_del_init(dentry);
> > /*
> > * move only zero ref count dentries to the end
> > @@ -950,33 +1004,45 @@ resume:
> > dentry_lru_add_tail(dentry);
> > found++;
> > }
> > - spin_unlock(&dentry->d_lock);
> >
> > /*
> > * We can return to the caller if we have found some (this
> > * ensures forward progress). We'll be coming back to find
> > * the rest.
> > */
> > - if (found && need_resched())
> > + if (found && need_resched()) {
> > + spin_unlock(&dentry->d_lock);
> > goto out;
> > + }
> >
> > /*
> > * Descend a level if the d_subdirs list is non-empty.
> > */
> > if (!list_empty(&dentry->d_subdirs)) {
> > + spin_unlock(&this_parent->d_lock);
> > + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
> > this_parent = dentry;
> > + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
> > goto repeat;

^^^ That's what we do when descending.

2010-06-21 13:35:57

by Peter Zijlstra

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Fri, 2010-06-18 at 02:53 +1000, Nick Piggin wrote:

> > Right, so this isn't going to work well, this dentry recursion is
> > basically unbounded afaict, so the 2nd subdir will also be locked using
> > DENTRY_D_LOCK_NESTED, resulting in the 1st and 2nd subdir both having
> > the same (sub)class and lockdep doesn't like that much.
>
> No, it's a bit of a tricky loop, but it is not unbounded. It takes the
> parent, then the child, then it may continue again with the child as
> the new parent but in that case it drops the parent lock and tricks
> lockdep into not barfing.

Ah, indeed the thing you pointed out below should work.

> > Do we really need to keep the whole path locked? One of the comments
> > seems to suggest we could actually drop some locks and re-acquire.
>
> As far as I can tell, RCU should be able to cover it without taking more
> than 2 locks at a time. John saw some issues in the -rt tree (I haven't
> reproduced yet) so he's locking the full chains there but I hope that
> won't be needed.

Right, so I was staring at the -rt splat, so it's John who created that
wreckage?

static int select_parent(struct dentry * parent)
{
struct dentry *this_parent;
struct list_head *next;
unsigned seq;
int found;

rename_retry:
found = 0;
this_parent = parent;
seq = read_seqbegin(&rename_lock);

spin_lock(&this_parent->d_lock);
repeat:
next = this_parent->d_subdirs.next;
resume:
while (next != &this_parent->d_subdirs) {
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next;

spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
dentry_lru_del_init(dentry);
/*
* move only zero ref count dentries to the end
* of the unused list for prune_dcache
*/
if (!atomic_read(&dentry->d_count)) {
dentry_lru_add_tail(dentry);
found++;
}

/*
* We can return to the caller if we have found some (this
* ensures forward progress). We'll be coming back to find
* the rest.
*/
if (found && need_resched()) {
spin_unlock(&dentry->d_lock);
goto out;
}

/*
* Descend a level if the d_subdirs list is non-empty.
* Note that we keep a hold on the parent lock while
* we descend, so we don't have to reacquire it on
* ascend.
*/
if (!list_empty(&dentry->d_subdirs)) {
this_parent = dentry;
goto repeat;
}

spin_unlock(&dentry->d_lock);
}
/*
* All done at this level ... ascend and resume the search.
*/
if (this_parent != parent) {
struct dentry *tmp;
struct dentry *child;

tmp = this_parent->d_parent;
child = this_parent;
next = child->d_u.d_child.next;
spin_unlock(&this_parent->d_lock);
this_parent = tmp;
goto resume;
}

out:
/* Make sure we unlock all the way back up the tree */
while (this_parent != parent) {
struct dentry *tmp = this_parent->d_parent;
spin_unlock(&this_parent->d_lock);
this_parent = tmp;
}
spin_unlock(&this_parent->d_lock);
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
return found;
}


> > > /*
> > > * Descend a level if the d_subdirs list is non-empty.
> > > */
> > > if (!list_empty(&dentry->d_subdirs)) {
> > > + spin_unlock(&this_parent->d_lock);
> > > + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
> > > this_parent = dentry;
> > > + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
> > > goto repeat;
>
> ^^^ That's what we do when descending.

You can write that as:
lock_set_subclass(&this_parent->d_lock.dep_map, 0, _RET_IP_);

See kernel/sched.c:double_unlock_balance().
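
i.e. (sketch), the descend step above would condense to:

	if (!list_empty(&dentry->d_subdirs)) {
		spin_unlock(&this_parent->d_lock);
		this_parent = dentry;
		/* re-annotate the held lock as subclass 0 in one call */
		lock_set_subclass(&this_parent->d_lock.dep_map, 0, _RET_IP_);
		goto repeat;
	}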


2010-06-21 14:48:15

by Nick Piggin

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Mon, Jun 21, 2010 at 03:35:22PM +0200, Peter Zijlstra wrote:
> On Fri, 2010-06-18 at 02:53 +1000, Nick Piggin wrote:
>
> > > Right, so this isn't going to work well, this dentry recursion is
> > > basically unbounded afaict, so the 2nd subdir will also be locked using
> > > DENTRY_D_LOCK_NESTED, resulting in the 1st and 2nd subdir both having
> > > the same (sub)class and lockdep doesn't like that much.
> >
> > No, it's a bit of a tricky loop, but it is not unbounded. It takes the
> > parent, then the child, then it may continue again with the child as
> > the new parent but in that case it drops the parent lock and tricks
> > lockdep into not barfing.
>
> Ah, indeed the thing you pointed out below should work.
>
> > > Do we really need to keep the whole path locked? One of the comments
> > > seems to suggest we could actually drop some locks and re-acquire.
> >
> > As far as I can tell, RCU should be able to cover it without taking more
> > than 2 locks at a time. John saw some issues in the -rt tree (I haven't
> > reproduced yet) so he's locking the full chains there but I hope that
> > won't be needed.
>
> Right, so I was staring at the -rt splat, so it's John who created that
> wreckage?

It was, but apparently they saw an RCU bug there somewhere and hit it
with the big hammer. I haven't been able to reproduce it on a non-rt
kernel yet, and I don't yet see why RCU is not good enough here.

> > > > /*
> > > > * Descend a level if the d_subdirs list is non-empty.
> > > > */
> > > > if (!list_empty(&dentry->d_subdirs)) {
> > > > + spin_unlock(&this_parent->d_lock);
> > > > + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
> > > > this_parent = dentry;
> > > > + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
> > > > goto repeat;
> >
> > ^^^ That's what we do when descending.
>
> You can write that as:
> lock_set_subclass(&this_parent->d_lock.dep_map, 0, _RET_IP_);
>
> See kernel/sched.c:double_unlock_balance().

OK I'll keep that in mind, thanks!

2010-06-21 14:55:16

by Peter Zijlstra

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Tue, 2010-06-22 at 00:48 +1000, Nick Piggin wrote:
> > Right, so I was staring at the -rt splat, so it's John who created that
> > wreckage?
>
> It was, but apparently they saw an RCU bug there somewhere and hit it
> with the big hammer. I haven't been able to reproduce it on a non-rt
> kernel yet, and I don't yet see why RCU is not good enough here.

John, could you describe the failure you spotted?

2010-06-22 06:02:47

by john stultz

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Mon, 2010-06-21 at 16:55 +0200, Peter Zijlstra wrote:
> On Tue, 2010-06-22 at 00:48 +1000, Nick Piggin wrote:
> > > Right, so I was staring at the -rt splat, so it's John who created that
> > > wreckage?
> >
> > It was, but apparently they saw an RCU bug there somewhere and hit it
> > with the big hammer. I haven't been able to reproduce it on a non-rt
> > kernel yet, and I don't yet see why RCU is not good enough here.
>
> John, could you describe the failure you spotted?

The problem was that the rcu_read_lock() on the dentry ascending wasn't
preventing dput/d_kill from removing entries from the parent node. So
the next entry we tried to follow was invalid. So we were getting odd
oopses from select_parent().

I'm not as familiar with the rcu rules there, so the patch I made just
held the locks as it went down the chain. Not ideal of course, but still
an improvement over the dcache_lock that was there prior.

Peter: I'm sorry, I've been out for a few days. Can you give me some
background on what brought this up and what -rt splat you mean?

thanks
-john

2010-06-22 06:06:44

by Nick Piggin

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Mon, Jun 21, 2010 at 11:02:37PM -0700, John Stultz wrote:
> On Mon, 2010-06-21 at 16:55 +0200, Peter Zijlstra wrote:
> > On Tue, 2010-06-22 at 00:48 +1000, Nick Piggin wrote:
> > > > Right, so I was staring at the -rt splat, so it's John who created that
> > > > wreckage?
> > >
> > > It was, but apparently they saw an RCU bug there somewhere and hit it
> > > with the big hammer. I haven't been able to reproduce it on a non-rt
> > > kernel yet, and I don't yet see why RCU is not good enough here.
> >
> > John, could you describe the failure you spotted?
>
> The problem was that the rcu_read_lock() on the dentry ascending wasn't
> preventing dput/d_kill from removing entries from the parent node. So
> the next entry we tried to follow was invalid. So we were getting odd
> oopses from select_parent().

Oh, ah OK, that makes sense. I was thinking it was a use-after-grace-period
problem. Hmm, I'll think about whether we can fix it better.
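
One shape such a fix could take (a sketch of the idea only, not code from
this series: pin the parent's memory with RCU across the lock handover,
then revalidate d_parent after locking it):

	struct dentry *new = old->d_parent;

	rcu_read_lock();	/* assumes dentry freeing is RCU-deferred */
	spin_unlock(&old->d_lock);
	spin_lock(&new->d_lock);
	if (new != old->d_parent) {
		/* raced with rename or free: drop it, caller retries */
		spin_unlock(&new->d_lock);
		new = NULL;
	}
	rcu_read_unlock();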

>
> I'm not as familiar with the rcu rules there, so the patch I made just
> held the locks as it went down the chain. Not ideal of course, but still
> an improvement over the dcache_lock that was there prior.
>
> Peter: I'm sorry, I've been out for a few days. Can you give me some
> background on what brought this up and what -rt splat you mean?
>
> thanks
> -john
>

2010-06-22 07:27:23

by Peter Zijlstra

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Mon, 2010-06-21 at 23:02 -0700, john stultz wrote:
> On Mon, 2010-06-21 at 16:55 +0200, Peter Zijlstra wrote:
> > On Tue, 2010-06-22 at 00:48 +1000, Nick Piggin wrote:
> > > > Right, so I was staring at the -rt splat, so it's John who created that
> > > > wreckage?
> > >
> > > It was, but apparently they saw an RCU bug there somewhere and hit it
> > > with the big hammer. I haven't been able to reproduce it on a non-rt
> > > kernel yet, and I don't yet see why RCU is not good enough here.
> >
> > John, could you describe the failure you spotted?
>
> The problem was that the rcu_read_lock() on the dentry ascending wasn't
> preventing dput/d_kill from removing entries from the parent node. So
> the next entry we tried to follow was invalid. So we were getting odd
> oopses from select_parent().
>
> I'm not as familiar with the rcu rules there, so the patch I made just
> held the locks as it went down the chain. Not ideal of course, but still
> an improvement over the dcache_lock that was there prior.
>
> Peter: I'm sorry, I've been out for a few days. Can you give me some
> background on what brought this up and what -rt splat you mean?

Well, you make lockdep very unhappy by locking multiple dentries
(unbounded number) all in the same lock class.

2010-06-23 02:03:22

by john stultz

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Tue, 2010-06-22 at 09:27 +0200, Peter Zijlstra wrote:
> On Mon, 2010-06-21 at 23:02 -0700, john stultz wrote:
> > On Mon, 2010-06-21 at 16:55 +0200, Peter Zijlstra wrote:
> > > On Tue, 2010-06-22 at 00:48 +1000, Nick Piggin wrote:
> > > > > Right, so I was staring at the -rt splat, so it's John who created that
> > > > > wreckage?
> > > >
> > > > It was, but apparently they saw an RCU bug there somewhere and hit it
> > > > with the big hammer. I haven't been able to reproduce it on a non-rt
> > > > kernel yet, and I don't yet see why RCU is not good enough here.
> > >
> > > John, could you describe the failure you spotted?
> >
> > The problem was that the rcu_read_lock() on the dentry ascending wasn't
> > preventing dput/d_kill from removing entries from the parent node. So
> > the next entry we tried to follow was invalid. So we were getting odd
> > oopses from select_parent().
> >
> > I'm not as familiar with the rcu rules there, so the patch I made just
> > held the locks as it went down the chain. Not ideal of course, but still
> > an improvement over the dcache_lock that was there prior.
> >
> > Peter: I'm sorry, I've been out for a few days. Can you give me some
> > background on what brought this up and what -rt splat you mean?
>
> Well, you make lockdep very unhappy by locking multiple dentries
> (unbounded number) all in the same lock class.

So... Is there a way to tell lockdep that the nesting is OK (I thought
that was what the spin_lock_nested call was doing...)?

Or is locking a (possibly quite long) chain of objects really just a
do-not-do type of operation?

thanks
-john

2010-06-23 07:23:24

by Peter Zijlstra

Subject: Re: [patch 11/33] fs: dcache scale subdirs

On Tue, 2010-06-22 at 19:03 -0700, john stultz wrote:

> > Well, you make lockdep very unhappy by locking multiple dentries
> > (unbounded number) all in the same lock class.
>
> So... Is there a way to tell lockdep that the nesting is OK (I thought
> that was what the spin_lock_nested call was doing...)?

spin_lock_nested() allows you to nest a limited number of locks (up to
8, although the usual case is 1).
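
For the common case of two locks from the same class, that looks like:

	spin_lock(&parent->d_lock);				/* subclass 0 */
	spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);	/* subclass 1 */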

> Or is locking a (possibly quite long) chain of objects really just a
> do-not-do type of operation?

Usually, yeah. It would be really nice to do this another way (also for
scalability, keeping a large subtree locked is bound to lead to more
contention).
contention).