From: Miklos Szeredi <[email protected]>
The time shrink_dcache_parent() takes grows quadratically with the
depth of the tree under 'parent'. This starts to become noticeable at
a depth of about 10,000.
These kinds of depths don't occur normally, and filesystems which
invoke shrink_dcache_parent() via d_invalidate() seem to have other
depth-dependent timings, so it's not even easy to expose this problem.
However, with FUSE it's easy to create a deep tree, and d_invalidate()
will also get called. This can make a syscall hang for a very long
time.
This is the original discovery of the problem by Russ Cox:
http://article.gmane.org/gmane.comp.file-systems.fuse.devel/3826
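To make the scaling concrete, here is a small userspace model of the two
strategies (a sketch only, not kernel code; the function names and the way
"walk steps" are counted are illustrative assumptions). It models a pure
chain in which every dentry is pinned by its child, so a select_parent()
pass can find just one unreferenced leaf; "leaf-only" mimics the current
behavior, "ancestor pruning" mimics the approach taken below:

/*
 * Userspace model of the two pruning strategies (illustrative only).
 * The chain has DEPTH dentries; every dentry is pinned by its child,
 * so only the deepest one is unreferenced to begin with.
 */
#include <stdio.h>

#define DEPTH 10000

static long leaf_only(void)
{
	long steps = 0;
	int len;

	for (len = DEPTH; len > 0; len--) {
		/* select_parent() walks the whole remaining chain ... */
		steps += len;
		/* ... but only the single unreferenced leaf can be pruned;
		   its parent becomes unreferenced only for the next pass */
	}
	return steps;
}

static long ancestor_pruning(void)
{
	/* one select_parent() walk down the chain ... */
	long steps = DEPTH;

	/* ... then prune_one_dentry() walks back up, killing each
	   ancestor as it becomes unreferenced, all in the same pass */
	steps += DEPTH;
	return steps;
}

int main(void)
{
	printf("depth %d: leaf-only %ld steps, ancestor pruning %ld steps\n",
	       DEPTH, leaf_only(), ancestor_pruning());
	return 0;
}

At a depth of 10,000 the model counts roughly 5*10^7 walk steps for the
leaf-only strategy versus about 2*10^4 for ancestor pruning, which is
consistent with the slowdown becoming noticeable around that depth.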
The following patch fixes the quadratic behavior by optionally
allowing prune_dcache() to prune the ancestors of a dentry in one go,
instead of doing it one at a time.
Common code in dput() and prune_one_dentry() is extracted into a new
helper function d_kill().
shrink_dcache_parent() as well as shrink_dcache_sb() are converted to
use the ancestry-pruner option. Only for shrink_dcache_memory() is
this behavior not desirable, so it keeps using the old algorithm.
Signed-off-by: Miklos Szeredi <[email protected]>
---
Index: linux/fs/dcache.c
===================================================================
--- linux.orig/fs/dcache.c 2007-02-09 22:11:02.000000000 +0100
+++ linux/fs/dcache.c 2007-02-09 22:51:02.000000000 +0100
@@ -121,6 +121,28 @@ static void dentry_iput(struct dentry *
}
}
+/**
+ * d_kill - kill dentry and return parent
+ * @dentry: dentry to kill
+ *
+ * Called with dcache_lock and d_lock, releases both. The dentry must
+ * already be unhashed and removed from the LRU.
+ *
+ * If this is the root of the dentry tree, return NULL.
+ */
+static struct dentry *d_kill(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ list_del(&dentry->d_u.d_child);
+ dentry_stat.nr_dentry--; /* For d_free, below */
+ /*drops the locks, at that point nobody can reach this dentry */
+ dentry_iput(dentry);
+ parent = dentry->d_parent;
+ d_free(dentry);
+ return dentry == parent ? NULL : parent;
+}
+
/*
* This is dput
*
@@ -189,28 +211,17 @@ repeat:
unhash_it:
__d_drop(dentry);
-
-kill_it: {
- struct dentry *parent;
-
- /* If dentry was on d_lru list
- * delete it from there
- */
- if (!list_empty(&dentry->d_lru)) {
- list_del(&dentry->d_lru);
- dentry_stat.nr_unused--;
- }
- list_del(&dentry->d_u.d_child);
- dentry_stat.nr_dentry--; /* For d_free, below */
- /*drops the locks, at that point nobody can reach this dentry */
- dentry_iput(dentry);
- parent = dentry->d_parent;
- d_free(dentry);
- if (dentry == parent)
- return;
- dentry = parent;
- goto repeat;
+kill_it:
+ /* If dentry was on d_lru list
+ * delete it from there
+ */
+ if (!list_empty(&dentry->d_lru)) {
+ list_del(&dentry->d_lru);
+ dentry_stat.nr_unused--;
}
+ dentry = d_kill(dentry);
+ if (dentry)
+ goto repeat;
}
/**
@@ -371,22 +382,41 @@ restart:
* Throw away a dentry - free the inode, dput the parent. This requires that
* the LRU list has already been removed.
*
+ * If prune_parents is true, try to prune ancestors as well.
+ *
* Called with dcache_lock, drops it and then regains.
* Called with dentry->d_lock held, drops it.
*/
-static void prune_one_dentry(struct dentry * dentry)
+static void prune_one_dentry(struct dentry * dentry, int prune_parents)
{
- struct dentry * parent;
-
__d_drop(dentry);
- list_del(&dentry->d_u.d_child);
- dentry_stat.nr_dentry--; /* For d_free, below */
- dentry_iput(dentry);
- parent = dentry->d_parent;
- d_free(dentry);
- if (parent != dentry)
- dput(parent);
+ dentry = d_kill(dentry);
+ if (!prune_parents) {
+ if (dentry)
+ dput(dentry);
+ spin_lock(&dcache_lock);
+ return;
+ }
+
+ /*
+ * Prune ancestors. Locking is simpler than in dput(),
+ * because dcache_lock needs to be taken anyway.
+ */
spin_lock(&dcache_lock);
+ while (dentry) {
+ if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
+ return;
+
+ if (dentry->d_op && dentry->d_op->d_delete)
+ dentry->d_op->d_delete(dentry);
+ if (!list_empty(&dentry->d_lru)) {
+ list_del(&dentry->d_lru);
+ dentry_stat.nr_unused--;
+ }
+ __d_drop(dentry);
+ dentry = d_kill(dentry);
+ spin_lock(&dcache_lock);
+ }
}
/**
@@ -394,6 +424,7 @@ static void prune_one_dentry(struct dent
* @count: number of entries to try and free
* @sb: if given, ignore dentries for other superblocks
* which are being unmounted.
+ * @prune_parents: if true, try to prune ancestors as well in one go
*
* Shrink the dcache. This is done when we need
* more memory, or simply when we need to unmount
@@ -404,7 +435,7 @@ static void prune_one_dentry(struct dent
* all the dentries are in use.
*/
-static void prune_dcache(int count, struct super_block *sb)
+static void prune_dcache(int count, struct super_block *sb, int prune_parents)
{
spin_lock(&dcache_lock);
for (; count ; count--) {
@@ -464,7 +495,7 @@ static void prune_dcache(int count, stru
* without taking the s_umount lock (I already hold it).
*/
if (sb && dentry->d_sb == sb) {
- prune_one_dentry(dentry);
+ prune_one_dentry(dentry, prune_parents);
continue;
}
/*
@@ -479,7 +510,7 @@ static void prune_dcache(int count, stru
s_umount = &dentry->d_sb->s_umount;
if (down_read_trylock(s_umount)) {
if (dentry->d_sb->s_root != NULL) {
- prune_one_dentry(dentry);
+ prune_one_dentry(dentry, prune_parents);
up_read(s_umount);
continue;
}
@@ -550,7 +581,7 @@ repeat:
spin_unlock(&dentry->d_lock);
continue;
}
- prune_one_dentry(dentry);
+ prune_one_dentry(dentry, 1);
cond_resched_lock(&dcache_lock);
goto repeat;
}
@@ -829,7 +860,7 @@ void shrink_dcache_parent(struct dentry
int found;
while ((found = select_parent(parent)) != 0)
- prune_dcache(found, parent->d_sb);
+ prune_dcache(found, parent->d_sb, 1);
}
/*
@@ -849,7 +880,7 @@ static int shrink_dcache_memory(int nr,
if (nr) {
if (!(gfp_mask & __GFP_FS))
return -1;
- prune_dcache(nr, NULL);
+ prune_dcache(nr, NULL, 0);
}
return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
On Fri, 09 Feb 2007 23:01:06 +0100
Miklos Szeredi <[email protected]> wrote:
> From: Miklos Szeredi <[email protected]>
>
> The time shrink_dcache_parent() takes grows quadratically with the
> depth of the tree under 'parent'. This starts to become noticeable at
> a depth of about 10,000.
>
> These kinds of depths don't occur normally, and filesystems which
> invoke shrink_dcache_parent() via d_invalidate() seem to have other
> depth-dependent timings, so it's not even easy to expose this problem.
>
> However, with FUSE it's easy to create a deep tree, and d_invalidate()
> will also get called. This can make a syscall hang for a very long
> time.
>
> This is the original discovery of the problem by Russ Cox:
>
> http://article.gmane.org/gmane.comp.file-systems.fuse.devel/3826
"The file system mounted on /tmp/z in the example contains 2^50
directories". heh.
I do wonder how realistic this problem is in real life.
> The following patch fixes the quadratic behavior by optionally
> allowing prune_dcache() to prune the ancestors of a dentry in one go,
> instead of doing it one at a time.
>
> Common code in dput() and prune_one_dentry() is extracted into a new
> helper function d_kill().
>
> shrink_dcache_parent() as well as shrink_dcache_sb() are converted to
> use the ancestry-pruner option. Only for shrink_dcache_memory() is
> this behavior not desirable, so it keeps using the old algorithm.
>
I wonder if we should be setting prune_parents=1 in
shrink_dcache_memory()? Because we have this problem where the dentry
slabs suffer lots of internal fragmentation and we end up with whole slab
pages pinned by a single directory dentry. I expect that if
shrink_dcache_memory() were aggressive about reaping newly-childless
directory dentries, some improvements might be realised there.
If so, we should change prune_dcache() to return the number pruned, so that
shrink_dcache_memory() can keep its arithmetic correct. Would require some
careful testing and is out of scope for your work.
> "The file system mounted on /tmp/z in the example contains 2^50
> directories". heh.
>
> I do wonder how realistic this problem is in real life.
That's a fair concern, although I was trying this as part
of evaluating how much someone could hose a system
if we let them mount arbitrary FUSE servers. And the
answer is: they could make it completely unusable,
requiring reboot.
I ran a later test that printed how deep it got into
the file tree and it was only a few hundred thousand
if I recall correctly. A determined attacker might even
manage to do this in a normal file system.
But sure, it's not a common case. ;-)
Russ
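For reference, a server of the kind Russ describes takes only a few lines
against the FUSE 2.x high-level API. This is a minimal sketch (the names and
details are assumptions, and it does only what is needed for traversal):
every path is reported as a directory containing a single subdirectory "d",
so the tree is effectively unbounded in depth.

/*
 * Minimal sketch of an "endlessly deep" FUSE filesystem, written against
 * the FUSE 2.x high-level API: every path is a directory containing a
 * single subdirectory "d".
 */
#define FUSE_USE_VERSION 26
#include <fuse.h>
#include <string.h>
#include <sys/stat.h>

static int deep_getattr(const char *path, struct stat *st)
{
	(void) path;
	memset(st, 0, sizeof(*st));
	st->st_mode = S_IFDIR | 0755;	/* every name is a directory */
	st->st_nlink = 2;
	return 0;
}

static int deep_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
			off_t offset, struct fuse_file_info *fi)
{
	(void) path; (void) offset; (void) fi;
	filler(buf, ".", NULL, 0);
	filler(buf, "..", NULL, 0);
	filler(buf, "d", NULL, 0);	/* one subdirectory per level */
	return 0;
}

static struct fuse_operations deep_oper = {
	.getattr = deep_getattr,
	.readdir = deep_readdir,
};

int main(int argc, char *argv[])
{
	return fuse_main(argc, argv, &deep_oper, NULL);
}

Built with the usual libfuse flags (e.g. via pkg-config fuse) and mounted on
an empty directory, a recursive walk such as 'du' over it builds an
arbitrarily deep dentry chain; once d_invalidate() gets called on one of the
ancestors, the quadratic path in shrink_dcache_parent() that the patch
addresses is hit.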
On Fri, 9 Feb 2007 19:23:31 -0500
"Russ Cox" <[email protected]> wrote:
> > "The file system mounted on /tmp/z in the example contains 2^50
> > directories". heh.
> >
> > I do wonder how realistic this problem is in real life.
>
> That's a fair concern, although I was trying this as part
> of evaluating how much someone could hose a system
> if we let them mount arbitrary FUSE servers. And the
> answer is: they could make it completely unusable,
> requiring reboot.
>
> I ran a later test that printed how deep it got into
> the file tree and it was only a few hundred thousand
> if I recall correctly. A determined attacker might even
> manage to do this in a normal file system.
>
> But sure, it's not a common case. ;-)
Well that's a good point - sometimes people do crazy things on purpose. We
were all University students once ;)
The patches look nice and, as I said, are potentially of some use for memory
reclaim. But I hope that someone who has worked on dcache.c more recently
than I have will find time to go over this work with a fine-tooth comb.
> > "The file system mounted on /tmp/z in the example contains 2^50
> > directories". heh.
> >
> > I do wonder how realistic this problem is in real life.
>
> That's a fair concern, although I was trying this as part
> of evaluating how much someone could hose a system
> if we let them mount arbitrary FUSE servers. And the
> answer is: they could make it completely unusable,
> requiring reboot.
>
> I ran a later test that printed how deep it got into
> the file tree and it was only a few hundred thousand
> if I recall correctly. A determined attacker might even
> manage to do this in a normal file system.
Unfortunately this patch doesn't completely solve this problem, since
the system will still be hosed due to all memory being used up by
dentries. And I bet the OOM killer won't find the real target (du)
but will kill anything before that.
So the second part of the problem is to somehow limit the number of
dentries used. Not easy...
Miklos
Miklos Szeredi <[email protected]> writes:
> So the second part of the problem is to somehow limit the number of
> dentries used. Not easy...
OpenVZ has some existing work in this area to separate their virtual machines.
I assume they will eventually submit it.
-Andi
> Unfortunately this patch doesn't completely solve this problem, since
> the system will still be hosed due to all memory being used up by
> dentries. And I bet the OOM killer won't find the real target (du)
> but will kill anything before that.
>
> So the second part of the problem is to somehow limit the number of
> dentries used. Not easy...
At least with this patch (if I am reading it correctly), once the offending
culprit is identified and the programs are killed off, everything will go
back to normal without a reboot.
Russ
> > Unfortunately this patch doesn't completely solve this problem, since
> > the system will still be hosed due to all memory being used up by
> > dentries. And I bet the OOM killer won't find the real target (du)
> > but will kill anything before that.
> >
> > So the second part of the problem is to somehow limit the number of
> > dentries used. Not easy...
>
> At least with this patch (if I am reading it correctly), once the offending
> culprit is identified and the programs are killed off, everything will go
> back to normal without a reboot.
Yes, it's basically a "normal" OOM situation. What makes it worse
than usual is that it's _kernel_ memory being used up, which is not
attributed to any process.
So from the OOM killer's point of view it looks like none of the
userspace programs are responsible for the OOM, and it will just
select targets randomly.
By the time it has come to the actual culprits (the filesystem daemon
and 'du') it will have killed almost everything useful.
So it's definitely better than without the patch, but still has room
for improvement.
Thanks,
Miklos