2007-02-23 15:37:47

by Nick Piggin

Subject: [rfc][patch] dynamic resizing dentry hash using RCU


The dentry hash, which uses up 8MB for 1 million entries on my 4GB system, is
one of the biggest wasters of memory for me, because I rarely have more than
one or two hundred thousand dentries. And that's with several kernel trees
worth of entries. Most desktop systems, and probably even many types of
servers, will only use a fraction of that.

So I introduce a new method for resizing hash tables with RCU, and apply
that to the dentry hash.

The primitive heuristic is that the hash size is doubled when the number of
entries reaches 150% of the hash size, and halved when the number falls below
50% (so, for example, a 1024-bucket table grows once it passes 1536 entries,
and shrinks once it drops below 512). It should also be able to shrink under
memory pressure, and scale up as large as we need.

A pity it uses vmalloc memory for the moment.

The implementation is not highly stress tested, but it is running now. It
could do a bit more RCU stuff asynchronously rather than with synchronize_rcu,
but who cares, for now.

The hash is costing me about 256K now, which is a 32x reduction in memory.

I don't know if it's worthwhile to do this, rather than move things to other
data structures, but something just tempted me to have a go! I'd be interested
to hear comments, and how many holes people can spot in my design ;)

Thanks,
Nick


RCU hash resizing algorithm as follows:

There is a two-pointer scheme: the regular hash table pointer, and an "old"
hash table pointer which is NULL except while the hash is being resized.

When resizing the hash table, the old pointer is set to the current table, and
the regular hash table pointer is switched from the current table to a newly
initialised hash table (so the current table becomes the old table, and the
new table becomes the current table). Memory barriers are used so a reader
can't find a NULL old table AND a new, empty current table.

Then we wait for a grace period, so that all readers can see this new
world order. Readers are anybody who wants to get a handle on the hash table,
including anyone adding entries to it (ie. the data we're protecting is the
actual table -- hash entry locking is pretty well decoupled from this
algorithm). So any new insertions should be going into the current table at
this point.

Now we start moving hash entries from the old table into the current table.
This is done under a seqlock, and we can do it bit by bit, as time and
latency allow... After all entries are moved, then we set the "old" pointer
to NULL, wait for a grace period (now no readers will have a handle on it),
then free it.

On the read side, we load the current hash pointer and the old hash pointer
first (remember this has to happen under rcu_read_lock). Then everything
happens as normal when the old pointer is NULL, which is the common case.
If the old pointer is not NULL, we have to be prepared to search both hash
tables to find the entry we want. We also have to do this under the read
seqlock in case we miss an entry when it is being moved from the old table
to the current table as part of the resize process.

NOTE: you can be creative with the seqlock. It could be several seqlocks, if
performance under resize is important. It doesn't even have to be a seqlock,
just something that gives you synchronisation. Say you have a spinlock per
bucket: just hold the locks for both buckets while moving an entry from one to
the other, and have the read side search the old table first (a sketch of this
variant follows the pseudocode below).

struct hash_table *table, *old_table;
seqlock_t resize_seqlock;

lookup_hash()
{
        struct hash_table *cur, *old;
        unsigned seq;

        rcu_read_lock();
        cur = table;
        smp_rmb();              /* pairs with the smp_wmb() in resize_hash */
        old = old_table;

        if (unlikely(old)) {
                do {
                        seq = read_seqbegin(&resize_seqlock);
                        entry = search_table(cur);
                        if (!entry)
                                entry = search_table(old);
                } while (read_seqretry(&resize_seqlock, seq));
        } else {
                entry = search_table(cur);
        }
        rcu_read_unlock();

        return entry;
}

resize_hash()
{
        init(new_table, new_size);

        old_table = table;
        smp_wmb();              /* pairs with the smp_rmb() in lookup_hash */
        table = new_table;
        synchronize_rcu();      /* all readers now see both tables */

        for_each_entry_in(entry, old_table) {
                write_seqlock(&resize_seqlock);
                rehash(entry, new_table);
                write_sequnlock(&resize_seqlock);
        }

        tmp = old_table;
        old_table = NULL;
        synchronize_rcu();      /* no reader still holds a handle on it */
        free(tmp);
}
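
For the per-bucket spinlock variant mentioned in the NOTE, a minimal sketch in
the same pseudocode style (bucket_lock(), search_bucket() and move_entry() are
hypothetical helpers, and move_entry() is assumed to add the entry to its new
bucket before unlinking it from the old one):

move_one(entry, old, new)
{
        spinlock_t *o = bucket_lock(old, entry->key);
        spinlock_t *n = bucket_lock(new, entry->key);

        spin_lock(o);
        spin_lock(n);           /* hold both buckets across the move */
        move_entry(entry, old, new);
        spin_unlock(n);
        spin_unlock(o);
}

lookup(key)
{
        rcu_read_lock();
        cur = table;
        smp_rmb();
        old = old_table;

        entry = NULL;
        if (unlikely(old))
                entry = search_bucket(old, key);        /* old table first */
        if (!entry)
                entry = search_bucket(cur, key);
        rcu_read_unlock();

        return entry;
}

Searching the old table first means a concurrently moved entry is either still
visible in its old bucket, or already visible in its new bucket by the time we
look there, so no retry loop is needed.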



fs/dcache.c | 324 +++++++++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 243 insertions(+), 81 deletions(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c
+++ linux-2.6/fs/dcache.c
@@ -20,6 +20,7 @@
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/hash.h>
@@ -31,15 +32,18 @@
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/swap.h>
-#include <linux/bootmem.h>
+#include <linux/mutex.h>
+#include <linux/kthread.h>
#include "internal.h"


int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);

- __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
static __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(resize_transfer_lock);
+static DEFINE_MUTEX(resize_mutex);

EXPORT_SYMBOL(dcache_lock);

@@ -55,12 +59,22 @@ static struct kmem_cache *dentry_cache _
* This hash-function tries to avoid losing too many bits of hash
* information, yet avoid using a prime hash-size or similar.
*/
-#define D_HASHBITS d_hash_shift
-#define D_HASHMASK d_hash_mask
+struct dentry_hash {
+ unsigned int shift;
+ unsigned long mask;
+ struct hlist_head *table;
+};
+
+struct dentry_hash *dentry_hash __read_mostly;
+struct dentry_hash *old_dentry_hash __read_mostly;
+
+#define MIN_DHASH_SHIFT 10 /* 4K */
+#if BITS_PER_LONG == 32
+#define MAX_DHASH_SHIFT 24 /* 64MB */
+#else
+#define MAX_DHASH_SHIFT 30 /* 8GB */
+#endif

-static unsigned int d_hash_mask __read_mostly;
-static unsigned int d_hash_shift __read_mostly;
-static struct hlist_head *dentry_hashtable __read_mostly;
static LIST_HEAD(dentry_unused);

/* Statistics gathering. */
@@ -68,6 +82,20 @@ struct dentry_stat_t dentry_stat = {
.age_limit = 45,
};

+static void dcache_hash_resize(unsigned int new_shift);
+static void mod_nr_dentry(int mod)
+{
+ unsigned long dentry_size;
+
+ dentry_stat.nr_dentry += mod;
+
+ dentry_size = 1 << dentry_hash->shift;
+ if (unlikely(dentry_stat.nr_dentry > dentry_size+(dentry_size>>1)))
+ dcache_hash_resize(dentry_hash->shift+1);
+ else if (unlikely(dentry_stat.nr_dentry < (dentry_size>>1)))
+ dcache_hash_resize(dentry_hash->shift-1);
+}
+
static void __d_free(struct dentry *dentry)
{
if (dname_external(dentry))
@@ -201,7 +229,7 @@ kill_it: {
dentry_stat.nr_unused--;
}
list_del(&dentry->d_u.d_child);
- dentry_stat.nr_dentry--; /* For d_free, below */
+ mod_nr_dentry(-1); /* For d_free, below */
/*drops the locks, at that point nobody can reach this dentry */
dentry_iput(dentry);
parent = dentry->d_parent;
@@ -380,7 +408,7 @@ static void prune_one_dentry(struct dent

__d_drop(dentry);
list_del(&dentry->d_u.d_child);
- dentry_stat.nr_dentry--; /* For d_free, below */
+ mod_nr_dentry(-1); /* For d_free, below */
dentry_iput(dentry);
parent = dentry->d_parent;
d_free(dentry);
@@ -660,7 +688,7 @@ static void shrink_dcache_for_umount_sub
out:
/* several dentries were freed, need to correct nr_dentry */
spin_lock(&dcache_lock);
- dentry_stat.nr_dentry -= detached;
+ mod_nr_dentry(-detached);
spin_unlock(&dcache_lock);
}

@@ -916,7 +944,7 @@ struct dentry *d_alloc(struct dentry * p
spin_lock(&dcache_lock);
if (parent)
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
- dentry_stat.nr_dentry++;
+ mod_nr_dentry(1);
spin_unlock(&dcache_lock);

return dentry;
@@ -1057,12 +1085,12 @@ struct dentry * d_alloc_root(struct inod
return res;
}

-static inline struct hlist_head *d_hash(struct dentry *parent,
- unsigned long hash)
+static inline struct hlist_head *d_hash(struct dentry_hash *hashtable,
+ struct dentry *parent, unsigned long hash)
{
hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
- hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
- return dentry_hashtable + (hash & D_HASHMASK);
+ hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> hashtable->shift);
+ return hashtable->table + (hash & hashtable->mask);
}

/**
@@ -1219,18 +1247,18 @@ struct dentry * d_lookup(struct dentry *
return dentry;
}

-struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
+static struct dentry * __d_lookup_hash(struct dentry_hash * hashtable,
+ struct dentry * parent, struct qstr * name)
{
- unsigned int len = name->len;
unsigned int hash = name->hash;
+ unsigned int len = name->len;
const unsigned char *str = name->name;
- struct hlist_head *head = d_hash(parent,hash);
- struct dentry *found = NULL;
struct hlist_node *node;
+ struct hlist_head *head;
struct dentry *dentry;
+ struct dentry *found = NULL;

- rcu_read_lock();
-
+ head = d_hash(hashtable, parent, hash);
hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
struct qstr *qstr;

@@ -1273,9 +1301,39 @@ struct dentry * __d_lookup(struct dentry
next:
spin_unlock(&dentry->d_lock);
}
+
+ return found;
+}
+
+struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
+{
+ struct dentry *ret;
+ struct dentry_hash *hashtable, *old_hashtable;
+
+ rcu_read_lock();
+ hashtable = dentry_hash;
+ smp_rmb(); /* see smp_wmb in dcache_hash_resize_work */
+ old_hashtable = old_dentry_hash;
+
+ if (unlikely(old_hashtable)) {
+ unsigned long seq;
+ do {
+ seq = read_seqbegin(&resize_transfer_lock);
+ ret = __d_lookup_hash(old_hashtable, parent, name);
+ if (ret)
+ goto out_rcu;
+ ret = __d_lookup_hash(hashtable, parent, name);
+ if (ret)
+ goto out_rcu;
+ } while (read_seqretry(&resize_transfer_lock, seq));
+ } else {
+ ret = __d_lookup_hash(hashtable, parent, name);
+ }
+
+out_rcu:
rcu_read_unlock();

- return found;
+ return ret;
}

/**
@@ -1304,6 +1362,28 @@ out:
return dentry;
}

+static int d_validate_hash(struct dentry_hash *hashtable,
+ struct dentry *dentry, struct dentry *parent)
+{
+ struct hlist_head *base;
+ struct hlist_node *lhp;
+
+ base = d_hash(hashtable, parent, dentry->d_name.hash);
+ spin_lock(&dcache_lock);
+ hlist_for_each(lhp,base) {
+ /* hlist_for_each_entry_rcu() not required for d_hash list
+ * as it is parsed under dcache_lock
+ */
+ if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
+ __dget_locked(dentry);
+ spin_unlock(&dcache_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&dcache_lock);
+ return 0;
+}
+
/**
* d_validate - verify dentry provided from insecure source
* @dentry: The dentry alleged to be valid child of @dparent
@@ -1316,33 +1396,42 @@ out:
* Zero is returned in the dentry is invalid.
*/

-int d_validate(struct dentry *dentry, struct dentry *dparent)
+int d_validate(struct dentry *dentry, struct dentry *parent)
{
- struct hlist_head *base;
- struct hlist_node *lhp;
+ struct dentry_hash *hashtable, *old_hashtable;
+ int ret = 0;

/* Check whether the ptr might be valid at all.. */
if (!kmem_ptr_validate(dentry_cache, dentry))
goto out;

- if (dentry->d_parent != dparent)
+ if (dentry->d_parent != parent)
goto out;

- spin_lock(&dcache_lock);
- base = d_hash(dparent, dentry->d_name.hash);
- hlist_for_each(lhp,base) {
- /* hlist_for_each_entry_rcu() not required for d_hash list
- * as it is parsed under dcache_lock
- */
- if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
- __dget_locked(dentry);
- spin_unlock(&dcache_lock);
- return 1;
- }
+ rcu_read_lock();
+ hashtable = dentry_hash;
+ smp_rmb(); /* see smp_wmb in dcache_hash_resize_work */
+ old_hashtable = old_dentry_hash;
+
+ if (unlikely(old_hashtable)) {
+ unsigned long seq;
+ do {
+ seq = read_seqbegin(&resize_transfer_lock);
+ ret = d_validate_hash(old_hashtable, dentry, parent);
+ if (ret)
+ goto out_rcu;
+ ret = d_validate_hash(hashtable, dentry, parent);
+ if (ret)
+ goto out_rcu;
+ } while (read_seqretry(&resize_transfer_lock, seq));
+ } else {
+ ret = d_validate_hash(hashtable, dentry, parent);
}
- spin_unlock(&dcache_lock);
+
+out_rcu:
+ rcu_read_unlock();
out:
- return 0;
+ return ret;
}

/*
@@ -1402,7 +1491,9 @@ static void __d_rehash(struct dentry * e

static void _d_rehash(struct dentry * entry)
{
- __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
+ rcu_read_lock();
+ __d_rehash(entry, d_hash(dentry_hash, entry->d_parent, entry->d_name.hash));
+ rcu_read_unlock();
}

/**
@@ -1518,8 +1609,10 @@ static void d_move_locked(struct dentry
hlist_del_rcu(&dentry->d_hash);

already_unhashed:
- list = d_hash(target->d_parent, target->d_name.hash);
+ rcu_read_lock();
+ list = d_hash(dentry_hash, target->d_parent, target->d_name.hash);
__d_rehash(dentry, list);
+ rcu_read_unlock();

/* Unhash the target: dput() will then get rid of it */
__d_drop(target);
@@ -2009,42 +2102,114 @@ ino_t find_inode_number(struct dentry *d
return ino;
}

-static __initdata unsigned long dhash_entries;
-static int __init set_dhash_entries(char *str)
+static unsigned int resize_new_shift;
+
+static void dcache_hash_resize_work(struct work_struct *work)
{
- if (!str)
- return 0;
- dhash_entries = simple_strtoul(str, &str, 0);
- return 1;
+ unsigned long new_size, old_size;
+ int i;
+ struct dentry_hash *new_hash, *old_hash;
+ unsigned long transferred;
+
+ if (!mutex_trylock(&resize_mutex))
+ goto out;
+
+ new_hash = kmalloc(sizeof(struct dentry_hash), GFP_KERNEL);
+ if (!new_hash)
+ goto out_unlock;
+
+ new_hash->shift = resize_new_shift;
+ new_size = 1 << new_hash->shift;
+ new_hash->mask = new_size - 1;
+ new_hash->table = vmalloc(new_size*sizeof(struct hlist_head));
+ if (!new_hash->table)
+ goto out_kfree;
+ for (i = 0; i < new_size; i++)
+ INIT_HLIST_HEAD(&new_hash->table[i]);
+
+ old_dentry_hash = dentry_hash;
+ /*
+ * ensure that if the reader sees the new dentry_hash,
+ * then they will also see the old_dentry_hash assignment,
+ * above.
+ */
+ smp_wmb();
+ dentry_hash = new_hash;
+ synchronize_rcu();
+
+ old_size = 1 << old_dentry_hash->shift;
+ transferred = 0;
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *head = &old_dentry_hash->table[i];
+
+ if (hlist_empty(head))
+ continue;
+
+ spin_lock(&dcache_lock);
+ write_seqlock(&resize_transfer_lock);
+ while (!hlist_empty(head)) {
+ struct dentry *dentry;
+ struct hlist_head *new_head;
+
+ dentry = hlist_entry(head->first, struct dentry, d_hash);
+ new_head = d_hash(dentry_hash, dentry->d_parent, dentry->d_name.hash);
+
+ spin_lock(&dentry->d_lock);
+ hlist_del_rcu(head->first);
+ hlist_add_head_rcu(&dentry->d_hash, new_head);
+ spin_unlock(&dentry->d_lock);
+ transferred++;
+ }
+ write_sequnlock(&resize_transfer_lock);
+ spin_unlock(&dcache_lock);
+ }
+
+ printk("resize dcache hash from %lu to %lu, moved %lu entries\n", old_size, new_size, transferred);
+
+ old_hash = old_dentry_hash;
+ old_dentry_hash = NULL;
+ mutex_unlock(&resize_mutex);
+ synchronize_rcu();
+ vfree(old_hash->table);
+ kfree(old_hash);
+
+ resize_new_shift = 0;
+ return;
+
+out_kfree:
+ kfree(new_hash);
+out_unlock:
+ mutex_unlock(&resize_mutex);
+out:
+ resize_new_shift = 0;
+ return;
}
-__setup("dhash_entries=", set_dhash_entries);

-static void __init dcache_init_early(void)
+static DEFINE_SPINLOCK(resize_lock);
+
+static void dcache_hash_resize(unsigned int new_shift)
{
- int loop;
+ static DECLARE_WORK(resize_work, dcache_hash_resize_work);

- /* If hashes are distributed across NUMA nodes, defer
- * hash allocation until vmalloc space is available.
- */
- if (hashdist)
+ if (new_shift < MIN_DHASH_SHIFT || new_shift > MAX_DHASH_SHIFT)
return;

- dentry_hashtable =
- alloc_large_system_hash("Dentry cache",
- sizeof(struct hlist_head),
- dhash_entries,
- 13,
- HASH_EARLY,
- &d_hash_shift,
- &d_hash_mask,
- 0);
+ if (resize_new_shift)
+ return;
+ spin_lock(&resize_lock);
+ if (resize_new_shift) {
+ spin_unlock(&resize_lock);
+ return;
+ }
+ resize_new_shift = new_shift;
+ spin_unlock(&resize_lock);

- for (loop = 0; loop < (1 << d_hash_shift); loop++)
- INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+ schedule_work(&resize_work);
}

static void __init dcache_init(unsigned long mempages)
{
+ unsigned long hash_size;
int loop;

/*
@@ -2061,22 +2226,20 @@ static void __init dcache_init(unsigned

set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);

- /* Hash may have been set up in dcache_init_early */
- if (!hashdist)
- return;
-
- dentry_hashtable =
- alloc_large_system_hash("Dentry cache",
- sizeof(struct hlist_head),
- dhash_entries,
- 13,
- 0,
- &d_hash_shift,
- &d_hash_mask,
- 0);
+ old_dentry_hash = NULL;

- for (loop = 0; loop < (1 << d_hash_shift); loop++)
- INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+ dentry_hash = kmalloc(sizeof(struct dentry_hash), GFP_ATOMIC);
+ if (!dentry_hash)
+ panic("Failed to allocate dentry_hash\n");
+ dentry_hash->shift = MIN_DHASH_SHIFT;
+ hash_size = 1 << dentry_hash->shift;
+ dentry_hash->mask = hash_size - 1;
+ dentry_hash->table = __vmalloc(hash_size*sizeof(struct hlist_head),
+ GFP_ATOMIC, PAGE_KERNEL);
+ if (!dentry_hash->table)
+ panic("Failed to allocate dentry_hash->table\n");
+ for (loop = 0; loop < hash_size; loop++)
+ INIT_HLIST_HEAD(&dentry_hash->table[loop]);
}

/* SLAB cache for __getname() consumers */
@@ -2089,7 +2252,6 @@ EXPORT_SYMBOL(d_genocide);

void __init vfs_caches_init_early(void)
{
- dcache_init_early();
inode_init_early();
}


2007-02-23 16:31:31

by Eric Dumazet

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Friday 23 February 2007 16:37, Nick Piggin wrote:
> The dentry hash uses up 8MB for 1 million entries on my 4GB system is one
> of the biggest wasters of memory for me. Because I rarely have more than
> one or two hundred thousand dentries. And that's with several kernel trees
> worth of entries. Most desktop and probably even many types of servers will
> only use a fraction of that.
>
> So I introduce a new method for resizing hash tables with RCU, and apply
> that to the dentry hash.
>
> The primitive heuristic is that the hash size is doubled when the number of
> entries reaches 150% the hash size, and halved when the number is 50%.
> It should also be able to shrink under memory pressure, and scale up as
> large as we go.
>
> A pity it uses vmalloc memory for the moment.
>
> The implementation is not highly stress tested, but it is running now. It
> could do a bit more RCU stuff asynchronously rather than with
> synchronize_rcu, but who cares, for now.
>
> The hash is costing me about 256K now, which is a 32x reduction in memory.
>
> I don't know if it's worthwhile to do this, rather than move things to
> other data structures, but something just tempted me to have a go! I'd be
> interested to hear comments, and how many holes people can spot in my
> design ;)
>
> Thanks,
> Nick

Hi Nick

That's a really good idea!

The vmalloc() thing could be a problem, so:

Could you bring back support for the 'dhash_entries=262144' boot param, so
that an admin could set the initial size of the dhash table (and not shrink it
below this size even if the number of dentries is low)?

If dhash_entries is set in the boot params, we could try to use
alloc_large_system_hash() for the initial table (possibly using hugepages
rather than vmalloc), if we add a free_large_system_hash() function to be able
to free the initial table.

Or else, maybe it's time to add the possibility for vmalloc() to use hugepages
itself...

2007-02-23 17:55:33

by Zach Brown

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU


On Feb 23, 2007, at 7:37 AM, Nick Piggin wrote:

>
> The dentry hash uses up 8MB for 1 million entries on my 4GB system is one
> of the biggest wasters of memory for me. Because I rarely have more than
> one or two hundred thousand dentries. And that's with several kernel trees
> worth of entries. Most desktop and probably even many types of servers
> will only use a fraction of that.
>
> So I introduce a new method for resizing hash tables with RCU, and apply
> that to the dentry hash.

Can you compare what you've done to the design that Paul and David
talked about a year ago?

http://lkml.org/lkml/2006/1/30/74

I'd love to see a generic implementation of RCU hashing that
subsystems can then take advantage of. It's long been on the fun
side of my todo list. The side I never get to :/.

- z

2007-02-24 01:08:31

by Nick Piggin

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Fri, Feb 23, 2007 at 05:31:17PM +0100, Eric Dumazet wrote:
> On Friday 23 February 2007 16:37, Nick Piggin wrote:
> > The dentry hash uses up 8MB for 1 million entries on my 4GB system is one
> > of the biggest wasters of memory for me. Because I rarely have more than
> > one or two hundred thousand dentries. And that's with several kernel trees
> > worth of entries. Most desktop and probably even many types of servers will
> > only use a fraction of that.
> >
> > So I introduce a new method for resizing hash tables with RCU, and apply
> > that to the dentry hash.
> >
> > The primitive heuristic is that the hash size is doubled when the number of
> > entries reaches 150% the hash size, and halved when the number is 50%.
> > It should also be able to shrink under memory pressure, and scale up as
> > large as we go.
> >
> > A pity it uses vmalloc memory for the moment.
> >
> > The implementation is not highly stress tested, but it is running now. It
> > could do a bit more RCU stuff asynchronously rather than with
> > synchronize_rcu, but who cares, for now.
> >
> > The hash is costing me about 256K now, which is a 32x reduction in memory.
> >
> > I don't know if it's worthwhile to do this, rather than move things to
> > other data structures, but something just tempted me to have a go! I'd be
> > interested to hear comments, and how many holes people can spot in my
> > design ;)
> >
> > Thanks,
> > Nick
>
> Hi Nick
>
> Thats a really good idea !
>
> The vmalloc() thing could be a problem, so :
>
> Could you bring back the support of 'dhash_entries=262144' boot param, so that
> an admin could set the initial size of dhash table, (and not shrink it under
> this size even if the number of dentries is low)

Hi Eric,

Yeah, that's a good idea. I'll look at doing that.

> In case dhash_entries is set in boot params, we could try to use
> alloc_large_system_hash() for the initial table, (eventually using Hugepages
> (not vmalloc)), if we add a free_large_system_hash() function to be able to
> free the initial table.
>
> Or else, time is to add the possibility for vmalloc() to use hugepages
> itself...

A hugepage vmalloc for very large allocations sounds like a nice idea. The
big NUMA guys already use vmalloc to allocate large hashes, so hugepages
would probably be a big win for them.

2007-02-24 01:26:05

by Nick Piggin

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Fri, Feb 23, 2007 at 09:25:28AM -0800, Zach Brown wrote:
>
> On Feb 23, 2007, at 7:37 AM, Nick Piggin wrote:
>
> >
> > The dentry hash uses up 8MB for 1 million entries on my 4GB system is
> > one of the biggest wasters of memory for me. Because I rarely have more
> > than one or two hundred thousand dentries. And that's with several
> > kernel trees worth of entries. Most desktop and probably even many
> > types of servers will only use a fraction of that.
> >
> > So I introduce a new method for resizing hash tables with RCU, and
> > apply that to the dentry hash.
>
> Can you compare what you've done to the design that Paul and David
> talked about a year ago?
>
> http://lkml.org/lkml/2006/1/30/74

Thanks for the link, I wasn't aware of Paul's algorithm before.

I guess most variants are going to have a double pointer scheme,
so they are similar in that regard.

I think Paul's is quite complex in moving entries to the new table,
and it looks like it requires a lot of grace periods. I avoid all
that by using the seqlock.

It wasn't clear to me how Paul handled the case where an item is
present in the not_current table, but the lookup misses it when it
gets moved between the tables. It is a little tricky to follow
the find details because it is not in code or pseudocode format.


> I'd love to see a generic implementation of RCU hashing that
> subsystems can then take advantage of. It's long been on the fun
> side of my todo list. The side I never get to :/.

Yeah, if this is to be used anywhere else, I think it definitely
needs to be made into a generic library if possible. The main thing
I wanted was to get something working and see what people think
about it.

Thanks,
Nick

2007-02-24 01:31:37

by Michael K. Edwards

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On 2/23/07, Zach Brown <[email protected]> wrote:
> I'd love to see a generic implementation of RCU hashing that
> subsystems can then take advantage of. It's long been on the fun
> side of my todo list. The side I never get to :/.

There's an active thread on netdev about implementing an RCU hash.
I'd suggest a 2-left (or possibly even k-left) hash for statistical
reasons discussed briefly there, and in greater depth in a paper by
Michael Mitzenmacher at
http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/iproute.ps.
Despite his paper's emphasis on hardware parallelism, there's a bigger
win associated with Poisson statistics and decreasing occupation
fraction (and therefore collision probability) in successive hashes.
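
For illustration, a minimal sketch of the 2-left idea (all names here are
invented, and locking/RCU are ignored): two sub-tables with independent hash
functions; an insert goes to whichever candidate bucket has the shorter chain,
with ties broken to the left, and a lookup probes exactly one bucket in each
sub-table.

#include <stddef.h>

#define BUCKETS 1024

struct node {
        struct node *next;
        unsigned long key;
};

struct twoleft {
        struct node *left[BUCKETS];
        struct node *right[BUCKETS];
};

static unsigned int h1(unsigned long k) { return (k * 2654435761UL) % BUCKETS; }
static unsigned int h2(unsigned long k) { return (k * 40503UL + 11) % BUCKETS; }

static unsigned int chain_len(struct node *n)
{
        unsigned int len = 0;

        for (; n; n = n->next)
                len++;
        return len;
}

static void twoleft_insert(struct twoleft *t, struct node *n)
{
        struct node **l = &t->left[h1(n->key)];
        struct node **r = &t->right[h2(n->key)];
        /* shorter chain wins; ties go to the left sub-table */
        struct node **dst = (chain_len(*r) < chain_len(*l)) ? r : l;

        n->next = *dst;
        *dst = n;
}

static struct node *twoleft_lookup(struct twoleft *t, unsigned long key)
{
        struct node *n;

        for (n = t->left[h1(key)]; n; n = n->next)
                if (n->key == key)
                        return n;
        for (n = t->right[h2(key)]; n; n = n->next)
                if (n->key == key)
                        return n;
        return NULL;
}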

Cheers,
- Michael

2007-02-24 01:52:49

by Nick Piggin

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Fri, Feb 23, 2007 at 05:31:30PM -0800, Michael K. Edwards wrote:
> On 2/23/07, Zach Brown <[email protected]> wrote:
> >I'd love to see a generic implementation of RCU hashing that
> >subsystems can then take advantage of. It's long been on the fun
> >side of my todo list. The side I never get to :/.
>
> There's an active thread on netdev about implementing an RCU hash.
> I'd suggest a 2-left (or possibly even k-left) hash for statistical
> reasons discussed briefly there, and in greater depth in a paper by
> Michael Mitzenmacher at
> http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/iproute.ps.
> Despite his paper's emphasis on hardware parallelism, there's a bigger
> win associated with Poisson statistics and decreasing occupation
> fraction (and therefore collision probability) in successive hashes.

Thanks for the link, I'll take a look.

I'm *pretty* sure that my algorithm will work with any kind of data
structure. It could even switch from one data structure to a completely
different one.

The important point is that, when a switch is in progress, the data
structure effectively becomes the union of the old and the new. Readers
insert items only into the new structure, but it is implicit that they
have checked for collisions in the union (in the case where collisions are
possible).
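
In the pseudocode style of the original post, the insert side of that union
might look something like this (add_to_table() is a hypothetical helper; a
real implementation would also need to make the collision check and the
insertion atomic against other writers, as the dcache patch does under
dcache_lock):

insert_item(item)
{
        rcu_read_lock();
        cur = table;
        smp_rmb();
        old = old_table;

        /* collision check against the union of both tables */
        if (search_table(cur, item->key) ||
            (old && search_table(old, item->key))) {
                rcu_read_unlock();
                return -EEXIST;
        }

        add_to_table(cur, item);        /* new items go into the current table only */
        rcu_read_unlock();
        return 0;
}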

And finally, (you probably gathered this, but I'll just make it clear),
I haven't implemented a lockless hash lookup. The dcache already has this.
My algorithm is a lockless dynamic data structure switch, rather than
having anything specifically to do with a specific type of hash, or even
a hash at all. So yes, it should be applicable to resizing a 2-left hash or
whatever.

Thanks,
Nick

2007-02-24 02:07:49

by Nick Piggin

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Sat, Feb 24, 2007 at 02:26:02AM +0100, Nick Piggin wrote:
> On Fri, Feb 23, 2007 at 09:25:28AM -0800, Zach Brown wrote:
> >
> > On Feb 23, 2007, at 7:37 AM, Nick Piggin wrote:
> >
> > >
> > > The dentry hash uses up 8MB for 1 million entries on my 4GB system is
> > > one of the biggest wasters of memory for me. Because I rarely have
> > > more than one or two hundred thousand dentries. And that's with
> > > several kernel trees worth of entries. Most desktop and probably even
> > > many types of servers will only use a fraction of that.
> > >
> > > So I introduce a new method for resizing hash tables with RCU, and
> > > apply that to the dentry hash.
> >
> > Can you compare what you've done to the design that Paul and David
> > talked about a year ago?
> >
> > http://lkml.org/lkml/2006/1/30/74
>
> Thanks for the link, I wasn't aware of Paul's algorithm before.
>
> I guess most variants are going to have a double pointer scheme,
> so they are similar in that regard.
>
> I think Paul's is quite complex in moving entries to the new table,
> and it looks like it requires a lot of grace periods. I avoid all
> that by using the seqlock.
>
> It wasn't clear to me how Paul handled the case where an item is
> present in the not_current table, but the lookup misses it when it
> gets moved between the tables. It is a little tricky to follow
> the find details because it is not in code or pseudo code format.


I guess the other thing is that Paul's seems quite hash specific,
whereas mine does not have to be tied to any particular data structure.

All you need is to know how to perform a lookup on both your old
and new data structures, and the same algorithm is basically applicable
to switching between any two data structures.


2007-02-24 04:08:10

by Kamezawa Hiroyuki

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Fri, 23 Feb 2007 16:37:43 +0100
Nick Piggin <[email protected]> wrote:

> +static void dcache_hash_resize(unsigned int new_shift);
> +static void mod_nr_dentry(int mod)
> +{
> + unsigned long dentry_size;
> +
> + dentry_stat.nr_dentry += mod;
> +
> + dentry_size = 1 << dentry_hash->shift;
> + if (unlikely(dentry_stat.nr_dentry > dentry_size+(dentry_size>>1)))
> + dcache_hash_resize(dentry_hash->shift+1);
> + else if (unlikely(dentry_stat.nr_dentry < (dentry_size>>1)))
> + dcache_hash_resize(dentry_hash->shift-1);
> +}
> +

Do we need to do this kind of resizing in an implicit, automatic way?
I think it would be good to expose the contention rate via /proc and
add a sysctl for resizing it.

-Kame

2007-02-24 04:24:51

by William Lee Irwin III

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Fri, Feb 23, 2007 at 04:37:43PM +0100, Nick Piggin wrote:
> The dentry hash uses up 8MB for 1 million entries on my 4GB system is
> one of the biggest wasters of memory for me. Because I rarely have
> more than one or two hundred thousand dentries. And that's with
> several kernel trees worth of entries. Most desktop and probably even
> many types of servers will only use a fraction of that.
> So I introduce a new method for resizing hash tables with RCU, and apply
> that to the dentry hash.
> The primitive heuristic is that the hash size is doubled when the number of
> entries reaches 150% the hash size, and halved when the number is 50%.
> It should also be able to shrink under memory pressure, and scale up as
> large as we go.

You would be better served by a data structure different from a hashtable.


-- wli

2007-02-24 05:09:45

by Nick Piggin

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Fri, Feb 23, 2007 at 08:24:44PM -0800, William Lee Irwin III wrote:
> On Fri, Feb 23, 2007 at 04:37:43PM +0100, Nick Piggin wrote:
> > The dentry hash uses up 8MB for 1 million entries on my 4GB system is
> > one of the biggest wasters of memory for me. Because I rarely have
> > more than one or two hundred thousand dentries. And that's with
> > several kernel trees worth of entries. Most desktop and probably even
> > many types of servers will only use a fraction of that.
> > So I introduce a new method for resizing hash tables with RCU, and apply
> > that to the dentry hash.
> > The primitive heuristic is that the hash size is doubled when the number of
> > entries reaches 150% the hash size, and halved when the number is 50%.
> > It should also be able to shrink under memory pressure, and scale up as
> > large as we go.
>
> You would be better served by a data structure different from a hashtable.

Possibly. Note that I wasn't intending to rewrite the dcache hash
specifically, but just demonstrate my lockless dynamic data structure
switch, and the dentry hash was the most obvious candidate.

But considering that it is lockless, and basically free in the fastpath
(ie. a single easily-predicted branch), I'm better served by a dynamically
sized hash table than an inappropriately sized one.

Well, almost. The vmalloc issue is the downside, of course. But if I'm
on a NUMA machine that is doing vmalloc hashes anyway, then there is little
extra downside.

Out of curiosity, what better data structure do you have in mind for
the dentry hash?

2007-02-24 05:16:07

by Nick Piggin

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Sat, Feb 24, 2007 at 01:07:23PM +0900, KAMEZAWA Hiroyuki wrote:
> On Fri, 23 Feb 2007 16:37:43 +0100
> Nick Piggin <[email protected]> wrote:
>
> > +static void dcache_hash_resize(unsigned int new_shift);
> > +static void mod_nr_dentry(int mod)
> > +{
> > + unsigned long dentry_size;
> > +
> > + dentry_stat.nr_dentry += mod;
> > +
> > + dentry_size = 1 << dentry_hash->shift;
> > + if (unlikely(dentry_stat.nr_dentry > dentry_size+(dentry_size>>1)))
> > + dcache_hash_resize(dentry_hash->shift+1);
> > + else if (unlikely(dentry_stat.nr_dentry < (dentry_size>>1)))
> > + dcache_hash_resize(dentry_hash->shift-1);
> > +}
> > +
>
> Do we need to do this kind of resizing in implicit automatic way ?
> I think it's good to show contention rate by /proc and add sysctl for
> resize this.

Well having the kernel automatically adjust to the workload is better
than a sysctl. But it could well be the case that these simple heuristics
are bad (though they're fine for my system).

But a manual override is a good idea as well.

2007-02-24 22:56:45

by William Lee Irwin III

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Fri, Feb 23, 2007 at 08:24:44PM -0800, William Lee Irwin III wrote:
>> You would be better served by a data structure different from a hashtable.

On Sat, Feb 24, 2007 at 06:09:37AM +0100, Nick Piggin wrote:
> Out of curiosity, what better data structure do you have in mind for
> the dentry hash?

Per-directory indices of ->d_subdirs. Hash tries (spelled correctly)
and B+ trees hung off dentries corresponding to directories are
plausible. There are plenty of other possibilities; just do it on a
per-directory basis so you don't intermix children of different parents
in some boot-time-allocated global trainwreck and you're home free.
Benchmarking is probably needed to gauge which performs best.
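
For concreteness, a sketch of one such per-directory index, using an rbtree
keyed by name hash for brevity rather than the hash tries or B+ trees
mentioned above (all types and names here are invented, and locking is
ignored):

#include <linux/rbtree.h>
#include <linux/string.h>

/* One index per directory, hung off the directory's dentry. */
struct child_index {
        struct rb_root root;
};

struct child_node {
        struct rb_node rb;
        unsigned int hash;
        const char *name;
};

static struct child_node *child_lookup(struct child_index *idx,
                                       unsigned int hash, const char *name)
{
        struct rb_node *n = idx->root.rb_node;

        while (n) {
                struct child_node *c = rb_entry(n, struct child_node, rb);
                int cmp;

                if (hash < c->hash)
                        n = n->rb_left;
                else if (hash > c->hash)
                        n = n->rb_right;
                else if ((cmp = strcmp(name, c->name)) < 0)
                        n = n->rb_left;
                else if (cmp > 0)
                        n = n->rb_right;
                else
                        return c;
        }
        return NULL;
}

Children of different parents never share a structure, so each index sizes
itself naturally with its own directory.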


-- wli

2007-02-25 00:56:33

by David Miller

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

From: William Lee Irwin III <[email protected]>
Date: Sat, 24 Feb 2007 14:56:31 -0800

> just do it on a per-directory basis so you don't intermix children
> of different parents in some boot-time -allocated global trainwreck
> and you're home free. Benchmarking is probably needed to gauge
> which performs best.

The original dentry implementation in the kernel did things
per-directory and it sucked.

The problem you run into is that you end up with recursive algorithms
all over the place, which chew up and overflow the kernel stack.

So it would be a regression to go to a per-directory type
lookup data structure for dentries.

2007-02-25 02:15:47

by William Lee Irwin III

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

> From: William Lee Irwin III <[email protected]>
> Date: Sat, 24 Feb 2007 14:56:31 -0800
>> just do it on a per-directory basis so you don't intermix children
>> of different parents in some boot-time -allocated global trainwreck
>> and you're home free. Benchmarking is probably needed to gauge
>> which performs best.

On Sat, Feb 24, 2007 at 04:56:31PM -0800, David Miller wrote:
> The original dentry implementation in the kernel did things
> per-directory and it sucked.
> The problem you run into is that you end up with recursive algorithms
> all over the place, which chew up and overflow the kernel stack.
> So it would be a regression to go to a per-directory type
> lookup data structure for dentries.

Recursive code is clearly inadmissible in kernel environments. Any
implementations would clearly be required to be coded iteratively.
It's unclear to me where the choice of data structure would make a
demand for recursive code, but wherever it ostensibly does so, an
explicitly-allocated stack (or queue or worklist of whatever sort)
should be able to unravel it. There are furthermore rather stringent
bounds on balanced tree heights due to address space limits, so
whatever explicitly-allocated stacks may be required for e.g. tree
balancing algorithms are tightly bounded in depth.
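
As a concrete illustration of the explicit-stack point, a preorder tree walk
done iteratively (types here are invented; 128 slots are ample given the
address-space bounds on balanced tree heights just mentioned):

#define MAX_DEPTH 128

struct tnode {
        struct tnode *left, *right;
};

static void walk_preorder(struct tnode *root, void (*visit)(struct tnode *))
{
        struct tnode *stack[MAX_DEPTH];
        int top = 0;

        if (root)
                stack[top++] = root;
        while (top > 0) {
                struct tnode *n = stack[--top];

                visit(n);
                /* push right first so the left subtree is visited first */
                if (n->right)
                        stack[top++] = n->right;
                if (n->left)
                        stack[top++] = n->left;
        }
}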

The usual objection is to lexicographically ordered balanced trees
such as B+ trees on the grounds that string comparison is cache-
intensive. Hash tries resolve that concern, albeit at the amortized
cost of expansion and contraction of what are effectively nodes in
a radix tree with a large radix as well as undoing path compression
where required (speaking of which, lib/radix-tree.c should really do
path compression instead of the ->depth hack). The quest, as I see it,
is for self-sizing data structures with as little restructuring and
direct key (string) comparison as possible. Should the obvious
candidates not incur significant overhead on account of where they
do such, maybe they're good enough.

So I'm not sure what else to say as far as the per-directory lookup
structure commentary goes. There's no way I'd propose putting
recursive code in the kernel. I don't believe any of what I'm talking
about requires such. I'm aware of various drawbacks the algorithms
have, and have in mind that they should pass benchmark criteria in
addition to reducing kernel memory consumption in order to prove that
they don't overwhelm the putative benefits. So I think I've got all the
bases covered.

Am I off-base on any of this? Is there anything I missed?


-- wli

(P.S.: The only time I ever proposed putting something "recursive"
in the kernel, it used a flag to limit the recursion depth to two
stack frames. It was for the purpose of simplifying the reservation
of metadata in an allocator, and so not strictly necessary. A
simplified metadata allocator could've easily been devised, albeit at
the cost of some code duplication.)

2007-02-25 06:21:28

by Paul E. McKenney

Subject: Re: [rfc][patch] dynamic resizing dentry hash using RCU

On Fri, Feb 23, 2007 at 04:37:43PM +0100, Nick Piggin wrote:
>
> The dentry hash uses up 8MB for 1 million entries on my 4GB system is one
> of the biggest wasters of memory for me. Because I rarely have more than one or
> two hundred thousand dentries. And that's with several kernel trees worth of
> entries. Most desktop and probably even many types of servers will only use a
> fraction of that.
>
> So I introduce a new method for resizing hash tables with RCU, and apply
> that to the dentry hash.
>
> The primitive heuristic is that the hash size is doubled when the number of
> entries reaches 150% the hash size, and halved when the number is 50%.
> It should also be able to shrink under memory pressure, and scale up as
> large as we go.
>
> A pity it uses vmalloc memory for the moment.
>
> The implementation is not highly stress tested, but it is running now. It
> could do a bit more RCU stuff asynchronously rather than with synchronize_rcu,
> but who cares, for now.
>
> The hash is costing me about 256K now, which is a 32x reduction in memory.
>
> I don't know if it's worthwhile to do this, rather than move things to other
> data structures, but something just tempted me to have a go! I'd be interested
> to hear comments, and how many holes people can spot in my design ;)

The pseudocode looks sound to me. The discussion on netdev has a few
alternatives that are interesting as well -- one of them gets rid of
the need for the sequence lock using a trick similar to one that Josh
Triplett came up with for atomically moving an element from one list
to another in the face of RCU readers. Another is Robert Olsson's "trash"
trie-hash implementation.

Some rather impressive algorithms all around!

Thanx, Paul
