2010-07-21 10:54:08

by shenghui

Subject: [PATCH] fix return value for mb_cache_shrink_fn when nr_to_scan > 0

Sorry. Regenerated the patch; please check it.
I wrapped most of the code in a single pair of spinlock operations for two reasons:
1) taking the spinlock twice seems time consuming;
2) a single lock/unlock pair keeps "count"
consistent with the shrink operation; with two pairs,
other processes may create new cache entries in between.



Signed-off-by: Wang Sheng-Hui <[email protected]>
---
fs/mbcache.c | 24 ++++++++++++------------
1 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3..ee57aa3 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -201,21 +201,15 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(free_list);
struct list_head *l, *ltmp;
+ struct mb_cache *cache;
int count = 0;

- spin_lock(&mb_cache_spinlock);
- list_for_each(l, &mb_cache_list) {
- struct mb_cache *cache =
- list_entry(l, struct mb_cache, c_cache_list);
- mb_debug("cache %s (%d)", cache->c_name,
- atomic_read(&cache->c_entry_count));
- count += atomic_read(&cache->c_entry_count);
- }
mb_debug("trying to free %d entries", nr_to_scan);
- if (nr_to_scan == 0) {
- spin_unlock(&mb_cache_spinlock);
+
+ spin_lock(&mb_cache_spinlock);
+ if (nr_to_scan == 0)
goto out;
- }
+
while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
struct mb_cache_entry *ce =
list_entry(mb_cache_lru_list.next,
@@ -223,12 +217,18 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
list_move_tail(&ce->e_lru_list, &free_list);
__mb_cache_entry_unhash(ce);
}
- spin_unlock(&mb_cache_spinlock);
list_for_each_safe(l, ltmp, &free_list) {
__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
e_lru_list), gfp_mask);
}
out:
+ list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
+ mb_debug("cache %s (%d)", cache->c_name,
+ atomic_read(&cache->c_entry_count));
+ count += atomic_read(&cache->c_entry_count);
+ }
+ spin_unlock(&mb_cache_spinlock);
+
return (count / 100) * sysctl_vfs_cache_pressure;
}

--
1.6.3.3
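A quick note on the formula in the last line, which the patch keeps: the value
handed back to the VM is the cache entry count scaled by
sysctl_vfs_cache_pressure (default 100). A small sketch with an illustrative
entry count, not mbcache code:

/*
 * Sketch only: how the shrinker's return value scales with
 * sysctl_vfs_cache_pressure.  The entry count is illustrative.
 */
static int shrinker_report(int count, int vfs_cache_pressure)
{
        /* count = 36051, pressure = 100  ->  360 * 100 = 36000 */
        /* count = 36051, pressure = 200  ->  360 * 200 = 72000 */
        return (count / 100) * vfs_cache_pressure;
}

Raising vfs_cache_pressure above 100 makes the cache look larger to the VM and
get reclaimed harder; the patches in this thread only change when "count" is
sampled, not this scaling.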



2010-07-21 14:00:07

by Eric Sandeen

Subject: Re: [PATCH] fix return value for mb_cache_shrink_fn when nr_to_scan > 0

Wang Sheng-Hui wrote:
> Sorry. Regenerated the patch; please check it.
> I wrapped most of the code in a single pair of spinlock operations for two reasons:
> 1) taking the spinlock twice seems time consuming;
> 2) a single lock/unlock pair keeps "count"
> consistent with the shrink operation; with two pairs,
> other processes may create new cache entries in between.
>

Sorry, this patch appears to have whitespace cut & paste mangling.

More comments below.

> Signed-off-by: Wang Sheng-Hui <[email protected]>
> ---
> fs/mbcache.c | 24 ++++++++++++------------
> 1 files changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index ec88ff3..ee57aa3 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -201,21 +201,15 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
> {
> LIST_HEAD(free_list);
> struct list_head *l, *ltmp;
> + struct mb_cache *cache;
> int count = 0;
>
> - spin_lock(&mb_cache_spinlock);
> - list_for_each(l, &mb_cache_list) {
> - struct mb_cache *cache =
> - list_entry(l, struct mb_cache, c_cache_list);
> - mb_debug("cache %s (%d)", cache->c_name,
> - atomic_read(&cache->c_entry_count));
> - count += atomic_read(&cache->c_entry_count);
> - }
> mb_debug("trying to free %d entries", nr_to_scan);
> - if (nr_to_scan == 0) {
> - spin_unlock(&mb_cache_spinlock);
> +
> + spin_lock(&mb_cache_spinlock);
> + if (nr_to_scan == 0)
> goto out;
> - }
> +
> while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
> struct mb_cache_entry *ce =
> list_entry(mb_cache_lru_list.next,
> @@ -223,12 +217,18 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
> list_move_tail(&ce->e_lru_list, &free_list);
> __mb_cache_entry_unhash(ce);
> }
> - spin_unlock(&mb_cache_spinlock);

you can't do this because

> list_for_each_safe(l, ltmp, &free_list) {
> __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,

this takes the spinlock too and you'll deadlock.

Did you test this patch?

-Eric

> e_lru_list), gfp_mask);
> }
> out:
> + list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
> + mb_debug("cache %s (%d)", cache->c_name,
> + atomic_read(&cache->c_entry_count));
> + count += atomic_read(&cache->c_entry_count);
> + }
> + spin_unlock(&mb_cache_spinlock);
> +
> return (count / 100) * sysctl_vfs_cache_pressure;
> }
>
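The deadlock Eric points out: the old __mb_cache_entry_forget() can re-take
mb_cache_spinlock in its free-callback failure path (the path Andreas'
cleanup patch below removes) to put the entry back on the LRU list, and
kernel spinlocks are not recursive. A minimal sketch of the pattern, not
mbcache code:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

/* stands in for the old __mb_cache_entry_forget() failure path */
static void helper_that_may_take_lock(void)
{
        spin_lock(&demo_lock);
        /* ... put the entry back on an LRU list ... */
        spin_unlock(&demo_lock);
}

static void broken_caller(void)
{
        spin_lock(&demo_lock);
        helper_that_may_take_lock();    /* self-deadlock: lock already held */
        spin_unlock(&demo_lock);
}

With the unlock kept before the forget loop, as in the original code and in
Andreas' version further down, the helper can take the lock safely.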


2010-07-21 20:26:38

by Andreas Gruenbacher

Subject: [PATCH 2/2] mbcache: fix shrinker function return value

The shrinker function is supposed to return the number of cache
entries after shrinking, not before shrinking. Fix that.

Based on a patch from Wang Sheng-Hui <[email protected]>.

Signed-off-by: Andreas Gruenbacher <[email protected]>
---
fs/mbcache.c | 27 ++++++++++-----------------
1 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 8a2cbd8..cf4e6cd 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -176,22 +176,12 @@ static int
mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(free_list);
- struct list_head *l, *ltmp;
+ struct mb_cache *cache;
+ struct mb_cache_entry *entry, *tmp;
int count = 0;

- spin_lock(&mb_cache_spinlock);
- list_for_each(l, &mb_cache_list) {
- struct mb_cache *cache =
- list_entry(l, struct mb_cache, c_cache_list);
- mb_debug("cache %s (%d)", cache->c_name,
- atomic_read(&cache->c_entry_count));
- count += atomic_read(&cache->c_entry_count);
- }
mb_debug("trying to free %d entries", nr_to_scan);
- if (nr_to_scan == 0) {
- spin_unlock(&mb_cache_spinlock);
- goto out;
- }
+ spin_lock(&mb_cache_spinlock);
while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
struct mb_cache_entry *ce =
list_entry(mb_cache_lru_list.next,
@@ -199,12 +189,15 @@ mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
list_move_tail(&ce->e_lru_list, &free_list);
__mb_cache_entry_unhash(ce);
}
+ list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
+ mb_debug("cache %s (%d)", cache->c_name,
+ atomic_read(&cache->c_entry_count));
+ count += atomic_read(&cache->c_entry_count);
+ }
spin_unlock(&mb_cache_spinlock);
- list_for_each_safe(l, ltmp, &free_list) {
- __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
- e_lru_list), gfp_mask);
+ list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
+ __mb_cache_entry_forget(entry, gfp_mask);
}
-out:
return (count / 100) * sysctl_vfs_cache_pressure;
}

--
1.7.2.rc3.57.g77b5b


2010-07-21 17:57:20

by Andreas Gruenbacher

Subject: [PATCH 0/2] mbcache fixes

Al,

here is an mbcache cleanup and then a fixed version of Shenghui's minor
shrinker function fix. The patches have survived functional testing
here.

This seems slightly too much for kernel-janitors, so could you please
take the patches?

Thanks,
Andreas

Andreas Gruenbacher (2):
mbcache: Remove unused features
mbcache: fix shrinker function return value

fs/ext2/xattr.c | 12 ++--
fs/ext3/xattr.c | 12 ++--
fs/ext4/xattr.c | 12 ++--
fs/mbcache.c | 168 ++++++++++++++---------------------------------
include/linux/mbcache.h | 20 ++----
5 files changed, 70 insertions(+), 154 deletions(-)

--
1.7.2.rc3.57.g77b5b


2010-07-19 16:19:41

by Andreas Gruenbacher

Subject: [PATCH 1/2] mbcache: Remove unused features

The mbcache code was written to support a variable number of indexes,
but all the existing users use exactly one index. Simplify the code to
support only that case.

There are also no users of the cache entry free operation, and none of
the users keep extra data in cache entries. Remove those features as
well.

Signed-off-by: Andreas Gruenbacher <[email protected]>
---
fs/ext2/xattr.c | 12 ++--
fs/ext3/xattr.c | 12 ++--
fs/ext4/xattr.c | 12 ++--
fs/mbcache.c | 141 +++++++++++++---------------------------------
include/linux/mbcache.h | 20 ++-----
5 files changed, 60 insertions(+), 137 deletions(-)

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 7c39157..af59915 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -838,7 +838,7 @@ ext2_xattr_cache_insert(struct buffer_head *bh)
ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
if (!ce)
return -ENOMEM;
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+ error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
if (error) {
mb_cache_entry_free(ce);
if (error == -EBUSY) {
@@ -912,8 +912,8 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext2_xattr_cache, 0,
- inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
+ hash);
while (ce) {
struct buffer_head *bh;

@@ -945,7 +945,7 @@ again:
unlock_buffer(bh);
brelse(bh);
}
- ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
}
return NULL;
}
@@ -1021,9 +1021,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
int __init
init_ext2_xattr(void)
{
- ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
- sizeof(struct mb_cache_entry) +
- sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+ ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
if (!ext2_xattr_cache)
return -ENOMEM;
return 0;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 71fb8d6..e69dc6d 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -1139,7 +1139,7 @@ ext3_xattr_cache_insert(struct buffer_head *bh)
ea_bdebug(bh, "out of memory");
return;
}
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+ error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
if (error) {
mb_cache_entry_free(ce);
if (error == -EBUSY) {
@@ -1211,8 +1211,8 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext3_xattr_cache, 0,
- inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
+ hash);
while (ce) {
struct buffer_head *bh;

@@ -1237,7 +1237,7 @@ again:
return bh;
}
brelse(bh);
- ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
}
return NULL;
}
@@ -1313,9 +1313,7 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header,
int __init
init_ext3_xattr(void)
{
- ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
- sizeof(struct mb_cache_entry) +
- sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+ ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
if (!ext3_xattr_cache)
return -ENOMEM;
return 0;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 0433800..1c93198 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1418,7 +1418,7 @@ ext4_xattr_cache_insert(struct buffer_head *bh)
ea_bdebug(bh, "out of memory");
return;
}
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+ error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
if (error) {
mb_cache_entry_free(ce);
if (error == -EBUSY) {
@@ -1490,8 +1490,8 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
- inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+ hash);
while (ce) {
struct buffer_head *bh;

@@ -1515,7 +1515,7 @@ again:
return bh;
}
brelse(bh);
- ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
}
return NULL;
}
@@ -1591,9 +1591,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
int __init
init_ext4_xattr(void)
{
- ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
- sizeof(struct mb_cache_entry) +
- sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+ ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
if (!ext4_xattr_cache)
return -ENOMEM;
return 0;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e28f21b..8a2cbd8 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -79,15 +79,11 @@ EXPORT_SYMBOL(mb_cache_entry_find_next);
struct mb_cache {
struct list_head c_cache_list;
const char *c_name;
- struct mb_cache_op c_op;
atomic_t c_entry_count;
int c_bucket_bits;
-#ifndef MB_CACHE_INDEXES_COUNT
- int c_indexes_count;
-#endif
- struct kmem_cache *c_entry_cache;
+ struct kmem_cache *c_entry_cache;
struct list_head *c_block_hash;
- struct list_head *c_indexes_hash[0];
+ struct list_head *c_index_hash;
};


@@ -101,16 +97,6 @@ static LIST_HEAD(mb_cache_list);
static LIST_HEAD(mb_cache_lru_list);
static DEFINE_SPINLOCK(mb_cache_spinlock);

-static inline int
-mb_cache_indexes(struct mb_cache *cache)
-{
-#ifdef MB_CACHE_INDEXES_COUNT
- return MB_CACHE_INDEXES_COUNT;
-#else
- return cache->c_indexes_count;
-#endif
-}
-
/*
* What the mbcache registers as to get shrunk dynamically.
*/
@@ -132,12 +118,9 @@ __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
static void
__mb_cache_entry_unhash(struct mb_cache_entry *ce)
{
- int n;
-
if (__mb_cache_entry_is_hashed(ce)) {
list_del_init(&ce->e_block_list);
- for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
- list_del(&ce->e_indexes[n].o_list);
+ list_del(&ce->e_index.o_list);
}
}

@@ -148,16 +131,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
struct mb_cache *cache = ce->e_cache;

mb_assert(!(ce->e_used || ce->e_queued));
- if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
- /* free failed -- put back on the lru list
- for freeing later. */
- spin_lock(&mb_cache_spinlock);
- list_add(&ce->e_lru_list, &mb_cache_lru_list);
- spin_unlock(&mb_cache_spinlock);
- } else {
- kmem_cache_free(cache->c_entry_cache, ce);
- atomic_dec(&cache->c_entry_count);
- }
+ kmem_cache_free(cache->c_entry_cache, ce);
+ atomic_dec(&cache->c_entry_count);
}


@@ -243,72 +218,49 @@ out:
* memory was available.
*
* @name: name of the cache (informal)
- * @cache_op: contains the callback called when freeing a cache entry
- * @entry_size: The size of a cache entry, including
- * struct mb_cache_entry
- * @indexes_count: number of additional indexes in the cache. Must equal
- * MB_CACHE_INDEXES_COUNT if the number of indexes is
- * hardwired.
* @bucket_bits: log2(number of hash buckets)
*/
struct mb_cache *
-mb_cache_create(const char *name, struct mb_cache_op *cache_op,
- size_t entry_size, int indexes_count, int bucket_bits)
+mb_cache_create(const char *name, int bucket_bits)
{
- int m=0, n, bucket_count = 1 << bucket_bits;
+ int n, bucket_count = 1 << bucket_bits;
struct mb_cache *cache = NULL;

- if(entry_size < sizeof(struct mb_cache_entry) +
- indexes_count * sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]))
- return NULL;
-
- cache = kmalloc(sizeof(struct mb_cache) +
- indexes_count * sizeof(struct list_head), GFP_KERNEL);
+ cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
if (!cache)
- goto fail;
+ return NULL;
cache->c_name = name;
- cache->c_op.free = NULL;
- if (cache_op)
- cache->c_op.free = cache_op->free;
atomic_set(&cache->c_entry_count, 0);
cache->c_bucket_bits = bucket_bits;
-#ifdef MB_CACHE_INDEXES_COUNT
- mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
-#else
- cache->c_indexes_count = indexes_count;
-#endif
cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
GFP_KERNEL);
if (!cache->c_block_hash)
goto fail;
for (n=0; n<bucket_count; n++)
INIT_LIST_HEAD(&cache->c_block_hash[n]);
- for (m=0; m<indexes_count; m++) {
- cache->c_indexes_hash[m] = kmalloc(bucket_count *
- sizeof(struct list_head),
- GFP_KERNEL);
- if (!cache->c_indexes_hash[m])
- goto fail;
- for (n=0; n<bucket_count; n++)
- INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
- }
- cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
+ cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!cache->c_index_hash)
+ goto fail;
+ for (n=0; n<bucket_count; n++)
+ INIT_LIST_HEAD(&cache->c_index_hash[n]);
+ cache->c_entry_cache = kmem_cache_create(name,
+ sizeof(struct mb_cache_entry), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
if (!cache->c_entry_cache)
- goto fail;
+ goto fail2;

spin_lock(&mb_cache_spinlock);
list_add(&cache->c_cache_list, &mb_cache_list);
spin_unlock(&mb_cache_spinlock);
return cache;

+fail2:
+ kfree(cache->c_index_hash);
+
fail:
- if (cache) {
- while (--m >= 0)
- kfree(cache->c_indexes_hash[m]);
- kfree(cache->c_block_hash);
- kfree(cache);
- }
+ kfree(cache->c_block_hash);
+ kfree(cache);
return NULL;
}

@@ -357,7 +309,6 @@ mb_cache_destroy(struct mb_cache *cache)
{
LIST_HEAD(free_list);
struct list_head *l, *ltmp;
- int n;

spin_lock(&mb_cache_spinlock);
list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
@@ -384,8 +335,7 @@ mb_cache_destroy(struct mb_cache *cache)

kmem_cache_destroy(cache->c_entry_cache);

- for (n=0; n < mb_cache_indexes(cache); n++)
- kfree(cache->c_indexes_hash[n]);
+ kfree(cache->c_index_hash);
kfree(cache->c_block_hash);
kfree(cache);
}
@@ -429,17 +379,16 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
*
* @bdev: device the cache entry belongs to
* @block: block number
- * @keys: array of additional keys. There must be indexes_count entries
- * in the array (as specified when creating the cache).
+ * @key: lookup key
*/
int
mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
- sector_t block, unsigned int keys[])
+ sector_t block, unsigned int key)
{
struct mb_cache *cache = ce->e_cache;
unsigned int bucket;
struct list_head *l;
- int error = -EBUSY, n;
+ int error = -EBUSY;

bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
cache->c_bucket_bits);
@@ -454,12 +403,9 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
ce->e_bdev = bdev;
ce->e_block = block;
list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
- for (n=0; n<mb_cache_indexes(cache); n++) {
- ce->e_indexes[n].o_key = keys[n];
- bucket = hash_long(keys[n], cache->c_bucket_bits);
- list_add(&ce->e_indexes[n].o_list,
- &cache->c_indexes_hash[n][bucket]);
- }
+ ce->e_index.o_key = key;
+ bucket = hash_long(key, cache->c_bucket_bits);
+ list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
error = 0;
out:
spin_unlock(&mb_cache_spinlock);
@@ -555,13 +501,12 @@ cleanup:

static struct mb_cache_entry *
__mb_cache_entry_find(struct list_head *l, struct list_head *head,
- int index, struct block_device *bdev, unsigned int key)
+ struct block_device *bdev, unsigned int key)
{
while (l != head) {
struct mb_cache_entry *ce =
- list_entry(l, struct mb_cache_entry,
- e_indexes[index].o_list);
- if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) {
+ list_entry(l, struct mb_cache_entry, e_index.o_list);
+ if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
DEFINE_WAIT(wait);

if (!list_empty(&ce->e_lru_list))
@@ -603,23 +548,20 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
* returned cache entry is locked for shared access ("multiple readers").
*
* @cache: the cache to search
- * @index: the number of the additonal index to search (0<=index<indexes_count)
* @bdev: the device the cache entry should belong to
* @key: the key in the index
*/
struct mb_cache_entry *
-mb_cache_entry_find_first(struct mb_cache *cache, int index,
- struct block_device *bdev, unsigned int key)
+mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
+ unsigned int key)
{
unsigned int bucket = hash_long(key, cache->c_bucket_bits);
struct list_head *l;
struct mb_cache_entry *ce;

- mb_assert(index < mb_cache_indexes(cache));
spin_lock(&mb_cache_spinlock);
- l = cache->c_indexes_hash[index][bucket].next;
- ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
- index, bdev, key);
+ l = cache->c_index_hash[bucket].next;
+ ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
spin_unlock(&mb_cache_spinlock);
return ce;
}
@@ -640,12 +582,11 @@ mb_cache_entry_find_first(struct mb_cache *cache, int index,
* }
*
* @prev: The previous match
- * @index: the number of the additonal index to search (0<=index<indexes_count)
* @bdev: the device the cache entry should belong to
* @key: the key in the index
*/
struct mb_cache_entry *
-mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
+mb_cache_entry_find_next(struct mb_cache_entry *prev,
struct block_device *bdev, unsigned int key)
{
struct mb_cache *cache = prev->e_cache;
@@ -653,11 +594,9 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
struct list_head *l;
struct mb_cache_entry *ce;

- mb_assert(index < mb_cache_indexes(cache));
spin_lock(&mb_cache_spinlock);
- l = prev->e_indexes[index].o_list.next;
- ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
- index, bdev, key);
+ l = prev->e_index.o_list.next;
+ ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
__mb_cache_entry_release_unlock(prev);
return ce;
}
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index a09b84e..54cbbac 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -4,9 +4,6 @@
(C) 2001 by Andreas Gruenbacher, <[email protected]>
*/

-/* Hardwire the number of additional indexes */
-#define MB_CACHE_INDEXES_COUNT 1
-
struct mb_cache_entry {
struct list_head e_lru_list;
struct mb_cache *e_cache;
@@ -18,17 +15,12 @@ struct mb_cache_entry {
struct {
struct list_head o_list;
unsigned int o_key;
- } e_indexes[0];
-};
-
-struct mb_cache_op {
- int (*free)(struct mb_cache_entry *, gfp_t);
+ } e_index;
};

/* Functions on caches */

-struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
- int, int);
+struct mb_cache *mb_cache_create(const char *, int);
void mb_cache_shrink(struct block_device *);
void mb_cache_destroy(struct mb_cache *);

@@ -36,17 +28,15 @@ void mb_cache_destroy(struct mb_cache *);

struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t);
int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *,
- sector_t, unsigned int[]);
+ sector_t, unsigned int);
void mb_cache_entry_release(struct mb_cache_entry *);
void mb_cache_entry_free(struct mb_cache_entry *);
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *,
struct block_device *,
sector_t);
-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
struct block_device *,
unsigned int);
-struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *,
struct block_device *,
unsigned int);
-#endif
--
1.7.2.rc3.57.g77b5b



2010-07-21 23:18:40

by Andreas Dilger

Subject: Re: [PATCH 1/2] mbcache: Remove unused features

On 2010-07-19, at 10:19, Andreas Gruenbacher wrote:
> The mbcache code was written to support a variable number of indexes,
> but all the existing users use exactly one index. Simplify the code to
> support only that case.
>
> There are also no users of the cache entry free operation, and none of
> the users keep extra data in cache entries. Remove those features as
> well.

Is it possible to allow mbcache to be disabled, either for the whole kernel, on a per-filesystem basis, or adaptively if the cache hit rate is very low? (Any of these would be fine, not all of them.)

The reason I ask is that under some workloads mbcache adds significant overhead for very little or no benefit. If the xattr blocks are not shared, then every xattr is stored in a separate cache entry, and there is a single spinlock protecting the whole mbcache for all filesystems.

On systems with large memory for the buffer cache (6M+ buffer heads, 5M inodes in memory) there are very long hash chains to be searched, and this slows down filesystem performance dramatically. We became aware of this problem because of NMIs triggering due to long spinlock hold times in mb_cache_entry_insert() on a server with 32GB of RAM.

To reproduce the problem, a simple test can be done with a number of kernel source trees (not hard-linked trees, though; each must be unpacked separately):

$ for i in linux-* ; do
time du ${i}
done

gives:
8s for the first one
12s for the 10th
27s for the 25th
48s for the 50th
95s for the 100th
=> this is strictly linear

opreport -l vmlinux shows:
68.12% in mb_cache_entry_insert
21.71% in mb_cache_entry_release
4.27% in mb_cache_entry_free
1.49% in mb_cache_entry_find_first
0.82% in __mb_cache_entry_find

(see https://bugzilla.lustre.org/show_bug.cgi?id=22771 for full details)

I don't think making the mbcache more efficient (more buckets, more locks, etc.) really solves the problem, which is that mbcache is adding overhead without value in these situations.

Attached is a patch that allows manually disabling mbcache on a per-filesystem basis with a mount option. Better would be to automatically disable it if e.g. some hundreds or thousands of objects were inserted into the cache and there was < 1% cache hit rate. That would help everyone, even those people who don't know they have a problem.

Cheers, Andreas
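A rough sketch of what the adaptive idea could look like, purely hypothetical
and not the attached patch; the structure, names, and thresholds are made up
for illustration:

#define MB_CACHE_MIN_SAMPLES    1000    /* arbitrary warm-up threshold */

struct mb_cache_stats {
        atomic_t        lookups;        /* bumped on every cache lookup */
        atomic_t        hits;           /* bumped on every cache hit */
        int             disabled;       /* set once when hit rate stays < 1% */
};

static int mb_cache_should_insert(struct mb_cache_stats *st)
{
        int lookups, hits;

        if (st->disabled)
                return 0;
        lookups = atomic_read(&st->lookups);
        if (lookups < MB_CACHE_MIN_SAMPLES)
                return 1;
        hits = atomic_read(&st->hits);
        if (hits * 100 < lookups)       /* below 1% hit rate */
                st->disabled = 1;
        return !st->disabled;
}

As Andreas Gruenbacher notes in his reply below, a one-way switch like this
assumes the workload never changes, which is the main weakness of the approach.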

2010-07-21 23:22:54

by Al Viro

Subject: Re: [PATCH 0/2] mbcache fixes

On Wed, Jul 21, 2010 at 07:57:20PM +0200, Andreas Gruenbacher wrote:
> Al,
>
> here is an mbcache cleanup and then a fixed version of Shenghui's minor
> shrinker function fix. The patches have survived functional testing
> here.
>
> This seems slightly too much for kernel-janitors, so could you please
> take the patches?

Done.

2010-07-22 00:07:46

by Andreas Gruenbacher

Subject: Re: [PATCH 1/2] mbcache: Remove unused features

From https://bugzilla.lustre.org/show_bug.cgi?id=22771#c24 :
> On our production system, the hash table contains 64 entries (6 bits) for a
> cache of 2307267 entries.
> A count in each list gives a good load balance: the number of entries varies
> between 35782 and 36496, while the optimal distribution is 2307267 / 64 =
> 36051.

Hehe, I like that sense of humor :)

On Thursday 22 July 2010 01:18:39 Andreas Dilger wrote:
> Is it possible to allow mbcache to be disabled, either for the whole
> kernel, on a per-filesystem basis, or adaptively if the cache hit rate is
> very low? (Any of these would be fine, not all of them.)

We could do that, but making the cache not degrade so badly would be a good
idea in any case. The number of buckets is currently fixed for ext[234] so it
would make sense to either make that number dynamic or limit the maximum
number of cache entries. The latter will probably be good enough for most
workloads.

> Attached is a patch that allows manually disabling mbcache on a
> per-filesystem basis with a mount option.

> I don't think making the mbcache more efficient (more buckets, more
> locks, etc.) really solves the problem, which is that mbcache is adding
> overhead without value in these situations.

A mount option would be very ugly, but a kernel internal NO_MBCACHE flag
sounds more acceptable to me.

> Better would be to
> automatically disable it if e.g. some hundreds or thousands of objects
> were inserted into the cache and there was < 1% cache hit rate.

This assumes that the workload won't change.

> That would help everyone, even those people who don't know they have a
> problem.

People who don't know they have a problem would also be helped by making the
cache not degrade so badly, right?

Even better would be to use a more appropriate inode size, but you've pointed
that out in the bug already.

Thanks,
Andreas
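
One way the "limit the maximum number of cache entries" idea mentioned above
could look, as a hypothetical sketch against the mbcache internals shown in
this thread (the cap value and helper name are invented, and entry
re-initialization and accounting are left to the caller as in
mb_cache_entry_alloc()):

#define MB_CACHE_MAX_ENTRIES    100000  /* made-up cap */

static struct mb_cache_entry *
mb_cache_entry_alloc_capped(struct mb_cache *cache, gfp_t gfp_flags)
{
        struct mb_cache_entry *ce = NULL;

        if (atomic_read(&cache->c_entry_count) >= MB_CACHE_MAX_ENTRIES) {
                spin_lock(&mb_cache_spinlock);
                if (!list_empty(&mb_cache_lru_list)) {
                        /* recycle the least-recently-used entry instead
                           of letting the cache grow further */
                        ce = list_entry(mb_cache_lru_list.next,
                                        struct mb_cache_entry, e_lru_list);
                        list_del_init(&ce->e_lru_list);
                        __mb_cache_entry_unhash(ce);
                }
                spin_unlock(&mb_cache_spinlock);
        }
        if (!ce)
                ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
        return ce;
}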

2010-07-22 00:54:45

by shenghui

Subject: Re: [PATCH] fix return value for mb_cache_shrink_fn when nr_to_scan > 0

Sorry, I missed that. Regenerated the patch and it passes the checkpatch.pl check.
Please review it.


Signed-off-by: Wang Sheng-Hui <[email protected]>
---
fs/mbcache.c | 23 ++++++++++++-----------
1 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3..603170e 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -201,21 +201,14 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(free_list);
struct list_head *l, *ltmp;
+ struct mb_cache *cache;
int count = 0;

- spin_lock(&mb_cache_spinlock);
- list_for_each(l, &mb_cache_list) {
- struct mb_cache *cache =
- list_entry(l, struct mb_cache, c_cache_list);
- mb_debug("cache %s (%d)", cache->c_name,
- atomic_read(&cache->c_entry_count));
- count += atomic_read(&cache->c_entry_count);
- }
mb_debug("trying to free %d entries", nr_to_scan);
- if (nr_to_scan == 0) {
- spin_unlock(&mb_cache_spinlock);
+ if (nr_to_scan == 0)
goto out;
- }
+
+ spin_lock(&mb_cache_spinlock);
while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
struct mb_cache_entry *ce =
list_entry(mb_cache_lru_list.next,
@@ -229,6 +222,14 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
e_lru_list), gfp_mask);
}
out:
+ spin_lock(&mb_cache_spinlock);
+ list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
+ mb_debug("cache %s (%d)", cache->c_name,
+ atomic_read(&cache->c_entry_count));
+ count += atomic_read(&cache->c_entry_count);
+ }
+ spin_unlock(&mb_cache_spinlock);
+
return (count / 100) * sysctl_vfs_cache_pressure;
}

--
1.6.3.3

2010-07-22 01:06:35

by shenghui

Subject: Re: [PATCH] fix return value for mb_cache_shrink_fn when nr_to_scan > 0

Sorry to trouble you all, and thanks for your guidance!
I noticed that Andreas Gruenbacher has submitted patches
for mbcache.
Please ignore mine.


--


Thanks and Best Regards,
shenghui
